Jay D Dee
2019-12-03 12:26:11 -05:00
parent 91ec6f1771
commit 40039386a0
58 changed files with 3372 additions and 1920 deletions

View File

@@ -24,18 +24,10 @@ be installed manually. There may be others, read the error messages they
 will give a clue as to the missing package.
 The following command should install everything you need on Debian based
-distributions such as Ubuntu:
+distributions such as Ubuntu. Fedora and other distributions may have similar
+but different package names.
-sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev
+sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev
-build-essential  (Development Tools package group on Fedora)
-automake
-libjansson-dev
-libgmp-dev
-libcurl4-openssl-dev
-libssl-dev
-lib-thread
-zlib1g-dev
 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
 openssl 1.1.0e or higher. Add one of the following, depending on the

View File

@@ -22,14 +22,13 @@ Step by step...
 Refer to Linux compile instructions and install required packages.
-Additionally, install mingw-64.
+Additionally, install mingw-w64.
 sudo apt-get install mingw-w64
 2. Create a local library directory for packages to be compiled in the next
-   step. Recommended location is $HOME/usr/lib/
+   step. Suggested location is $HOME/usr/lib/
 3. Download and build other packages for mingw that don't have a mingw64
    version available in the repositories.

View File

@@ -174,7 +174,6 @@ cpuminer_SOURCES = \
 algo/sha/sph_sha2big.c \
 algo/sha/sha256-hash-4way.c \
 algo/sha/sha512-hash-4way.c \
-algo/sha/sha256_hash_11way.c \
 algo/sha/sha2.c \
 algo/sha/sha256t-gate.c \
 algo/sha/sha256t-4way.c \
@@ -198,7 +197,6 @@ cpuminer_SOURCES = \
 algo/skein/skein-gate.c \
 algo/skein/skein2.c \
 algo/skein/skein2-4way.c \
-algo/skein/skein2-gate.c \
 algo/sm3/sm3.c \
 algo/sm3/sm3-hash-4way.c \
 algo/swifftx/swifftx.c \

View File

@@ -29,6 +29,7 @@ cpuminer-sse2.exe "-msse2" Core2, Nehalem
 cpuminer-aes-sse42.exe "-march=westmere" Westmere
 cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
 cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake
+cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
 cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper
 If you like this software feel free to donate:

View File

@@ -31,6 +31,26 @@ FreeBSD YMMV.
 Change Log
 ----------
+v3.10.0
+
+AVX-512 is now supported on selected algos, and a Windows binary is now available.
+AVX512 optimizations are available for argon2d, blake2s, keccak, keccakc,
+skein & skein2.
+Fixed CPU temperature for some CPU models (Linux only).
+Fixed a bug that caused some lanes not to submit shares.
+Fixed some previously undetected buffer overflows.
+Lyra2rev2 3% faster SSE2 and AVX2.
+Added "-fno-asynchronous-unwind-tables" to the AVX512 build script for Windows
+to fix a known mingw issue.
+Changed the AVX2 build script to explicitly add AES to address a change in
+behaviour in GCC 9.
+
 v3.9.11
 Added x22i & x25x algos.

View File

@@ -59,7 +59,6 @@ extern "C"{
 typedef struct {
    unsigned char buf[64<<2];
    uint32_t H[8<<2];
-   uint32_t S[4<<2];
 // __m128i buf[16] __attribute__ ((aligned (64)));
 // __m128i H[8];
 // __m128i S[4];
@@ -93,7 +92,6 @@ void blake256r8_4way_close(void *cc, void *dst);
 typedef struct {
    __m256i buf[16] __attribute__ ((aligned (64)));
    __m256i H[8];
-   __m256i S[4];
    size_t ptr;
    sph_u32 T0, T1;
    int rounds;   // 14 for blake, 8 for blakecoin & vanilla

View File

@@ -304,16 +304,17 @@ static const sph_u32 CS[16] = {
 #endif
+// Blake-256 4 way
 #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
-   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                   _mm_set1_epi32( c1 ), m0 ), b ), a ); \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), \
-                   _mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \
    d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
    c = _mm_add_epi32( c, d ); \
    b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
-   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                   _mm_set1_epi32( c0 ), m1 ), b ), a ); \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), \
+                   _mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \
    d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
    c = _mm_add_epi32( c, d ); \
    b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
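The change above only re-associates the additions in the G step. An illustrative scalar sketch (not part of the commit, plain standard C) showing that the old and new groupings give the same value:

/* 32-bit addition is associative and commutative, so the old grouping
 *    a = ((c1 ^ m0) + b) + a
 * and the new grouping
 *    a = (a + b) + (c1 ^ m0)
 * produce identical results; the new form simply moves the message/constant
 * xor off the front of the add chain. */
#include <assert.h>
#include <stdint.h>

int main( void )
{
   uint32_t a  = 0x6A09E667, b  = 0xBB67AE85;
   uint32_t m0 = 0x11223344, c1 = 0x85A308D3;

   uint32_t old_a = ( ( c1 ^ m0 ) + b ) + a;   /* old grouping */
   uint32_t new_a = ( a + b ) + ( c1 ^ m0 );   /* new grouping */
   assert( old_a == new_a );
   return 0;
}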
@@ -321,7 +322,8 @@ do { \
 #if SPH_COMPACT_BLAKE_32
-// Blake-256 4 way
+// Not used
+#if 0
 #define ROUND_S_4WAY(r) do { \
    GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
@@ -342,6 +344,8 @@ do { \
            CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
 } while (0)
+#endif
 #else
 #define ROUND_S_4WAY(r) do { \
@@ -359,7 +363,6 @@ do { \
#define DECL_STATE32_4WAY \ #define DECL_STATE32_4WAY \
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \ __m128i H0, H1, H2, H3, H4, H5, H6, H7; \
__m128i S0, S1, S2, S3; \
uint32_t T0, T1; uint32_t T0, T1;
#define READ_STATE32_4WAY(state) do { \ #define READ_STATE32_4WAY(state) do { \
@@ -371,10 +374,6 @@ do { \
H5 = casti_m128i( state->H, 5 ); \ H5 = casti_m128i( state->H, 5 ); \
H6 = casti_m128i( state->H, 6 ); \ H6 = casti_m128i( state->H, 6 ); \
H7 = casti_m128i( state->H, 7 ); \ H7 = casti_m128i( state->H, 7 ); \
S0 = casti_m128i( state->S, 0 ); \
S1 = casti_m128i( state->S, 1 ); \
S2 = casti_m128i( state->S, 2 ); \
S3 = casti_m128i( state->S, 3 ); \
T0 = (state)->T0; \ T0 = (state)->T0; \
T1 = (state)->T1; \ T1 = (state)->T1; \
} while (0) } while (0)
@@ -388,17 +387,13 @@ do { \
casti_m128i( state->H, 5 ) = H5; \ casti_m128i( state->H, 5 ) = H5; \
casti_m128i( state->H, 6 ) = H6; \ casti_m128i( state->H, 6 ) = H6; \
casti_m128i( state->H, 7 ) = H7; \ casti_m128i( state->H, 7 ) = H7; \
casti_m128i( state->S, 0 ) = S0; \
casti_m128i( state->S, 1 ) = S1; \
casti_m128i( state->S, 2 ) = S2; \
casti_m128i( state->S, 3 ) = S3; \
(state)->T0 = T0; \ (state)->T0 = T0; \
(state)->T1 = T1; \ (state)->T1 = T1; \
} while (0) } while (0)
#if SPH_COMPACT_BLAKE_32 #if SPH_COMPACT_BLAKE_32
// not used // not used
#if 0
#define COMPRESS32_4WAY( rounds ) do { \ #define COMPRESS32_4WAY( rounds ) do { \
__m128i M[16]; \ __m128i M[16]; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \ __m128i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -441,6 +436,7 @@ do { \
H7 = _mm_xor_si128( _mm_xor_si128( \ H7 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S3, V7 ), VF ), H7 ); \ _mm_xor_si128( S3, V7 ), VF ), H7 ); \
} while (0) } while (0)
#endif
#else #else
@@ -508,10 +504,10 @@ do { \
    V5 = H5; \
    V6 = H6; \
    V7 = H7; \
-   V8 = _mm_xor_si128( S0, m128_const1_64( 0x243F6A88243F6A88 ) ); \
-   V9 = _mm_xor_si128( S1, m128_const1_64( 0x85A308D385A308D3 ) ); \
-   VA = _mm_xor_si128( S2, m128_const1_64( 0x13198A2E13198A2E ) ); \
-   VB = _mm_xor_si128( S3, m128_const1_64( 0x0370734403707344 ) ); \
+   V8 = m128_const1_64( 0x243F6A88243F6A88 ); \
+   V9 = m128_const1_64( 0x85A308D385A308D3 ); \
+   VA = m128_const1_64( 0x13198A2E13198A2E ); \
+   VB = m128_const1_64( 0x0370734403707344 ); \
    VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
                        m128_const1_64( 0xA4093822A4093822 ) ); \
    VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
@@ -538,14 +534,14 @@ do { \
    ROUND_S_4WAY(2); \
    ROUND_S_4WAY(3); \
    } \
-   H0 = mm128_xor4( V8, V0, S0, H0 ); \
-   H1 = mm128_xor4( V9, V1, S1, H1 ); \
-   H2 = mm128_xor4( VA, V2, S2, H2 ); \
-   H3 = mm128_xor4( VB, V3, S3, H3 ); \
-   H4 = mm128_xor4( VC, V4, S0, H4 ); \
-   H5 = mm128_xor4( VD, V5, S1, H5 ); \
-   H6 = mm128_xor4( VE, V6, S2, H6 ); \
-   H7 = mm128_xor4( VF, V7, S3, H7 ); \
+   H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \
+   H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \
+   H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \
+   H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \
+   H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \
+   H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
+   H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
+   H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
 } while (0)
#endif #endif
@@ -556,13 +552,13 @@ do { \
 #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-                   _mm256_set1_epi32( c1 ), m0 ), b ), a ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                   _mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \
    d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
    c = _mm256_add_epi32( c, d ); \
    b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-                   _mm256_set1_epi32( c0 ), m1 ), b ), a ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                   _mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \
    d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
    c = _mm256_add_epi32( c, d ); \
    b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
@@ -581,7 +577,6 @@ do { \
#define DECL_STATE32_8WAY \ #define DECL_STATE32_8WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
sph_u32 T0, T1; sph_u32 T0, T1;
#define READ_STATE32_8WAY(state) \ #define READ_STATE32_8WAY(state) \
@@ -594,10 +589,6 @@ do { \
H5 = (state)->H[5]; \ H5 = (state)->H[5]; \
H6 = (state)->H[6]; \ H6 = (state)->H[6]; \
H7 = (state)->H[7]; \ H7 = (state)->H[7]; \
S0 = (state)->S[0]; \
S1 = (state)->S[1]; \
S2 = (state)->S[2]; \
S3 = (state)->S[3]; \
T0 = (state)->T0; \ T0 = (state)->T0; \
T1 = (state)->T1; \ T1 = (state)->T1; \
} while (0) } while (0)
@@ -612,10 +603,6 @@ do { \
(state)->H[5] = H5; \ (state)->H[5] = H5; \
(state)->H[6] = H6; \ (state)->H[6] = H6; \
(state)->H[7] = H7; \ (state)->H[7] = H7; \
(state)->S[0] = S0; \
(state)->S[1] = S1; \
(state)->S[2] = S2; \
(state)->S[3] = S3; \
(state)->T0 = T0; \ (state)->T0 = T0; \
(state)->T1 = T1; \ (state)->T1 = T1; \
} while (0) } while (0)
@@ -635,10 +622,10 @@ do { \
    V5 = H5; \
    V6 = H6; \
    V7 = H7; \
-   V8 = _mm256_xor_si256( S0, m256_const1_64( 0x243F6A88243F6A88 ) ); \
-   V9 = _mm256_xor_si256( S1, m256_const1_64( 0x85A308D385A308D3 ) ); \
-   VA = _mm256_xor_si256( S2, m256_const1_64( 0x13198A2E13198A2E ) ); \
-   VB = _mm256_xor_si256( S3, m256_const1_64( 0x0370734403707344 ) ); \
+   V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
+   V9 = m256_const1_64( 0x85A308D385A308D3 ); \
+   VA = m256_const1_64( 0x13198A2E13198A2E ); \
+   VB = m256_const1_64( 0x0370734403707344 ); \
    VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
         m256_const1_64( 0xA4093822A4093822 ) ); \
    VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
@@ -682,14 +669,14 @@ do { \
    ROUND_S_8WAY(2); \
    ROUND_S_8WAY(3); \
    } \
-   H0 = mm256_xor4( V8, V0, S0, H0 ); \
-   H1 = mm256_xor4( V9, V1, S1, H1 ); \
-   H2 = mm256_xor4( VA, V2, S2, H2 ); \
-   H3 = mm256_xor4( VB, V3, S3, H3 ); \
-   H4 = mm256_xor4( VC, V4, S0, H4 ); \
-   H5 = mm256_xor4( VD, V5, S1, H5 ); \
-   H6 = mm256_xor4( VE, V6, S2, H6 ); \
-   H7 = mm256_xor4( VF, V7, S3, H7 ); \
+   H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
+   H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
+   H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
+   H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
+   H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
+   H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
+   H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
+   H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
 } while (0)
@@ -703,7 +690,6 @@ static void
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv, blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds ) const uint32_t *salt, int rounds )
{ {
__m128i zero = m128_zero;
casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 ); casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 ); casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 ); casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
@@ -712,11 +698,6 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C ); casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB ); casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 ); casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );
casti_m128i( ctx->S, 0 ) = zero;
casti_m128i( ctx->S, 1 ) = zero;
casti_m128i( ctx->S, 2 ) = zero;
casti_m128i( ctx->S, 3 ) = zero;
ctx->T0 = ctx->T1 = 0; ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0; ctx->ptr = 0;
ctx->rounds = rounds; ctx->rounds = rounds;
@@ -824,7 +805,6 @@ static void
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv, blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds ) const sph_u32 *salt, int rounds )
{ {
__m256i zero = m256_zero;
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 ); casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 ); casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 ); casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
@@ -833,10 +813,6 @@ blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C ); casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB ); casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 ); casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
casti_m256i( sc->S, 0 ) = zero;
casti_m256i( sc->S, 1 ) = zero;
casti_m256i( sc->S, 2 ) = zero;
casti_m256i( sc->S, 3 ) = zero;
sc->T0 = sc->T1 = 0; sc->T0 = sc->T1 = 0;
sc->ptr = 0; sc->ptr = 0;
sc->rounds = rounds; sc->rounds = rounds;

View File

@@ -4,13 +4,59 @@
*/ */
#include "blake2b-gate.h" #include "blake2b-gate.h"
#if defined(BLAKE2B_4WAY)
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
#include "blake2b-hash-4way.h" #include "blake2b-hash-4way.h"
#if defined(BLAKE2B_8WAY)
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
blake2b_8way_init( &ctx );
blake2b_8way_update( &ctx, vdata, 80 );
blake2b_8way_final( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2B_4WAY)
// Function not used, code inlined. // Function not used, code inlined.
void blake2b_4way_hash(void *output, const void *input) void blake2b_4way_hash(void *output, const void *input)
{ {

View File

@@ -1,15 +1,19 @@
 #include "blake2b-gate.h"
 bool register_blake2b_algo( algo_gate_t* gate )
 {
-#if defined(BLAKE2B_4WAY)
+#if defined(BLAKE2B_8WAY)
+  gate->scanhash  = (void*)&scanhash_blake2b_8way;
+//  gate->hash      = (void*)&blake2b_8way_hash;
+#elif defined(BLAKE2B_4WAY)
   gate->scanhash  = (void*)&scanhash_blake2b_4way;
   gate->hash      = (void*)&blake2b_4way_hash;
 #else
   gate->scanhash  = (void*)&scanhash_blake2b;
   gate->hash      = (void*)&blake2b_hash;
 #endif
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
   return true;
 };

View File

@@ -4,13 +4,21 @@
 #include <stdint.h>
 #include "algo-gate-api.h"
-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define BLAKE2B_8WAY
+#elif defined(__AVX2__)
   #define BLAKE2B_4WAY
 #endif
 bool register_blake2b_algo( algo_gate_t* gate );
-#if defined(BLAKE2B_4WAY)
+#if defined(BLAKE2B_8WAY)
+//void blake2b_8way_hash( void *state, const void *input );
+int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );
+#elif defined(BLAKE2B_4WAY)
 void blake2b_4way_hash( void *state, const void *input );
 int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,

View File

@@ -33,6 +33,178 @@
#include "blake2b-hash-4way.h" #include "blake2b-hash-4way.h"
static const uint8_t sigma[12][16] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define B2B8W_G(a, b, c, d, x, y) \
{ \
v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), x ); \
v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 32 ); \
v[c] = _mm512_add_epi64( v[c], v[d] ); \
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 24 ); \
v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), y ); \
v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 16 ); \
v[c] = _mm512_add_epi64( v[c], v[d] ); \
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
}
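For reference, each 64-bit lane of B2B8W_G above applies the standard BLAKE2b G function. An illustrative scalar sketch (not part of the commit):

#include <stdint.h>

static inline uint64_t ror64( uint64_t x, int n )
{  return ( x >> n ) | ( x << ( 64 - n ) );  }

// One BLAKE2b G step on the scalar working vector v[16] with message words x, y.
void b2b_g( uint64_t v[16], int a, int b, int c, int d, uint64_t x, uint64_t y )
{
   v[a] = v[a] + v[b] + x;
   v[d] = ror64( v[d] ^ v[a], 32 );
   v[c] = v[c] + v[d];
   v[b] = ror64( v[b] ^ v[c], 24 );
   v[a] = v[a] + v[b] + y;
   v[d] = ror64( v[d] ^ v[a], 16 );
   v[c] = v[c] + v[d];
   v[b] = ror64( v[b] ^ v[c], 63 );
}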
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
{
__m512i v[16], m[16];
v[ 0] = ctx->h[0];
v[ 1] = ctx->h[1];
v[ 2] = ctx->h[2];
v[ 3] = ctx->h[3];
v[ 4] = ctx->h[4];
v[ 5] = ctx->h[5];
v[ 6] = ctx->h[6];
v[ 7] = ctx->h[7];
v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );
v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
v[12] = m512_const1_64( 0x510E527FADE682D1 );
v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
v[15] = m512_const1_64( 0x5BE0CD19137E2179 );
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
if ( last )
v[14] = mm512_not( v[14] );
m[ 0] = ctx->b[ 0];
m[ 1] = ctx->b[ 1];
m[ 2] = ctx->b[ 2];
m[ 3] = ctx->b[ 3];
m[ 4] = ctx->b[ 4];
m[ 5] = ctx->b[ 5];
m[ 6] = ctx->b[ 6];
m[ 7] = ctx->b[ 7];
m[ 8] = ctx->b[ 8];
m[ 9] = ctx->b[ 9];
m[10] = ctx->b[10];
m[11] = ctx->b[11];
m[12] = ctx->b[12];
m[13] = ctx->b[13];
m[14] = ctx->b[14];
m[15] = ctx->b[15];
for ( int i = 0; i < 12; i++ )
{
B2B8W_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
B2B8W_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
B2B8W_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
B2B8W_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
B2B8W_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
B2B8W_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
B2B8W_G( 2, 7, 8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
}
ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
}
int blake2b_8way_init( blake2b_8way_ctx *ctx )
{
size_t i;
ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );
ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
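// 0x01010020 is word 0 of the BLAKE2b parameter block: depth 1, fanout 1,
// key length 0, digest length 0x20 (32 bytes).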
ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );
ctx->t[0] = 0;
ctx->t[1] = 0;
ctx->c = 0;
ctx->outlen = 32;
for ( i = 0; i < 16; i++ )
ctx->b[i] = m512_zero;
return 0;
}
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
size_t inlen )
{
__m512i* in =(__m512i*)input;
size_t i, c;
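// ctx->c counts bytes within the current block; >>3 converts it to a
// 64-bit-word index into the interleaved buffer b[].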
c = ctx->c >> 3;
for ( i = 0; i < (inlen >> 3); i++ )
{
if ( ctx->c == 128 )
{
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_8way_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
ctx->c += 8;
}
}
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
while ( ctx->c < 128 )
{
ctx->b[c++] = m512_zero;
ctx->c += 8;
}
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
casti_m512i( out, 0 ) = ctx->h[0];
casti_m512i( out, 1 ) = ctx->h[1];
casti_m512i( out, 2 ) = ctx->h[2];
casti_m512i( out, 3 ) = ctx->h[3];
}
#endif
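A minimal usage sketch (illustrative only, not part of the commit; assumes this file's blake2b_8way_* API and an AVX-512 build): eight independent inputs are pre-interleaved 64 bits at a time into one buffer, length arguments are per-lane bytes, and the eight 32-byte digests come out interleaved the same way.

#include "blake2b-hash-4way.h"

// Hypothetical helper: hash eight 80-byte inputs already interleaved 8x64
// (8*80 = 640 bytes, 64-byte aligned); out receives 8 interleaved digests.
void blake2b_8way_hash80( void *out, const void *vdata )
{
   blake2b_8way_ctx ctx;
   blake2b_8way_init( &ctx );            // fixed 32-byte digest per lane
   blake2b_8way_update( &ctx, vdata, 80 );
   blake2b_8way_final( &ctx, out );
}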
#if defined(__AVX2__) #if defined(__AVX2__)
// G Mixing function. // G Mixing function.
@@ -61,21 +233,6 @@ static const uint64_t blake2b_iv[8] = {
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last ) static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
{ {
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
__m256i v[16], m[16]; __m256i v[16], m[16];
v[ 0] = ctx->h[0]; v[ 0] = ctx->h[0];
@@ -118,7 +275,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
m[14] = ctx->b[14]; m[14] = ctx->b[14];
m[15] = ctx->b[15]; m[15] = ctx->b[15];
for ( i = 0; i < 12; i++ ) for ( int i = 0; i < 12; i++ )
{ {
B2B_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] ); B2B_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
B2B_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] ); B2B_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );

View File

@@ -2,8 +2,6 @@
#ifndef __BLAKE2B_HASH_4WAY_H__ #ifndef __BLAKE2B_HASH_4WAY_H__
#define __BLAKE2B_HASH_4WAY_H__ #define __BLAKE2B_HASH_4WAY_H__
#if defined(__AVX2__)
#include "simd-utils.h" #include "simd-utils.h"
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
@@ -16,14 +14,34 @@
#define ALIGN(x) __attribute__((aligned(x))) #define ALIGN(x) __attribute__((aligned(x)))
#endif #endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
ALIGN(128) typedef struct {
__m512i b[16]; // input buffer
__m512i h[8]; // chained state
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_8way_ctx;
int blake2b_8way_init( blake2b_8way_ctx *ctx );
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
#endif
#if defined(__AVX2__)
 // state context
-ALIGN(64) typedef struct {
+ALIGN(128) typedef struct {
    __m256i b[16]; // input buffer
    __m256i h[8];  // chained state
    uint64_t t[2]; // total number of bytes
    size_t c;      // pointer for b[]
    size_t outlen; // digest size
-} blake2b_4way_ctx __attribute__((aligned(64)));
+} blake2b_4way_ctx;
 int blake2b_4way_init( blake2b_4way_ctx *ctx );
 void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,

View File

@@ -3,22 +3,72 @@
 #include <string.h>
 #include <stdint.h>
-#if defined(BLAKE2S_8WAY)
+#if defined(BLAKE2S_16WAY)
static __thread blake2s_16way_state blake2s_16w_ctx;
void blake2s_16way_hash( void *output, const void *input )
{
blake2s_16way_state ctx;
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
blake2s_16way_update( &ctx, input + (64<<4), 16 );
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
uint32_t hash[8*16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32(
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
pdata[19] = n;
blake2s_16way_hash( hash, vdata );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_16x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 16;
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2S_8WAY)
 static __thread blake2s_8way_state blake2s_8w_ctx;
 void blake2s_8way_hash( void *output, const void *input )
 {
-   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
    blake2s_8way_state ctx;
    memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
    blake2s_8way_update( &ctx, input + (64<<3), 16 );
-   blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
-   dintrlv_8x32( output, output+ 32, output+ 64, output+ 96,
-                 output+128, output+160, output+192, output+224,
-                 vhash, 256 );
+   blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
 }
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
@@ -26,13 +76,15 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
 {
    uint32_t vdata[20*8] __attribute__ ((aligned (64)));
    uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<3]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    __m256i *noncev = (__m256i*)vdata + 19;   // aligned
    uint32_t n = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;
    mm256_bswap32_intrlv80_8x32( vdata, pdata );
    blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
@@ -45,16 +97,17 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
       blake2s_8way_hash( hash, vdata );
-      for ( int i = 0; i < 8; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg )
-      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-      {
-         pdata[19] = n+i;
-         submit_lane_solution( work, hash+(i<<3), mythr, i );
-      }
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( hash7[lane] <= Htarg ) )
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         {
+            pdata[19] = n + lane;
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
       n += 8;
    } while ( (n < max_nonce) && !work_restart[thr_id].restart );
    *hashes_done = n - first_nonce + 1;
@@ -67,15 +120,10 @@ static __thread blake2s_4way_state blake2s_4w_ctx;
 void blake2s_4way_hash( void *output, const void *input )
 {
-   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
    blake2s_4way_state ctx;
    memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
    blake2s_4way_update( &ctx, input + (64<<2), 16 );
-   blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
-   dintrlv_4x32( output, output+32, output+64, output+96,
-                 vhash, 256 );
+   blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
 }
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
@@ -83,13 +131,15 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
 {
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<2]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    __m128i *noncev = (__m128i*)vdata + 19;   // aligned
    uint32_t n = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;
    mm128_bswap32_intrlv80_4x32( vdata, pdata );
    blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
@@ -101,15 +151,16 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
       blake2s_4way_hash( hash, vdata );
-      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg )
-      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-      {
-         pdata[19] = n+i;
-         submit_lane_solution( work, hash+(i<<3), mythr, i );
-      }
+      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
+      {
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+            pdata[19] = n + lane;
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
       n += 4;
    } while ( (n < max_nonce) && !work_restart[thr_id].restart );
    *hashes_done = n - first_nonce + 1;

View File

@@ -2,7 +2,11 @@
 bool register_blake2s_algo( algo_gate_t* gate )
 {
-#if defined(BLAKE2S_8WAY)
+#if defined(BLAKE2S_16WAY)
+  gate->scanhash  = (void*)&scanhash_blake2s_16way;
+  gate->hash      = (void*)&blake2s_16way_hash;
+#elif defined(BLAKE2S_8WAY)
+//#if defined(BLAKE2S_8WAY)
   gate->scanhash  = (void*)&scanhash_blake2s_8way;
   gate->hash      = (void*)&blake2s_8way_hash;
 #elif defined(BLAKE2S_4WAY)
@@ -12,7 +16,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
   gate->scanhash  = (void*)&scanhash_blake2s;
   gate->hash      = (void*)&blake2s_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
   return true;
 };

View File

@@ -8,13 +8,26 @@
 #if defined(__SSE2__)
   #define BLAKE2S_4WAY
 #endif
 #if defined(__AVX2__)
   #define BLAKE2S_8WAY
 #endif
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define BLAKE2S_16WAY
+#endif
 bool register_blake2s_algo( algo_gate_t* gate );
-#if defined(BLAKE2S_8WAY)
+#if defined(BLAKE2S_16WAY)
+void blake2s_16way_hash( void *state, const void *input );
+int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
+                            uint64_t *hashes_done, struct thr_info *mythr );
+#elif defined (BLAKE2S_8WAY)
+//#if defined(BLAKE2S_8WAY)
 void blake2s_8way_hash( void *state, const void *input );
 int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,

View File

@@ -165,13 +165,13 @@ do { \
 //
 // Supported:
 //    64 + 16 bytes  (blake2s with midstate optimization)
-//    80 bytes without midstate  (blake2s without midstate optimization)
+//    80 bytes  (blake2s without midstate optimization)
 //    Any multiple of 64 bytes in one shot  (x25x)
 //
 // Unsupported:
-//    Stream of 64 byte blocks one at a time.
+//    Stream of full 64 byte blocks one at a time.
+//
-// use for part blocks or when streaming more data
+// use only when streaming more data or final block not full.
 int blake2s_4way_update( blake2s_4way_state *S, const void *in,
                          uint64_t inlen )
 {
@@ -466,6 +466,168 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
#endif // __AVX2__ #endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Blake2s-256 16 way
int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
{
__m512i m[16];
__m512i v[16];
memcpy_512( m, block, 16 );
memcpy_512( v, S->h, 8 );
v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
m512_const1_64( 0x510E527F510E527FULL ) );
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
m512_const1_64( 0x9B05688C9B05688CULL ) );
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );
#define G16W( sigma0, sigma1, a, b, c, d) \
do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi32( c, d ); \
b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s1 ] ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
c = _mm512_add_epi32( c, d ); \
b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
} while(0)
#define ROUND16W(r) \
do { \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
G16W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
G16W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
G16W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
G16W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
G16W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
} while(0)
ROUND16W( 0 );
ROUND16W( 1 );
ROUND16W( 2 );
ROUND16W( 3 );
ROUND16W( 4 );
ROUND16W( 5 );
ROUND16W( 6 );
ROUND16W( 7 );
ROUND16W( 8 );
ROUND16W( 9 );
for( size_t i = 0; i < 8; ++i )
S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
#undef G16W
#undef ROUND16W
return 0;
}
int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
{
blake2s_nway_param P[1];
P->digest_length = outlen;
P->key_length = 0;
P->fanout = 1;
P->depth = 1;
P->leaf_length = 0;
*((uint64_t*)(P->node_offset)) = 0;
P->node_depth = 0;
P->inner_length = 0;
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_16way_state ) );
S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );
uint32_t *p = ( uint32_t * )( P );
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) );
return 0;
}
int blake2s_16way_update( blake2s_16way_state *S, const void *in,
uint64_t inlen )
{
__m512i *input = (__m512i*)in;
__m512i *buf = (__m512i*)S->buf;
const int bsize = BLAKE2S_BLOCKBYTES;
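// Byte counts below are per lane; >>2 converts them to 32-bit-word offsets,
// i.e. to __m512i elements of the 16-way interleaved buffer.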
while( inlen > 0 )
{
size_t left = S->buflen;
if( inlen >= bsize - left )
{
memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 );
S->buflen += bsize - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
blake2s_16way_compress( S, buf );
S->buflen = 0;
input += ( bsize >> 2 );
inlen -= bsize;
}
else
{
memcpy_512( buf + ( left>>2 ), input, inlen>>2 );
S->buflen += (size_t) inlen;
input += ( inlen>>2 );
inlen -= inlen;
}
}
return 0;
}
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen )
{
__m512i *buf = (__m512i*)S->buf;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node )
S->f[1] = ~0U;
S->f[0] = ~0U;
memset_zero_512( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
blake2s_16way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m512i( out, i ) = S->h[ i ];
return 0;
}
#endif // AVX512
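For context, an illustrative sketch (not code from the commit; assumes an AVX-512 build and that this file's header is blake2s-hash-4way.h) of the 64 + 16 byte midstate pattern described earlier in this file, applied to the 16-way interface: hash the constant first 64 bytes of the 80-byte header once, then reuse that state for every nonce.

#include <string.h>
#include "blake2s-hash-4way.h"

static blake2s_16way_state midstate;

// Hash the first 64 bytes of the 16x32-interleaved header once per work unit.
void blake2s_16way_prehash( const void *vdata )
{
   blake2s_16way_init( &midstate, BLAKE2S_OUTBYTES );
   blake2s_16way_update( &midstate, vdata, 64 );
}

// Per nonce batch: copy the midstate and feed only the final 16 bytes.
void blake2s_16way_hash80( void *out, const void *vdata )
{
   blake2s_16way_state ctx;
   memcpy( &ctx, &midstate, sizeof ctx );
   blake2s_16way_update( &ctx, vdata + (64<<4), 16 );
   blake2s_16way_final( &ctx, out, BLAKE2S_OUTBYTES );
}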
#if 0 #if 0
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
{ {

View File

@@ -64,7 +64,7 @@ typedef struct __blake2s_nway_param
 ALIGN( 64 ) typedef struct __blake2s_4way_state
 {
    __m128i h[8];
-   uint8_t buf[ 2 * BLAKE2S_BLOCKBYTES * 4 ];
+   uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
    uint32_t t[2];
    uint32_t f[2];
    size_t buflen;
@@ -75,13 +75,16 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
 int blake2s_4way_update( blake2s_4way_state *S, const void *in,
                          uint64_t inlen );
 int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
+int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
+                              const void *input, uint64_t inlen );
 #if defined(__AVX2__)
 ALIGN( 64 ) typedef struct __blake2s_8way_state
 {
    __m256i h[8];
-   uint8_t buf[ 2 * BLAKE2S_BLOCKBYTES * 8 ];
+   uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
    uint32_t t[2];
    uint32_t f[2];
    size_t buflen;
@@ -92,9 +95,27 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
 int blake2s_8way_update( blake2s_8way_state *S, const void *in,
                          uint64_t inlen );
 int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
-int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
-                              const void *input, uint64_t inlen );
+//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
+//                              const void *input, uint64_t inlen );
+#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
ALIGN( 128 ) typedef struct __blake2s_16way_state
{
__m512i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
uint8_t last_node;
} blake2s_16way_state ;
int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen );
int blake2s_16way_update( blake2s_16way_state *S, const void *in,
uint64_t inlen );
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
#endif #endif

View File

@@ -78,7 +78,7 @@ void bmw256_4way_addbits_and_close(
 // BMW-256 8 way 32
 typedef struct {
-   __m256i buf[64];
+   __m256i buf[16];
    __m256i H[16];
    size_t ptr;
    uint32_t bit_count;  // assume bit_count fits in 32 bits
@@ -121,7 +121,7 @@ typedef struct {
    __m256i H[16];
    size_t ptr;
    sph_u64 bit_count;
-} bmw_4way_big_context;
+} bmw_4way_big_context __attribute__((aligned(128)));
 typedef bmw_4way_big_context bmw512_4way_context;
@@ -137,6 +137,22 @@ void bmw512_4way_addbits_and_close(
#endif // __AVX2__ #endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
__m512i buf[16];
__m512i H[16];
size_t ptr;
uint64_t bit_count;
} bmw512_8way_context __attribute__((aligned(128)));
void bmw512_8way_init( bmw512_8way_context *ctx );
void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
size_t len );
void bmw512_8way_close( bmw512_8way_context *ctx, void *dst );
#endif // AVX512
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@@ -137,165 +137,151 @@ static const uint32_t IV256[] = {
ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \ ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) ) add_elt_s( M, H, (i)-16 ) )
// Expressions are grouped using associativity to reduce CPU dependencies,
// resulting in some sign changes compared to the reference code.
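A toy scalar check (illustrative only, not from the source) of the regrouping described above: with wrap-around 32-bit arithmetic the result is unchanged, but the last two terms can be combined independently of the running sum, and a trailing "+ d) - e" becomes "+ (d - e)", which is where the sign changes come from.

#include <assert.h>
#include <stdint.h>

int main( void )
{
   uint32_t a = 0xdeadbeef, b = 0x12345678, c = 0x0badf00d,
            d = 0xfeedface, e = 0xcafebabe;
   uint32_t ref       = ( ( ( a - b ) + c ) + d ) - e;   /* reference grouping */
   uint32_t regrouped = ( ( a - b ) + c ) + ( d - e );   /* shorter add chain  */
   assert( ref == regrouped );
   return 0;
}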
#define Ws0 \ #define Ws0 \
_mm_add_epi32( \ _mm_add_epi32( \
_mm_add_epi32( \ _mm_add_epi32( \
_mm_add_epi32( \ _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \ _mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \ _mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \ _mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[13], H[13] ) ), \ _mm_xor_si128( M[14], H[14] ) ) )
_mm_xor_si128( M[14], H[14] ) )
#define Ws1 \ #define Ws1 \
_mm_sub_epi32( \ _mm_add_epi32( \
_mm_add_epi32( \ _mm_add_epi32( \
_mm_add_epi32( \ _mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \ _mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \ _mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \ _mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
_mm_xor_si128( M[14], H[14] ) ), \ _mm_xor_si128( M[15], H[15] ) ) )
_mm_xor_si128( M[15], H[15] ) )
#define Ws2 \ #define Ws2 \
_mm_add_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_add_epi32( \
_mm_add_epi32( \ _mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ _mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \ _mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \ _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[12], H[12] ) ), \ _mm_xor_si128( M[15], H[15] ) ) )
_mm_xor_si128( M[15], H[15] ) )
#define Ws3 \ #define Ws3 \
_mm_add_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_add_epi32( \
_mm_add_epi32( \ _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ _mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \ _mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \ _mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[10], H[10] ) ), \ _mm_xor_si128( M[13], H[13] ) ) )
_mm_xor_si128( M[13], H[13] ) )
#define Ws4 \ #define Ws4 \
_mm_sub_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_add_epi32( \
_mm_add_epi32( \ _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ _mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \ _mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \ _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[11], H[11] ) ), \ _mm_xor_si128( M[14], H[14] ) ) )
_mm_xor_si128( M[14], H[14] ) )
#define Ws5 \ #define Ws5 \
_mm_add_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_add_epi32( \
_mm_add_epi32( \ _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ _mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \ _mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \ _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[12], H[12] ) ), \ _mm_xor_si128( M[15], H[15] ) ) )
_mm_xor_si128( M[15], H[15] ) )
#define Ws6 \ #define Ws6 \
_mm_add_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \ _mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \ _mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \ _mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[11], H[11] ) ), \ _mm_xor_si128( M[13], H[13] ) ) )
_mm_xor_si128( M[13], H[13] ) )
#define Ws7 \ #define Ws7 \
_mm_sub_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ _mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \ _mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \ _mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[12], H[12] ) ), \ _mm_xor_si128( M[14], H[14] ) ) )
_mm_xor_si128( M[14], H[14] ) )
#define Ws8 \ #define Ws8 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[13], H[13] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define Ws9 \
_mm_add_epi32( \ _mm_add_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( \
_mm_add_epi32( \ _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \ _mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \ _mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \ _mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \ _mm_xor_si128( M[15], H[15] ) ) )
_mm_xor_si128( M[14], H[14] ) ) #define Ws9 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[14], H[14] ) ) )
#define Ws10 \ #define Ws10 \
_mm_add_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ _mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \ _mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \ _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \ _mm_xor_si128( M[15], H[15] ) ) )
_mm_xor_si128( M[15], H[15] ) )
#define Ws11 \ #define Ws11 \
_mm_add_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \ _mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \ _mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \ _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \ _mm_xor_si128( M[ 9], H[ 9] ) ) )
_mm_xor_si128( M[ 9], H[ 9] ) )
#define Ws12 \ #define Ws12 \
_mm_add_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \ _mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \ _mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \ _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \ _mm_xor_si128( M[10], H[10] ) ) )
_mm_xor_si128( M[10], H[10] ) )
#define Ws13 \ #define Ws13 \
_mm_add_epi32( \ _mm_add_epi32( \
_mm_add_epi32( \ _mm_add_epi32( \
_mm_add_epi32( \ _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \ _mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \ _mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \ _mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[10], H[10] ) ), \ _mm_xor_si128( M[11], H[11] ) ) )
_mm_xor_si128( M[11], H[11] ) )
#define Ws14 \ #define Ws14 \
_mm_sub_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_add_epi32( \
_mm_add_epi32( \ _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \ _mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \ _mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \ _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[11], H[11] ) ), \ _mm_xor_si128( M[12], H[12] ) ) )
_mm_xor_si128( M[12], H[12] ) )
#define Ws15 \ #define Ws15 \
_mm_add_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( \
_mm_sub_epi32( \ _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \ _mm_xor_si128( M[ 4], H[4] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \ _mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \ _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \ _mm_xor_si128( M[13], H[13] ) ) )
_mm_xor_si128( M[13], H[13] ) )
void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] ) void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
@@ -700,163 +686,148 @@ bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#define W8s0 \ #define W8s0 \
_mm256_add_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \ _mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \ _mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \ _mm256_add_epi32( _mm256_xor_si256( M[13], H[13] ), \
_mm256_xor_si256( M[13], H[13] ) ), \ _mm256_xor_si256( M[14], H[14] ) ) )
_mm256_xor_si256( M[14], H[14] ) )
#define W8s1 \ #define W8s1 \
_mm256_sub_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[ 6], H[ 6] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[ 6], H[ 6] ), \ _mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \ _mm256_xor_si256( M[11], H[11] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \ _mm256_sub_epi32( _mm256_xor_si256( M[14], H[14] ), \
_mm256_xor_si256( M[14], H[14] ) ), \ _mm256_xor_si256( M[15], H[15] ) ) )
_mm256_xor_si256( M[15], H[15] ) )
#define W8s2 \ #define W8s2 \
_mm256_add_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_add_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_add_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ _mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \ _mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \ _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[12], H[12] ) ), \ _mm256_xor_si256( M[15], H[15] ) ) )
_mm256_xor_si256( M[15], H[15] ) )
#define W8s3 \ #define W8s3 \
_mm256_add_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ _mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \ _mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \ _mm256_sub_epi32( _mm256_xor_si256( M[10], H[10] ), \
_mm256_xor_si256( M[10], H[10] ) ), \ _mm256_xor_si256( M[13], H[13] ) ) )
_mm256_xor_si256( M[13], H[13] ) )
#define W8s4 \ #define W8s4 \
_mm256_sub_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ _mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \ _mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \ _mm256_add_epi32( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[11], H[11] ) ), \ _mm256_xor_si256( M[14], H[14] ) ) )
_mm256_xor_si256( M[14], H[14] ) )
#define W8s5 \ #define W8s5 \
_mm256_add_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \ _mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \ _mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \ _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[12], H[12] ) ), \ _mm256_xor_si256( M[15], H[15] ) ) )
_mm256_xor_si256( M[15], H[15] ) )
#define W8s6 \ #define W8s6 \
_mm256_add_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[ 4], H[ 4] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[ 4], H[ 4] ), \ _mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \ _mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \ _mm256_sub_epi32( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[11], H[11] ) ), \ _mm256_xor_si256( M[13], H[13] ) ) )
_mm256_xor_si256( M[13], H[13] ) )
#define W8s7 \ #define W8s7 \
_mm256_sub_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ _mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \ _mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \ _mm256_add_epi32( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[12], H[12] ) ), \ _mm256_xor_si256( M[14], H[14] ) ) )
_mm256_xor_si256( M[14], H[14] ) )
#define W8s8 \ #define W8s8 \
_mm256_sub_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \ _mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \ _mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \ _mm256_sub_epi32( _mm256_xor_si256( M[13], H[13] ), \
_mm256_xor_si256( M[13], H[13] ) ), \ _mm256_xor_si256( M[15], H[15] ) ) )
_mm256_xor_si256( M[15], H[15] ) )
#define W8s9 \ #define W8s9 \
_mm256_add_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ _mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \ _mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \ _mm256_sub_epi32( _mm256_xor_si256( M[ 7], H[ 7] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \ _mm256_xor_si256( M[14], H[14] ) ) )
_mm256_xor_si256( M[14], H[14] ) )
#define W8s10 \ #define W8s10 \
_mm256_add_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \ _mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \ _mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \ _mm256_sub_epi32( _mm256_xor_si256( M[ 7], H[ 7] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \ _mm256_xor_si256( M[15], H[15] ) ) )
_mm256_xor_si256( M[15], H[15] ) )
#define W8s11 \ #define W8s11 \
_mm256_add_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \ _mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \ _mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \ _mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \ _mm256_xor_si256( M[ 9], H[ 9] ) ) )
_mm256_xor_si256( M[ 9], H[ 9] ) )
#define W8s12 \ #define W8s12 \
_mm256_add_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ _mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \ _mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \ _mm256_sub_epi32( _mm256_xor_si256( M[ 9], H[ 9] ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \ _mm256_xor_si256( M[10], H[10] ) ) )
_mm256_xor_si256( M[10], H[10] ) )
#define W8s13 \ #define W8s13 \
_mm256_add_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_add_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_add_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \ _mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \ _mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \ _mm256_add_epi32( _mm256_xor_si256( M[10], H[10] ), \
_mm256_xor_si256( M[10], H[10] ) ), \ _mm256_xor_si256( M[11], H[11] ) ) )
_mm256_xor_si256( M[11], H[11] ) )
#define W8s14 \ #define W8s14 \
_mm256_sub_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_add_epi32( \
_mm256_add_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \ _mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \ _mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \ _mm256_add_epi32( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[11], H[11] ) ), \ _mm256_xor_si256( M[12], H[12] ) ) )
_mm256_xor_si256( M[12], H[12] ) )
#define W8s15 \ #define W8s15 \
_mm256_add_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( \
_mm256_sub_epi32( \ _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \
_mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \ _mm256_xor_si256( M[ 4], H[4] ) ), \
_mm256_xor_si256( M[ 4], H[4] ) ), \ _mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \ _mm256_sub_epi32( _mm256_xor_si256( M[ 9], H[ 9] ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \ _mm256_xor_si256( M[13], H[13] ) ) )
_mm256_xor_si256( M[13], H[13] ) )
void compress_small_8way( const __m256i *M, const __m256i H[16],
                          __m256i dH[16] )


@@ -1,13 +1,66 @@
#include "bmw512-gate.h" #include "bmw512-gate.h"
#ifdef BMW512_4WAY
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
//#include "sph_keccak.h" //#include "sph_keccak.h"
#include "bmw-hash-4way.h" #include "bmw-hash-4way.h"
#if defined(BMW512_8WAY)
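// BMW-512 8-way: eight block headers are hashed in parallel using AVX-512 vectors.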
void bmw512hash_8way(void *state, const void *input)
{
bmw512_8way_context ctx;
bmw512_8way_init( &ctx );
bmw512_8way_update( &ctx, input, 80 );
bmw512_8way_close( &ctx, state );
}
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t hash[16*8] __attribute__ ((aligned (64)));   // 64-byte aligned for 512-bit vector stores
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
// const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
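// Byte-swap the 80-byte header and interleave it across the 8 lanes (64-bit granularity).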
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0 ,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
bmw512hash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;   // 8 nonces are tested per pass
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BMW512_4WAY)
//#ifdef BMW512_4WAY
void bmw512hash_4way(void *state, const void *input)
{
   bmw512_4way_context ctx;


@@ -2,9 +2,12 @@
 bool register_bmw512_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
   opt_target_factor = 256.0;
-#if defined (BMW512_4WAY)
+#if defined (BMW512_8WAY)
+  gate->scanhash = (void*)&scanhash_bmw512_8way;
+  gate->hash = (void*)&bmw512hash_8way;
+#elif defined (BMW512_4WAY)
   gate->scanhash = (void*)&scanhash_bmw512_4way;
   gate->hash = (void*)&bmw512hash_4way;
 #else


@@ -1,23 +1,33 @@
 #ifndef BMW512_GATE_H__
-#define BMW512_GATE_H__
+#define BMW512_GATE_H__ 1
 #include "algo-gate-api.h"
 #include <stdint.h>
-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#define BMW512_8WAY 1
+#elif defined(__AVX2__)
 #define BMW512_4WAY 1
 #endif
-#if defined(BMW512_4WAY)
+#if defined(BMW512_8WAY)
+void bmw512hash_8way( void *state, const void *input );
+int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+#elif defined(BMW512_4WAY)
 void bmw512hash_4way( void *state, const void *input );
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
-#endif
+#else
 void bmw512hash( void *state, const void *input );
 int scanhash_bmw512( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
 #endif
#endif


@@ -556,7 +556,7 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
    compress_big_2way( buf, h, h2 );
    memcpy_128( buf, h2, 16 );
    compress_big_2way( buf, final_b2, h1 );
-   memcpy( (__m128i*)dst, h1+16, 8 );
+   memcpy( (__m128i*)dst, h1+8, 8 );
 }
 #endif // __SSE2__
@@ -636,165 +636,152 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \ sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
add_elt_b( M, H, (i)-16 ) ) add_elt_b( M, H, (i)-16 ) )
#define Wb0 \ #define Wb0 \
_mm256_add_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ _mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \ _mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \ _mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \
_mm256_xor_si256( M[13], H[13] ) ), \ _mm256_xor_si256( M[14], H[14] ) ) )
_mm256_xor_si256( M[14], H[14] ) )
#define Wb1 \ #define Wb1 \
_mm256_sub_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \ _mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \ _mm256_xor_si256( M[11], H[11] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \ _mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \
_mm256_xor_si256( M[14], H[14] ) ), \ _mm256_xor_si256( M[15], H[15] ) ) )
_mm256_xor_si256( M[15], H[15] ) )
#define Wb2 \ #define Wb2 \
_mm256_add_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ _mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \ _mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \ _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[12], H[12] ) ), \ _mm256_xor_si256( M[15], H[15] ) ) )
_mm256_xor_si256( M[15], H[15] ) )
#define Wb3 \ #define Wb3 \
_mm256_add_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ _mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \ _mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \ _mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \
_mm256_xor_si256( M[10], H[10] ) ), \ _mm256_xor_si256( M[13], H[13] ) ) )
_mm256_xor_si256( M[13], H[13] ) )
#define Wb4 \ #define Wb4 \
_mm256_sub_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ _mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \ _mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \ _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[11], H[11] ) ), \ _mm256_xor_si256( M[14], H[14] ) ) )
_mm256_xor_si256( M[14], H[14] ) )
#define Wb5 \ #define Wb5 \
_mm256_add_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ _mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \ _mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \ _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[12], H[12] ) ), \ _mm256_xor_si256( M[15], H[15] ) ) )
_mm256_xor_si256( M[15], H[15] ) )
#define Wb6 \ #define Wb6 \
_mm256_add_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \ _mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \ _mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \ _mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[11], H[11] ) ), \ _mm256_xor_si256( M[13], H[13] ) ) )
_mm256_xor_si256( M[13], H[13] ) )
#define Wb7 \ #define Wb7 \
_mm256_sub_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ _mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \ _mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \ _mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[12], H[12] ) ), \ _mm256_xor_si256( M[14], H[14] ) ) )
_mm256_xor_si256( M[14], H[14] ) )
#define Wb8 \ #define Wb8 \
_mm256_sub_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ _mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \ _mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \ _mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \
_mm256_xor_si256( M[13], H[13] ) ), \ _mm256_xor_si256( M[15], H[15] ) ) )
_mm256_xor_si256( M[15], H[15] ) )
#define Wb9 \ #define Wb9 \
_mm256_add_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ _mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \ _mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \ _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \ _mm256_xor_si256( M[14], H[14] ) ) )
_mm256_xor_si256( M[14], H[14] ) )
#define Wb10 \ #define Wb10 \
_mm256_add_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ _mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \ _mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \ _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \ _mm256_xor_si256( M[15], H[15] ) ) )
_mm256_xor_si256( M[15], H[15] ) )
#define Wb11 \ #define Wb11 \
_mm256_add_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ _mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \ _mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \ _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \ _mm256_xor_si256( M[ 9], H[ 9] ) ) )
_mm256_xor_si256( M[ 9], H[ 9] ) )
#define Wb12 \ #define Wb12 \
_mm256_add_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ _mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \ _mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \ _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \ _mm256_xor_si256( M[10], H[10] ) ) )
_mm256_xor_si256( M[10], H[10] ) )
#define Wb13 \ #define Wb13 \
_mm256_add_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ _mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \ _mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \ _mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \
_mm256_xor_si256( M[10], H[10] ) ), \ _mm256_xor_si256( M[11], H[11] ) ) )
_mm256_xor_si256( M[11], H[11] ) )
#define Wb14 \ #define Wb14 \
_mm256_sub_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_add_epi64( \
_mm256_add_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ _mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \ _mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \ _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
_mm256_xor_si256( M[11], H[11] ) ), \ _mm256_xor_si256( M[12], H[12] ) ) )
_mm256_xor_si256( M[12], H[12] ) )
#define Wb15 \ #define Wb15 \
_mm256_add_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( \
_mm256_sub_epi64( \ _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ _mm256_xor_si256( M[ 4], H[4] ) ), \
_mm256_xor_si256( M[ 4], H[4] ) ), \ _mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \ _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \ _mm256_xor_si256( M[13], H[13] ) ) )
_mm256_xor_si256( M[13], H[13] ) )
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
{ {
@@ -1079,6 +1066,477 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // __AVX2__ #endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// BMW-512 8 WAY
#define s8b0(x) \
mm512_xor4( _mm512_srli_epi64( (x), 1), _mm512_slli_epi64( (x), 3), \
mm512_rol_64( (x), 4), mm512_rol_64( (x),37) )
#define s8b1(x) \
mm512_xor4( _mm512_srli_epi64( (x), 1), _mm512_slli_epi64( (x), 2), \
mm512_rol_64( (x),13), mm512_rol_64( (x),43) )
#define s8b2(x) \
mm512_xor4( _mm512_srli_epi64( (x), 2), _mm512_slli_epi64( (x), 1), \
mm512_rol_64( (x),19), mm512_rol_64( (x),53) )
#define s8b3(x) \
mm512_xor4( _mm512_srli_epi64( (x), 2), _mm512_slli_epi64( (x), 2), \
mm512_rol_64( (x),28), mm512_rol_64( (x),59) )
#define s8b4(x) \
_mm512_xor_si512( (x), _mm512_srli_epi64( (x), 1 ) )
#define s8b5(x) \
_mm512_xor_si512( (x), _mm512_srli_epi64( (x), 2 ) )
#define r8b1(x) mm512_rol_64( x, 5 )
#define r8b2(x) mm512_rol_64( x, 11 )
#define r8b3(x) mm512_rol_64( x, 27 )
#define r8b4(x) mm512_rol_64( x, 32 )
#define r8b5(x) mm512_rol_64( x, 37 )
#define r8b6(x) mm512_rol_64( x, 43 )
#define r8b7(x) mm512_rol_64( x, 53 )
#define rol8w_off_64( M, j, off ) \
mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_b8( M, H, j ) \
_mm512_xor_si512( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \
rol8w_off_64( M, j, 3 ) ), \
rol8w_off_64( M, j, 10 ) ), \
_mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 0xF ] )
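// Message expansion: expand1/expand2 produce qt[16..31] for all 8 lanes in parallel.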
#define expand1b8( qt, M, H, i ) \
_mm512_add_epi64( mm512_add4_64( \
mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \
s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \
mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \
s8b3( qt[ (i)-10 ] ), s8b0( qt[ (i)- 9 ] )), \
mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \
s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \
mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \
add_elt_b8( M, H, (i)-16 ) )
#define expand2b8( qt, M, H, i) \
_mm512_add_epi64( mm512_add4_64( \
mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \
mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], r8b4( qt[ (i)- 9 ] ) ), \
mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \
mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
add_elt_b8( M, H, (i)-16 ) )
#define W8b0 \
_mm512_add_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
_mm512_xor_si512( M[10], H[10] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
#define W8b1 \
_mm512_add_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
_mm512_xor_si512( M[11], H[11] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
#define W8b2 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
#define W8b3 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \
_mm512_xor_si512( M[13], H[13] ) ) )
#define W8b4 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
#define W8b5 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
_mm512_xor_si512( M[10], H[10] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
#define W8b6 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \
_mm512_xor_si512( M[13], H[13] ) ) )
#define W8b7 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
#define W8b8 \
_mm512_add_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
#define W8b9 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
_mm512_xor_si512( M[14], H[14] ) ) )
#define W8b10 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
_mm512_xor_si512( M[15], H[15] ) ) )
#define W8b11 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
_mm512_xor_si512( M[ 9], H[ 9] ) ) )
#define W8b12 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
_mm512_xor_si512( M[10], H[10] ) ) )
#define W8b13 \
_mm512_add_epi64( \
_mm512_add_epi64( \
_mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \
_mm512_xor_si512( M[11], H[11] ) ) )
#define W8b14 \
_mm512_sub_epi64( \
_mm512_add_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
_mm512_xor_si512( M[12], H[12] ) ) )
#define W8b15 \
_mm512_sub_epi64( \
_mm512_sub_epi64( \
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
_mm512_xor_si512( M[ 4], H[4] ) ), \
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
_mm512_xor_si512( M[13], H[13] ) ) )
void compress_big_8way( const __m512i *M, const __m512i H[16],
__m512i dH[16] )
{
__m512i qt[32], xl, xh;
qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] );
qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] );
qt[ 2] = _mm512_add_epi64( s8b2( W8b2 ), H[ 3] );
qt[ 3] = _mm512_add_epi64( s8b3( W8b3 ), H[ 4] );
qt[ 4] = _mm512_add_epi64( s8b4( W8b4 ), H[ 5] );
qt[ 5] = _mm512_add_epi64( s8b0( W8b5 ), H[ 6] );
qt[ 6] = _mm512_add_epi64( s8b1( W8b6 ), H[ 7] );
qt[ 7] = _mm512_add_epi64( s8b2( W8b7 ), H[ 8] );
qt[ 8] = _mm512_add_epi64( s8b3( W8b8 ), H[ 9] );
qt[ 9] = _mm512_add_epi64( s8b4( W8b9 ), H[10] );
qt[10] = _mm512_add_epi64( s8b0( W8b10), H[11] );
qt[11] = _mm512_add_epi64( s8b1( W8b11), H[12] );
qt[12] = _mm512_add_epi64( s8b2( W8b12), H[13] );
qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] );
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
qt[16] = expand1b8( qt, M, H, 16 );
qt[17] = expand1b8( qt, M, H, 17 );
qt[18] = expand2b8( qt, M, H, 18 );
qt[19] = expand2b8( qt, M, H, 19 );
qt[20] = expand2b8( qt, M, H, 20 );
qt[21] = expand2b8( qt, M, H, 21 );
qt[22] = expand2b8( qt, M, H, 22 );
qt[23] = expand2b8( qt, M, H, 23 );
qt[24] = expand2b8( qt, M, H, 24 );
qt[25] = expand2b8( qt, M, H, 25 );
qt[26] = expand2b8( qt, M, H, 26 );
qt[27] = expand2b8( qt, M, H, 27 );
qt[28] = expand2b8( qt, M, H, 28 );
qt[29] = expand2b8( qt, M, H, 29 );
qt[30] = expand2b8( qt, M, H, 30 );
qt[31] = expand2b8( qt, M, H, 31 );
xl = _mm512_xor_si512(
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
#define DH1( m, sl, sr, a, b, c ) \
_mm512_add_epi64( \
_mm512_xor_si512( M[m], \
_mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
_mm512_srli_epi64( qt[a], sr ) ) ), \
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
#define DHL( m, rl, sl, h, a, b, c ) \
_mm512_add_epi64( _mm512_add_epi64( \
mm512_rol_64( dH[h], rl ), \
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
_mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
_mm512_xor_si512( qt[b], qt[c] ) ) );
#define DHR( m, rl, sr, h, a, b, c ) \
_mm512_add_epi64( _mm512_add_epi64( \
mm512_rol_64( dH[h], rl ), \
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
_mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \
_mm512_xor_si512( qt[b], qt[c] ) ) );
dH[ 0] = DH1( 0, 5, 5, 16, 24, 0 );
dH[ 1] = DH1( 1, 7, 8, 17, 25, 1 );
dH[ 2] = DH1( 2, 5, 5, 18, 26, 2 );
dH[ 3] = DH1( 3, 1, 5, 19, 27, 3 );
dH[ 4] = DH1( 4, 3, 0, 20, 28, 4 );
dH[ 5] = DH1( 5, 6, 6, 21, 29, 5 );
dH[ 6] = DH1( 6, 4, 6, 22, 30, 6 );
dH[ 7] = DH1( 7, 11, 2, 23, 31, 7 );
dH[ 8] = DHL( 8, 9, 8, 4, 24, 23, 8 );
dH[ 9] = DHR( 9, 10, 6, 5, 25, 16, 9 );
dH[10] = DHL( 10, 11, 6, 6, 26, 17, 10 );
dH[11] = DHL( 11, 12, 4, 7, 27, 18, 11 );
dH[12] = DHR( 12, 13, 3, 0, 28, 19, 12 );
dH[13] = DHR( 13, 14, 4, 1, 29, 20, 13 );
dH[14] = DHR( 14, 15, 7, 2, 30, 21, 14 );
dH[15] = DHR( 15, 16, 2, 3, 31, 22, 15 );
#undef DH1
#undef DHL
#undef DHR
}
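// Final-compression constants: 0xaaaaaaaaaaaaaaa0 + i, replicated across all 8 lanes.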
static const __m512i final_b8[16] =
{
{ 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
{ 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
{ 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
{ 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
{ 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
{ 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
{ 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
{ 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 },
{ 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 },
{ 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 },
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
{ 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab },
{ 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac },
{ 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad },
{ 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae },
{ 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf }
};
void bmw512_8way_init( bmw512_8way_context *ctx )
//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
{
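// BMW-512 initial state (same constants as the scalar reference code), broadcast to every lane.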
ctx->H[ 0] = m512_const1_64( 0x8081828384858687 );
ctx->H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
ctx->H[ 2] = m512_const1_64( 0x9091929394959697 );
ctx->H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
ctx->H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
ctx->H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
ctx->H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
ctx->H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
ctx->H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
ctx->H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
ctx->H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
ctx->H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
ctx->H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
ctx->H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
ctx->H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
ctx->H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
ctx->ptr = 0;
ctx->bit_count = 0;
}
void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
__m512i htmp[16];
__m512i *h1, *h2;
size_t ptr;
const int buf_size = 128; // bytes of one lane, compatible with len
ctx->bit_count += len << 3;
buf = ctx->buf;
ptr = ctx->ptr;
h1 = ctx->H;
h2 = htmp;
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( buf + (ptr>>3), vdata, clen >> 3 );
vdata = vdata + (clen>>3);
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m512i *ht;
compress_big_8way( buf, h1, h2 );
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
ctx->ptr = ptr;
if ( h1 != ctx->H )
memcpy_512( ctx->H, h1, 16 );
}
void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
{
__m512i *buf;
__m512i h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 128; // bytes of one lane, compatible with len
buf = ctx->buf;
ptr = ctx->ptr;
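// Padding: a single 0x80 byte now, zero fill, then the 64-bit bit count in the last word.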
buf[ ptr>>3 ] = m512_const1_64( 0x80 );
ptr += 8;
h = ctx->H;
if ( ptr > (buf_size - 8) )
{
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
compress_big_8way( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_512( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
buf[ (buf_size - 8) >> 3 ] = _mm512_set1_epi64( ctx->bit_count );
compress_big_8way( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[ u ] = h2[ u ];
compress_big_8way( buf, final_b8, h1 );
for (u = 0, v = 8; u < 8; u ++, v ++)
casti_m512i( dst, u ) = h1[ v ];
}
#endif // AVX512
#ifdef __cplusplus
}
#endif


@@ -1,18 +1,68 @@
#include "keccak-gate.h" #include "keccak-gate.h"
#ifdef KECCAK_4WAY
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
#include "sph_keccak.h" #include "sph_keccak.h"
#include "keccak-hash-4way.h" #include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)
void keccakhash_8way(void *state, const void *input)
{
keccak256_8way_context ctx;
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, input, 80 );
keccak256_8way_close( &ctx, state );
}
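// Scan 8 nonces per iteration; interleaved 64-bit word 9 holds header words 18-19, including the nonce.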
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*8] __attribute__ ((aligned (128)));
uint32_t hash[16*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
keccakhash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(KECCAK_4WAY)
 void keccakhash_4way(void *state, const void *input)
 {
     keccak256_4way_context ctx;
     keccak256_4way_init( &ctx );
-    keccak256_4way( &ctx, input, 80 );
+    keccak256_4way_update( &ctx, input, 80 );
     keccak256_4way_close( &ctx, state );
 }
@@ -28,8 +78,8 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
-// const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;
   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do {
@@ -39,7 +89,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
      keccakhash_4way( hash, vdata );
      for ( int lane = 0; lane < 4; lane++ )
-     if ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 )
+     if ( hash7[ lane<<1 ] < Htarg )
      {
         extr_lane_4x64( lane_hash, hash, lane, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )


@@ -3,30 +3,36 @@
 bool register_keccak_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
   gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
   opt_target_factor = 128.0;
-#if defined (KECCAK_4WAY)
+#if defined (KECCAK_8WAY)
+  gate->scanhash = (void*)&scanhash_keccak_8way;
+  gate->hash = (void*)&keccakhash_8way;
+#elif defined (KECCAK_4WAY)
   gate->scanhash = (void*)&scanhash_keccak_4way;
   gate->hash = (void*)&keccakhash_4way;
 #else
   gate->scanhash = (void*)&scanhash_keccak;
   gate->hash = (void*)&keccakhash;
 #endif
   return true;
 };
 bool register_keccakc_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
   gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
   opt_target_factor = 256.0;
-#if defined (KECCAK_4WAY)
+#if defined (KECCAK_8WAY)
+  gate->scanhash = (void*)&scanhash_keccak_8way;
+  gate->hash = (void*)&keccakhash_8way;
+#elif defined (KECCAK_4WAY)
   gate->scanhash = (void*)&scanhash_keccak_4way;
   gate->hash = (void*)&keccakhash_4way;
 #else
   gate->scanhash = (void*)&scanhash_keccak;
   gate->hash = (void*)&keccakhash;
 #endif
   return true;
 };


@@ -1,23 +1,33 @@
 #ifndef KECCAK_GATE_H__
-#define KECCAK_GATE_H__
+#define KECCAK_GATE_H__ 1
 #include "algo-gate-api.h"
 #include <stdint.h>
-#if defined(__AVX2__)
-#define KECCAK_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#define KECCAK_8WAY 1
+#elif defined(__AVX2__)
+#define KECCAK_4WAY 1
 #endif
-#if defined(KECCAK_4WAY)
+#if defined(KECCAK_8WAY)
+void keccakhash_8way( void *state, const void *input );
+int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+#elif defined(KECCAK_4WAY)
 void keccakhash_4way( void *state, const void *input );
 int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
-#endif
+#else
 void keccakhash( void *state, const void *input );
 int scanhash_keccak( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
 #endif
#endif


@@ -1,23 +1,24 @@
 #include <stddef.h>
+#include <stdint.h>
 #include "keccak-hash-4way.h"
-#if defined(__AVX2__)
-static const sph_u64 RC[] = {
-   SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
-   SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
-   SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
-   SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
-   SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
-   SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
-   SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
-   SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
-   SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
-   SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
-   SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
-   SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
+static const uint64_t RC[] = {
+   0x0000000000000001, 0x0000000000008082,
+   0x800000000000808A, 0x8000000080008000,
+   0x000000000000808B, 0x0000000080000001,
+   0x8000000080008081, 0x8000000000008009,
+   0x000000000000008A, 0x0000000000000088,
+   0x0000000080008009, 0x000000008000000A,
+   0x000000008000808B, 0x800000000000008B,
+   0x8000000000008089, 0x8000000000008003,
+   0x8000000000008002, 0x8000000000000080,
+   0x000000000000800A, 0x800000008000000A,
+   0x8000000080008081, 0x8000000000008080,
+   0x0000000080000001, 0x8000000080008008
 };
// generic macros
#define a00 (kc->w[ 0]) #define a00 (kc->w[ 0])
#define a10 (kc->w[ 1]) #define a10 (kc->w[ 1])
#define a20 (kc->w[ 2]) #define a20 (kc->w[ 2])
@@ -48,6 +49,197 @@ static const sph_u64 RC[] = {
#define READ_STATE(sc) #define READ_STATE(sc)
#define WRITE_STATE(sc) #define WRITE_STATE(sc)
#define MOV64(d, s) (d = s)
#define XOR64_IOTA XOR64
#define LPAR (
#define RPAR )
#define DO(x) x
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define INPUT_BUF(size) do { \
size_t j; \
for (j = 0; j < (size>>3); j++ ) \
kc->w[j ] = _mm512_xor_si512( kc->w[j], buf[j] ); \
} while (0)
// Targeted macros: keccak-macros.c is included once for each vector width.
#define DECL64(x) __m512i x
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#include "keccak-macros.c"
#define KECCAK_F_1600 DO(KECCAK_F_1600_512)
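// 24 rounds, 8 per loop iteration; P8_TO_P0 restores the canonical word order after each group.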
#define KECCAK_F_1600_512 do { \
int j; \
for (j = 0; j < 24; j += 8) \
{ \
KF_ELT( 0, 1, _mm512_set1_epi64( RC[j + 0] ) ); \
KF_ELT( 1, 2, _mm512_set1_epi64( RC[j + 1] ) ); \
KF_ELT( 2, 3, _mm512_set1_epi64( RC[j + 2] ) ); \
KF_ELT( 3, 4, _mm512_set1_epi64( RC[j + 3] ) ); \
KF_ELT( 4, 5, _mm512_set1_epi64( RC[j + 4] ) ); \
KF_ELT( 5, 6, _mm512_set1_epi64( RC[j + 5] ) ); \
KF_ELT( 6, 7, _mm512_set1_epi64( RC[j + 6] ) ); \
KF_ELT( 7, 8, _mm512_set1_epi64( RC[j + 7] ) ); \
P8_TO_P0; \
} \
} while (0)
static void keccak64_8way_init( keccak64_ctx_m512i *kc, unsigned out_size )
{
__m512i zero = m512_zero;
__m512i neg1 = m512_neg1;
// Initialization for the "lane complement".
kc->w[ 0] = zero; kc->w[ 1] = neg1;
kc->w[ 2] = neg1; kc->w[ 3] = zero;
kc->w[ 4] = zero; kc->w[ 5] = zero;
kc->w[ 6] = zero; kc->w[ 7] = zero;
kc->w[ 8] = neg1; kc->w[ 9] = zero;
kc->w[10] = zero; kc->w[11] = zero;
kc->w[12] = neg1; kc->w[13] = zero;
kc->w[14] = zero; kc->w[15] = zero;
kc->w[16] = zero; kc->w[17] = neg1;
kc->w[18] = zero; kc->w[19] = zero;
kc->w[20] = neg1; kc->w[21] = zero;
kc->w[22] = zero; kc->w[23] = zero;
kc->w[24] = zero; kc->ptr = 0;
kc->lim = 200 - (out_size >> 2);
}
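// Absorb phase: buffer input, XOR it into the state, and run Keccak-f[1600] whenever 'lim' (rate) bytes are full.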
static void
keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
size_t lim )
{
__m512i *buf;
__m512i *vdata = (__m512i*)data;
size_t ptr;
DECL_STATE
buf = kc->buf;
ptr = kc->ptr;
if ( len < (lim - ptr) )
{
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
kc->ptr = ptr + len;
return;
}
READ_STATE( kc );
while ( len > 0 )
{
size_t clen;
clen = (lim - ptr);
if ( clen > len )
clen = len;
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == lim )
{
INPUT_BUF( lim );
KECCAK_F_1600;
ptr = 0;
}
}
WRITE_STATE( kc );
kc->ptr = ptr;
}
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
size_t byte_len, size_t lim )
{
unsigned eb;
union {
__m512i tmp[lim + 1];
sph_u64 dummy; /* for alignment */
} u;
size_t j;
size_t m512_len = byte_len >> 3;
eb = 0x100 >> 8;
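// eb = 0x01: the original Keccak padding byte (not the 0x06 domain byte used by FIPS-202 SHA-3).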
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = m512_const1_64( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = m512_const1_64( eb );
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = m512_const1_64( 0x8000000000000000 );
}
keccak64_8way_core( kc, u.tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
NOT64( kc->w[ 8], kc->w[ 8] );
NOT64( kc->w[12], kc->w[12] );
NOT64( kc->w[17], kc->w[17] );
NOT64( kc->w[20], kc->w[20] );
memcpy_512( dst, kc->w, m512_len );
}
void keccak256_8way_init( void *kc )
{
keccak64_8way_init( kc, 256 );
}
void
keccak256_8way_update(void *cc, const void *data, size_t len)
{
keccak64_8way_core(cc, data, len, 136);
}
void
keccak256_8way_close(void *cc, void *dst)
{
keccak64_8way_close(cc, dst, 32, 136);
}
void keccak512_8way_init( void *kc )
{
keccak64_8way_init( kc, 512 );
}
void
keccak512_8way_update(void *cc, const void *data, size_t len)
{
keccak64_8way_core(cc, data, len, 72);
}
void
keccak512_8way_close(void *cc, void *dst)
{
keccak64_8way_close(cc, dst, 64, 72);
}
#undef INPUT_BUF
#undef DECL64
#undef XOR64
#undef AND64
#undef OR64
#undef NOT64
#undef ROL64
#undef KECCAK_F_1600
#endif // AVX512
#if defined(__AVX2__)
#define INPUT_BUF(size) do { \ #define INPUT_BUF(size) do { \
size_t j; \ size_t j; \
for (j = 0; j < (size>>3); j++ ) \ for (j = 0; j < (size>>3); j++ ) \
@@ -55,314 +247,28 @@ static const sph_u64 RC[] = {
} while (0) } while (0)
#define DECL64(x) __m256i x #define DECL64(x) __m256i x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b)) #define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b)) #define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b)) #define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) #define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rol_64(v, n)) #define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOR64_IOTA XOR64
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ #include "keccak-macros.c"
DECL64(tt0); \
DECL64(tt1); \
DECL64(tt2); \
DECL64(tt3); \
XOR64(tt0, d0, d1); \
XOR64(tt1, d2, d3); \
XOR64(tt0, tt0, d4); \
XOR64(tt0, tt0, tt1); \
ROL64(tt0, tt0, 1); \
XOR64(tt2, c0, c1); \
XOR64(tt3, c2, c3); \
XOR64(tt0, tt0, c4); \
XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \
} while (0)
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ #define KECCAK_F_1600 DO(KECCAK_F_1600_256)
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(t0); \
DECL64(t1); \
DECL64(t2); \
DECL64(t3); \
DECL64(t4); \
TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
XOR64(b00, b00, t0); \
XOR64(b01, b01, t0); \
XOR64(b02, b02, t0); \
XOR64(b03, b03, t0); \
XOR64(b04, b04, t0); \
XOR64(b10, b10, t1); \
XOR64(b11, b11, t1); \
XOR64(b12, b12, t1); \
XOR64(b13, b13, t1); \
XOR64(b14, b14, t1); \
XOR64(b20, b20, t2); \
XOR64(b21, b21, t2); \
XOR64(b22, b22, t2); \
XOR64(b23, b23, t2); \
XOR64(b24, b24, t2); \
XOR64(b30, b30, t3); \
XOR64(b31, b31, t3); \
XOR64(b32, b32, t3); \
XOR64(b33, b33, t3); \
XOR64(b34, b34, t3); \
XOR64(b40, b40, t4); \
XOR64(b41, b41, t4); \
XOR64(b42, b42, t4); \
XOR64(b43, b43, t4); \
XOR64(b44, b44, t4); \
} while (0)
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ #define KECCAK_F_1600_256 do { \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
/* ROL64(b00, b00, 0); */ \
ROL64(b01, b01, 36); \
ROL64(b02, b02, 3); \
ROL64(b03, b03, 41); \
ROL64(b04, b04, 18); \
ROL64(b10, b10, 1); \
ROL64(b11, b11, 44); \
ROL64(b12, b12, 10); \
ROL64(b13, b13, 45); \
ROL64(b14, b14, 2); \
ROL64(b20, b20, 62); \
ROL64(b21, b21, 6); \
ROL64(b22, b22, 43); \
ROL64(b23, b23, 15); \
ROL64(b24, b24, 61); \
ROL64(b30, b30, 28); \
ROL64(b31, b31, 55); \
ROL64(b32, b32, 25); \
ROL64(b33, b33, 21); \
ROL64(b34, b34, 56); \
ROL64(b40, b40, 27); \
ROL64(b41, b41, 20); \
ROL64(b42, b42, 39); \
ROL64(b43, b43, 8); \
ROL64(b44, b44, 14); \
} while (0)
/*
* The KHI macro integrates the "lane complement" optimization. On input,
* some words are complemented:
* a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
* On output, the following words are complemented:
* a04 a10 a20 a22 a23 a31
*
* The (implicit) permutation and the theta expansion will bring back
* the input mask for the next round.
*/
#define KHI_XO(d, a, b, c) do { \
DECL64(kt); \
OR64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI_XA(d, a, b, c) do { \
DECL64(kt); \
AND64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(c0); \
DECL64(c1); \
DECL64(c2); \
DECL64(c3); \
DECL64(c4); \
DECL64(bnn); \
NOT64(bnn, b20); \
KHI_XO(c0, b00, b10, b20); \
KHI_XO(c1, b10, bnn, b30); \
KHI_XA(c2, b20, b30, b40); \
KHI_XO(c3, b30, b40, b00); \
KHI_XA(c4, b40, b00, b10); \
MOV64(b00, c0); \
MOV64(b10, c1); \
MOV64(b20, c2); \
MOV64(b30, c3); \
MOV64(b40, c4); \
NOT64(bnn, b41); \
KHI_XO(c0, b01, b11, b21); \
KHI_XA(c1, b11, b21, b31); \
KHI_XO(c2, b21, b31, bnn); \
KHI_XO(c3, b31, b41, b01); \
KHI_XA(c4, b41, b01, b11); \
MOV64(b01, c0); \
MOV64(b11, c1); \
MOV64(b21, c2); \
MOV64(b31, c3); \
MOV64(b41, c4); \
NOT64(bnn, b32); \
KHI_XO(c0, b02, b12, b22); \
KHI_XA(c1, b12, b22, b32); \
KHI_XA(c2, b22, bnn, b42); \
KHI_XO(c3, bnn, b42, b02); \
KHI_XA(c4, b42, b02, b12); \
MOV64(b02, c0); \
MOV64(b12, c1); \
MOV64(b22, c2); \
MOV64(b32, c3); \
MOV64(b42, c4); \
NOT64(bnn, b33); \
KHI_XA(c0, b03, b13, b23); \
KHI_XO(c1, b13, b23, b33); \
KHI_XO(c2, b23, bnn, b43); \
KHI_XA(c3, bnn, b43, b03); \
KHI_XO(c4, b43, b03, b13); \
MOV64(b03, c0); \
MOV64(b13, c1); \
MOV64(b23, c2); \
MOV64(b33, c3); \
MOV64(b43, c4); \
NOT64(bnn, b14); \
KHI_XA(c0, b04, bnn, b24); \
KHI_XO(c1, bnn, b24, b34); \
KHI_XA(c2, b24, b34, b44); \
KHI_XO(c3, b34, b44, b04); \
KHI_XA(c4, b44, b04, b14); \
MOV64(b04, c0); \
MOV64(b14, c1); \
MOV64(b24, c2); \
MOV64(b34, c3); \
MOV64(b44, c4); \
} while (0)
#define IOTA(r) XOR64_IOTA(a00, a00, r)
#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
#define P8_TO_P0 do { \
DECL64(t); \
MOV64(t, a01); \
MOV64(a01, a11); \
MOV64(a11, a43); \
MOV64(a43, t); \
MOV64(t, a02); \
MOV64(a02, a22); \
MOV64(a22, a31); \
MOV64(a31, t); \
MOV64(t, a03); \
MOV64(a03, a33); \
MOV64(a33, a24); \
MOV64(a24, t); \
MOV64(t, a04); \
MOV64(a04, a44); \
MOV64(a44, a12); \
MOV64(a12, t); \
MOV64(t, a10); \
MOV64(a10, a32); \
MOV64(a32, a13); \
MOV64(a13, t); \
MOV64(t, a14); \
MOV64(a14, a21); \
MOV64(a21, a20); \
MOV64(a20, t); \
MOV64(t, a23); \
MOV64(a23, a42); \
MOV64(a42, a40); \
MOV64(a40, t); \
MOV64(t, a30); \
MOV64(a30, a41); \
MOV64(a41, a34); \
MOV64(a34, t); \
} while (0)
#define LPAR (
#define RPAR )
#define KF_ELT(r, s, k) do { \
THETA LPAR P ## r RPAR; \
RHO LPAR P ## r RPAR; \
KHI LPAR P ## s RPAR; \
IOTA(k); \
} while (0)
#define DO(x) x
#define KECCAK_F_1600 DO(KECCAK_F_1600_)
#define KECCAK_F_1600_ do { \
int j; \ int j; \
for (j = 0; j < 24; j += 8) \ for (j = 0; j < 24; j += 8) \
{ \ { \
KF_ELT( 0, 1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], RC[j + 0], RC[j + 0])) ); \ KF_ELT( 0, 1, _mm256_set1_epi64x( RC[j + 0] ) ); \
KF_ELT( 1, 2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], RC[j + 1], RC[j + 1])) ); \ KF_ELT( 1, 2, _mm256_set1_epi64x( RC[j + 1] ) ); \
KF_ELT( 2, 3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], RC[j + 2], RC[j + 2])) ); \ KF_ELT( 2, 3, _mm256_set1_epi64x( RC[j + 2] ) ); \
KF_ELT( 3, 4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], RC[j + 3], RC[j + 3])) ); \ KF_ELT( 3, 4, _mm256_set1_epi64x( RC[j + 3] ) ); \
KF_ELT( 4, 5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], RC[j + 4], RC[j + 4])) ); \ KF_ELT( 4, 5, _mm256_set1_epi64x( RC[j + 4] ) ); \
KF_ELT( 5, 6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], RC[j + 5], RC[j + 5])) ); \ KF_ELT( 5, 6, _mm256_set1_epi64x( RC[j + 5] ) ); \
KF_ELT( 6, 7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], RC[j + 6], RC[j + 6])) ); \ KF_ELT( 6, 7, _mm256_set1_epi64x( RC[j + 6] ) ); \
KF_ELT( 7, 8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], RC[j + 7], RC[j + 7])) ); \ KF_ELT( 7, 8, _mm256_set1_epi64x( RC[j + 7] ) ); \
P8_TO_P0; \ P8_TO_P0; \
} \ } \
} while (0) } while (0)
@@ -453,7 +359,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
else else
{ {
j = lim - kc->ptr; j = lim - kc->ptr;
u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb ); u.tmp[0] = m256_const1_64( eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 ); memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 ); u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 );
} }
@@ -474,7 +380,7 @@ void keccak256_4way_init( void *kc )
} }
void void
keccak256_4way(void *cc, const void *data, size_t len) keccak256_4way_update(void *cc, const void *data, size_t len)
{ {
keccak64_core(cc, data, len, 136); keccak64_core(cc, data, len, 136);
} }
@@ -491,15 +397,24 @@ void keccak512_4way_init( void *kc )
} }
void void
keccak512_4way(void *cc, const void *data, size_t len) keccak512_4way_update(void *cc, const void *data, size_t len)
{ {
keccak64_core(cc, data, len, 72); keccak64_core(cc, data, len, 72);
} }
void void
keccak512_4way_close(void *cc, void *dst) keccak512_4way_close(void *cc, void *dst)
{ {
keccak64_close(cc, dst, 64, 72); keccak64_close(cc, dst, 64, 72);
} }
#endif #undef INPUT_BUF
#undef DECL64
#undef XOR64
#undef AND64
#undef OR64
#undef NOT64
#undef ROL64
#undef KECCAK_F_1600
#endif // AVX2

View File

@@ -64,26 +64,49 @@ extern "C"{
* <code>memcpy()</code>). * <code>memcpy()</code>).
*/ */
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct { typedef struct {
__m256i buf[144*8]; /* first field, for alignment */ __m512i buf[144*8];
__m512i w[25];
size_t ptr, lim;
} keccak64_ctx_m512i __attribute__((aligned(128)));
typedef keccak64_ctx_m512i keccak256_8way_context;
typedef keccak64_ctx_m512i keccak512_8way_context;
void keccak256_8way_init(void *cc);
void keccak256_8way_update(void *cc, const void *data, size_t len);
void keccak256_8way_close(void *cc, void *dst);
void keccak512_8way_init(void *cc);
void keccak512_8way_update(void *cc, const void *data, size_t len);
void keccak512_8way_close(void *cc, void *dst);
void keccak512_8way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
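/* Usage sketch only, not part of this commit: hashing 80 bytes per lane with
 * the 8-way interface declared above. Assumes the caller has already
 * interleaved eight messages 64-bit-wise (8x64) into data8 and wants the
 * eight 64-byte digests back in the same interleaved layout; the len
 * argument is bytes per lane, as with the 4-way functions below.
 */
#if 0
static void keccak512_8way_example( void *hash8, const void *data8 )
{
   keccak512_8way_context ctx;
   keccak512_8way_init( &ctx );
   keccak512_8way_update( &ctx, data8, 80 );   // 80 bytes per lane
   keccak512_8way_close( &ctx, hash8 );
}
#endif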
typedef struct {
__m256i buf[144*8];
__m256i w[25]; __m256i w[25];
size_t ptr, lim; size_t ptr, lim;
// sph_u64 wide[25]; } keccak64_ctx_m256i __attribute__((aligned(128)));
} keccak64_ctx_m256i;
typedef keccak64_ctx_m256i keccak256_4way_context; typedef keccak64_ctx_m256i keccak256_4way_context;
typedef keccak64_ctx_m256i keccak512_4way_context; typedef keccak64_ctx_m256i keccak512_4way_context;
void keccak256_4way_init(void *cc); void keccak256_4way_init(void *cc);
void keccak256_4way(void *cc, const void *data, size_t len); void keccak256_4way_update(void *cc, const void *data, size_t len);
void keccak256_4way_close(void *cc, void *dst); void keccak256_4way_close(void *cc, void *dst);
#define keccak256_4way keccak256_4way_update
void keccak512_4way_init(void *cc); void keccak512_4way_init(void *cc);
void keccak512_4way(void *cc, const void *data, size_t len); void keccak512_4way_update(void *cc, const void *data, size_t len);
void keccak512_4way_close(void *cc, void *dst); void keccak512_4way_close(void *cc, void *dst);
void keccak512_4way_addbits_and_close( void keccak512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst); void *cc, unsigned ub, unsigned n, void *dst);
#define keccak512_4way keccak512_4way_update
#endif #endif

324
algo/keccak/keccak-macros.c Normal file
View File

@@ -0,0 +1,324 @@
#ifdef TH_ELT
#undef TH_ELT
#endif
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
DECL64(tt2); \
DECL64(tt3); \
XOR64(tt0, d0, d1); \
XOR64(tt1, d2, d3); \
XOR64(tt0, tt0, d4); \
XOR64(tt0, tt0, tt1); \
ROL64(tt0, tt0, 1); \
XOR64(tt2, c0, c1); \
XOR64(tt3, c2, c3); \
XOR64(tt0, tt0, c4); \
XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \
} while (0)
#ifdef THETA
#undef THETA
#endif
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(t0); \
DECL64(t1); \
DECL64(t2); \
DECL64(t3); \
DECL64(t4); \
TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
XOR64(b00, b00, t0); \
XOR64(b01, b01, t0); \
XOR64(b02, b02, t0); \
XOR64(b03, b03, t0); \
XOR64(b04, b04, t0); \
XOR64(b10, b10, t1); \
XOR64(b11, b11, t1); \
XOR64(b12, b12, t1); \
XOR64(b13, b13, t1); \
XOR64(b14, b14, t1); \
XOR64(b20, b20, t2); \
XOR64(b21, b21, t2); \
XOR64(b22, b22, t2); \
XOR64(b23, b23, t2); \
XOR64(b24, b24, t2); \
XOR64(b30, b30, t3); \
XOR64(b31, b31, t3); \
XOR64(b32, b32, t3); \
XOR64(b33, b33, t3); \
XOR64(b34, b34, t3); \
XOR64(b40, b40, t4); \
XOR64(b41, b41, t4); \
XOR64(b42, b42, t4); \
XOR64(b43, b43, t4); \
XOR64(b44, b44, t4); \
} while (0)
#ifdef RHO
#undef RHO
#endif
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
/* ROL64(b00, b00, 0); */ \
ROL64(b01, b01, 36); \
ROL64(b02, b02, 3); \
ROL64(b03, b03, 41); \
ROL64(b04, b04, 18); \
ROL64(b10, b10, 1); \
ROL64(b11, b11, 44); \
ROL64(b12, b12, 10); \
ROL64(b13, b13, 45); \
ROL64(b14, b14, 2); \
ROL64(b20, b20, 62); \
ROL64(b21, b21, 6); \
ROL64(b22, b22, 43); \
ROL64(b23, b23, 15); \
ROL64(b24, b24, 61); \
ROL64(b30, b30, 28); \
ROL64(b31, b31, 55); \
ROL64(b32, b32, 25); \
ROL64(b33, b33, 21); \
ROL64(b34, b34, 56); \
ROL64(b40, b40, 27); \
ROL64(b41, b41, 20); \
ROL64(b42, b42, 39); \
ROL64(b43, b43, 8); \
ROL64(b44, b44, 14); \
} while (0)
/*
* The KHI macro integrates the "lane complement" optimization. On input,
* some words are complemented:
* a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
* On output, the following words are complemented:
* a04 a10 a20 a22 a23 a31
*
* The (implicit) permutation and the theta expansion will bring back
* the input mask for the next round.
*/
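/* Illustration only, not part of the build: for 64-bit lanes a, b, c the
 * chi step  a ^ (~b & c)  equals  ~( a ^ (b | ~c) ).  When the lane that
 * would need the NOT is already stored complemented, the step therefore
 * collapses to a single OR or AND plus an XOR:
 *
 *    d = a ^ ( b | c );   // KHI_XO, complement folded into b or c
 *    d = a ^ ( b & c );   // KHI_XA
 *
 * which is why only five explicit NOT64 operations remain per KHI call.
 */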
#ifdef KHI_XO
#undef KHI_XO
#endif
#define KHI_XO(d, a, b, c) do { \
DECL64(kt); \
OR64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#ifdef KHI_XA
#undef KHI_XA
#endif
#define KHI_XA(d, a, b, c) do { \
DECL64(kt); \
AND64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#ifdef KHI
#undef KHI
#endif
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(c0); \
DECL64(c1); \
DECL64(c2); \
DECL64(c3); \
DECL64(c4); \
DECL64(bnn); \
NOT64(bnn, b20); \
KHI_XO(c0, b00, b10, b20); \
KHI_XO(c1, b10, bnn, b30); \
KHI_XA(c2, b20, b30, b40); \
KHI_XO(c3, b30, b40, b00); \
KHI_XA(c4, b40, b00, b10); \
MOV64(b00, c0); \
MOV64(b10, c1); \
MOV64(b20, c2); \
MOV64(b30, c3); \
MOV64(b40, c4); \
NOT64(bnn, b41); \
KHI_XO(c0, b01, b11, b21); \
KHI_XA(c1, b11, b21, b31); \
KHI_XO(c2, b21, b31, bnn); \
KHI_XO(c3, b31, b41, b01); \
KHI_XA(c4, b41, b01, b11); \
MOV64(b01, c0); \
MOV64(b11, c1); \
MOV64(b21, c2); \
MOV64(b31, c3); \
MOV64(b41, c4); \
NOT64(bnn, b32); \
KHI_XO(c0, b02, b12, b22); \
KHI_XA(c1, b12, b22, b32); \
KHI_XA(c2, b22, bnn, b42); \
KHI_XO(c3, bnn, b42, b02); \
KHI_XA(c4, b42, b02, b12); \
MOV64(b02, c0); \
MOV64(b12, c1); \
MOV64(b22, c2); \
MOV64(b32, c3); \
MOV64(b42, c4); \
NOT64(bnn, b33); \
KHI_XA(c0, b03, b13, b23); \
KHI_XO(c1, b13, b23, b33); \
KHI_XO(c2, b23, bnn, b43); \
KHI_XA(c3, bnn, b43, b03); \
KHI_XO(c4, b43, b03, b13); \
MOV64(b03, c0); \
MOV64(b13, c1); \
MOV64(b23, c2); \
MOV64(b33, c3); \
MOV64(b43, c4); \
NOT64(bnn, b14); \
KHI_XA(c0, b04, bnn, b24); \
KHI_XO(c1, bnn, b24, b34); \
KHI_XA(c2, b24, b34, b44); \
KHI_XO(c3, b34, b44, b04); \
KHI_XA(c4, b44, b04, b14); \
MOV64(b04, c0); \
MOV64(b14, c1); \
MOV64(b24, c2); \
MOV64(b34, c3); \
MOV64(b44, c4); \
} while (0)
#ifdef IOTA
#undef IOTA
#endif
#define IOTA(r) XOR64_IOTA(a00, a00, r)
#ifdef P0
#undef P1
#undef P2
#undef P3
#undef P4
#undef P5
#undef P6
#undef P7
#undef P8
#undef P9
#undef P10
#undef P11
#undef P12
#undef P13
#undef P14
#undef P15
#undef P16
#undef P17
#undef P18
#undef P19
#undef P20
#undef P21
#undef P22
#undef P23
#endif
#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
#ifdef P8_TO_P0
#undef P8_TO_P0
#endif
#define P8_TO_P0 do { \
DECL64(t); \
MOV64(t, a01); \
MOV64(a01, a11); \
MOV64(a11, a43); \
MOV64(a43, t); \
MOV64(t, a02); \
MOV64(a02, a22); \
MOV64(a22, a31); \
MOV64(a31, t); \
MOV64(t, a03); \
MOV64(a03, a33); \
MOV64(a33, a24); \
MOV64(a24, t); \
MOV64(t, a04); \
MOV64(a04, a44); \
MOV64(a44, a12); \
MOV64(a12, t); \
MOV64(t, a10); \
MOV64(a10, a32); \
MOV64(a32, a13); \
MOV64(a13, t); \
MOV64(t, a14); \
MOV64(a14, a21); \
MOV64(a21, a20); \
MOV64(a20, t); \
MOV64(t, a23); \
MOV64(a23, a42); \
MOV64(a42, a40); \
MOV64(a40, t); \
MOV64(t, a30); \
MOV64(a30, a41); \
MOV64(a41, a34); \
MOV64(a34, t); \
} while (0)
#define KF_ELT(r, s, k) do { \
THETA LPAR P ## r RPAR; \
RHO LPAR P ## r RPAR; \
KHI LPAR P ## s RPAR; \
IOTA(k); \
} while (0)

View File

@@ -5,7 +5,6 @@
#include "algo/bmw/bmw-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h" #include "algo/cubehash/cubehash_sse2.h"
#if defined (LYRA2REV3_8WAY) #if defined (LYRA2REV3_8WAY)
typedef struct { typedef struct {
@@ -14,7 +13,7 @@ typedef struct {
bmw256_8way_context bmw; bmw256_8way_context bmw;
} lyra2v3_8way_ctx_holder; } lyra2v3_8way_ctx_holder;
static lyra2v3_8way_ctx_holder l2v3_8way_ctx; static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;
bool init_lyra2rev3_8way_ctx() bool init_lyra2rev3_8way_ctx()
{ {
@@ -38,7 +37,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64))); lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) ); memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
blake256_8way( &ctx.blake, input, 80 ); blake256_8way( &ctx.blake, input + (64*8), 16 );
blake256_8way_close( &ctx.blake, vhash ); blake256_8way_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3, dintrlv_8x32( hash0, hash1, hash2, hash3,
@@ -91,7 +90,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
{ {
uint32_t hash[8*8] __attribute__ ((aligned (64))); uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64))); uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]); uint32_t *hash7 = &hash[7<<3];
uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target; const uint32_t *ptarget = work->target;
@@ -99,12 +98,15 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
uint32_t n = first_nonce; uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned __m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id; // thr_id arg is deprecated const int thr_id = mythr->id;
if ( opt_benchmark ) if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata ); mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8way( &l2v3_8way_ctx.blake, vdata, 64 );
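// The first 64 bytes of the interleaved block header are identical for every
// nonce scanned, so they are absorbed into the shared Blake-256 context once
// here; lyra2rev3_8way_hash() then only processes the final 16 bytes per nonce.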
do do
{ {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
@@ -119,8 +121,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
extr_lane_8x32( lane_hash, hash, lane, 256 ); extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{ {
pdata[19] = n + lane; pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane ); submit_lane_solution( work, lane_hash, mythr, lane );
} }
} }
n += 8; n += 8;
@@ -133,14 +135,14 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
#if defined (LYRA2REV3_4WAY) #if defined (LYRA2REV3_4WAY)
typedef struct { typedef struct {
blake256_4way_context blake; blake256_4way_context blake;
cubehashParam cube; cubehashParam cube;
bmw256_4way_context bmw; bmw256_4way_context bmw;
} lyra2v3_4way_ctx_holder; } lyra2v3_4way_ctx_holder;
static lyra2v3_4way_ctx_holder l2v3_4way_ctx; //static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;
bool init_lyra2rev3_4way_ctx() bool init_lyra2rev3_4way_ctx()
{ {
@@ -160,7 +162,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64))); lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) ); memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
blake256_4way( &ctx.blake, input, 80 ); // blake256_4way( &ctx.blake, input, 80 );
blake256_4way( &ctx.blake, input + (64*4), 16 );
blake256_4way_close( &ctx.blake, vhash ); blake256_4way_close( &ctx.blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -206,6 +209,10 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
( (uint32_t*)ptarget )[7] = 0x0000ff; ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata ); mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4way( &l2v3_4way_ctx.blake, vdata, 64 );
do do
{ {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );

View File

@@ -1,538 +0,0 @@
#if 0
#include <stddef.h>
#include <string.h>
#include "sha2-hash-4way.h"
#if defined(__AVX2__)
// naming convention for variables and macros
// VARx: AVX2 8 way 32 bit
// VARy: MMX 2 way 32 bit
// VARz: scalar integer 32 bit
static const uint32_t H256[8] =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
static const uint32_t K256[64] =
{
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#define CHx(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define CHy(X, Y, Z) \
_mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z )
#define CHz(X, Y, Z) ((( (Y) ^ (Z) ) & (X) ) ^ (Z) )
#define MAJx(X, Y, Z) \
_mm256_or_si256( _mm256_and_si256( X, Y ), \
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
#define MAJy(X, Y, Z) \
_mm_or_si64( _mm_and_si64( X, Y ), \
_mm_and_si64( _mm_or_si64( X, Y ), Z ) )
#define MAJz(X, Y, Z) ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) )
#define BSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,2), mm256_ror_32(x,13) ), _mm256_srli_epi32(x,22) )
#define BSG2_0y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) )
#define BSG2_0z(x) ( u32_ror_32(x,2) ^ u32_ror_32(x,13) ^ ((x)>>22) )
#define BSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,6), mm256_ror_32(x,11) ), _mm256_srli_epi32(x,25) )
#define BSG2_1y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) )
#define BSG2_1z(x) ( u32_ror_32(x,6) ^ u32_ror_32(x,11) ^ ((x)>>25) )
#define SSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) )
#define SSG2_0y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )
#define SSG2_0z(x) (( u32_ror_32(x,7) ^ u32_ror_32(x,18) ) ^ ((x)>>3) )
#define SSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) )
#define SSG2_1y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )
#define SSG2_1z(x) ( u32_ror_32(x,17) ^ u32_ror_32(x,19) ^ ((x)>>10) )
#define SHA2x_MEXP( a, b, c, d ) \
_mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] )
#define SHA2y_MEXP( a, b, c, d ) \
_mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] )
#define SHA2z_MEXP( a, b, c, d ) \
( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] )
#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
do { \
__m256i T1x, T2x; \
__m64 T1y, T2y; \
uint32_t T1z, T2z; \
T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
_mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
_mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \
T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
_mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
_mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \
T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \
T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
Dx = _mm256_add_epi32( Dx, T1x ); \
Dy = _mm_add_pi32( Dy, T1y ); \
Dz = Dz + T1z; \
Hx = _mm256_add_epi32( T1x, T2x ); \
Hy = _mm_add_pi32( T1y, T2y ); \
Hz = T1z + T2z; \
} while (0)
void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
uint32_t *inz, uint32_t rz[8] )
{
__m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
__m256i Wx[16];
__m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy;
__m64 Wy[16];
uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz;
uint32_t Wz[16];
Wx[ 0] = mm256_bswap_32( inx[ 0] );
Wy[ 0] = mm64_bswap_32( iny[ 0] );
Wz[ 0] = bswap_32( inz[ 0] );
Wx[ 1] = mm256_bswap_32( inx[ 1] );
Wy[ 1] = mm64_bswap_32( iny[ 1] );
Wz[ 1] = bswap_32( inz[ 1] );
Wx[ 2] = mm256_bswap_32( inx[ 2] );
Wy[ 2] = mm64_bswap_32( iny[ 2] );
Wz[ 2] = bswap_32( inz[ 2] );
Wx[ 3] = mm256_bswap_32( inx[ 3] );
Wy[ 3] = mm64_bswap_32( iny[ 3] );
Wz[ 3] = bswap_32( inz[ 3] );
Wx[ 4] = mm256_bswap_32( inx[ 4] );
Wy[ 4] = mm64_bswap_32( iny[ 4] );
Wz[ 4] = bswap_32( inz[ 4] );
Wx[ 5] = mm256_bswap_32( inx[ 5] );
Wy[ 5] = mm64_bswap_32( iny[ 5] );
Wz[ 5] = bswap_32( inz[ 5] );
Wx[ 6] = mm256_bswap_32( inx[ 6] );
Wy[ 6] = mm64_bswap_32( iny[ 6] );
Wz[ 6] = bswap_32( inz[ 6] );
Wx[ 7] = mm256_bswap_32( inx[ 7] );
Wy[ 7] = mm64_bswap_32( iny[ 7] );
Wz[ 7] = bswap_32( inz[ 7] );
Wx[ 8] = mm256_bswap_32( inx[ 8] );
Wy[ 8] = mm64_bswap_32( iny[ 8] );
Wz[ 8] = bswap_32( inz[ 8] );
Wx[ 9] = mm256_bswap_32( inx[ 9] );
Wy[ 9] = mm64_bswap_32( iny[ 9] );
Wz[ 9] = bswap_32( inz[ 9] );
Wx[10] = mm256_bswap_32( inx[10] );
Wy[10] = mm64_bswap_32( iny[10] );
Wz[10] = bswap_32( inz[10] );
Wx[11] = mm256_bswap_32( inx[11] );
Wy[11] = mm64_bswap_32( iny[11] );
Wz[11] = bswap_32( inz[11] );
Wx[12] = mm256_bswap_32( inx[12] );
Wy[12] = mm64_bswap_32( iny[12] );
Wz[12] = bswap_32( inz[12] );
Wx[13] = mm256_bswap_32( inx[13] );
Wy[13] = mm64_bswap_32( iny[13] );
Wz[13] = bswap_32( inz[13] );
Wx[14] = mm256_bswap_32( inx[14] );
Wy[14] = mm64_bswap_32( iny[14] );
Wz[14] = bswap_32( inz[14] );
Wx[15] = mm256_bswap_32( inx[15] );
Wy[15] = mm64_bswap_32( iny[15] );
Wz[15] = bswap_32( inz[15] );
Ax = rx[0]; Ay = ry[0]; Az = rz[0];
Bx = rx[1]; By = ry[1]; Bz = rz[1];
Cx = rx[2]; Cy = ry[2]; Cz = rz[2];
Dx = rx[3]; Dy = ry[3]; Dz = rz[3];
Ex = rx[4]; Ey = ry[4]; Ez = rz[4];
Fx = rx[5]; Fy = ry[5]; Fz = rz[5];
Gx = rx[6]; Gy = ry[6]; Gz = rz[6];
Hx = rx[7]; Hy = ry[7]; Hz = rz[7];
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, 0 );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, 0 );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, 0 );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, 0 );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, 0 );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, 0 );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, 0 );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, 0 );
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, 0 );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, 0 );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, 0 );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, 0 );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, 0 );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, 0 );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, 0 );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
Wx[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
Wy[ 0] = SHA2y_MEXP( 14, 9, 1, 0 );
Wz[ 0] = SHA2z_MEXP( 14, 9, 1, 0 );
Wx[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
Wy[ 1] = SHA2y_MEXP( 15, 10, 2, 1 );
Wz[ 1] = SHA2z_MEXP( 15, 10, 2, 1 );
Wx[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
Wy[ 2] = SHA2y_MEXP( 0, 11, 3, 2 );
Wz[ 2] = SHA2z_MEXP( 0, 11, 3, 2 );
Wx[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
Wy[ 3] = SHA2y_MEXP( 1, 12, 4, 3 );
Wz[ 3] = SHA2z_MEXP( 1, 12, 4, 3 );
Wx[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
Wy[ 4] = SHA2y_MEXP( 2, 13, 5, 4 );
Wz[ 4] = SHA2z_MEXP( 2, 13, 5, 4 );
Wx[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
Wy[ 5] = SHA2y_MEXP( 3, 14, 6, 5 );
Wz[ 5] = SHA2z_MEXP( 3, 14, 6, 5 );
Wx[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
Wy[ 6] = SHA2y_MEXP( 4, 15, 7, 6 );
Wz[ 6] = SHA2z_MEXP( 4, 15, 7, 6 );
Wx[ 7] = SHA2x_MEXP( 5, 0, 8, 7);
Wy[ 7] = SHA2y_MEXP( 5, 0, 8, 7);
Wz[ 7] = SHA2z_MEXP( 5, 0, 8, 7);
Wx[ 8] = SHA2x_MEXP( 6, 1, 9, 8);
Wy[ 8] = SHA2y_MEXP( 6, 1, 9, 8);
Wz[ 8] = SHA2z_MEXP( 6, 1, 9, 8);
Wx[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
Wy[ 9] = SHA2y_MEXP( 7, 2, 10, 9);
Wz[ 9] = SHA2z_MEXP( 7, 2, 10, 9);
Wx[10] = SHA2x_MEXP( 8, 3, 11, 10 );
Wy[10] = SHA2y_MEXP( 8, 3, 11, 10);
Wz[10] = SHA2z_MEXP( 8, 3, 11, 10);
Wx[11] = SHA2x_MEXP( 9, 4, 12, 11);
Wy[11] = SHA2y_MEXP( 9, 4, 12, 11);
Wz[11] = SHA2z_MEXP( 9, 4, 12, 11 );
Wx[12] = SHA2x_MEXP( 10, 5, 13, 12 );
Wy[12] = SHA2y_MEXP( 10, 5, 13, 12 );
Wz[12] = SHA2z_MEXP( 10, 5, 13, 12 );
Wx[13] = SHA2x_MEXP( 11, 6, 14, 13 );
Wy[13] = SHA2y_MEXP( 11, 6, 14, 13 );
Wz[13] = SHA2z_MEXP( 11, 6, 14, 13 );
Wx[14] = SHA2x_MEXP( 12, 7, 15, 14 );
Wy[14] = SHA2y_MEXP( 12, 7, 15, 14 );
Wz[14] = SHA2z_MEXP( 12, 7, 15, 14 );
Wx[15] = SHA2x_MEXP( 13, 8, 0, 15 );
Wy[15] = SHA2y_MEXP( 13, 8, 0, 15 );
Wz[15] = SHA2z_MEXP( 13, 8, 0, 15 );
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, j );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, j );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, j );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, j );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, j );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, j );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, j );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, j );
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, j );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, j );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j );
}
rx[0] = _mm256_add_epi32( rx[0], Ax );
ry[0] = _mm_add_pi32( ry[0], Ay );
rz[0] = rz[0]+ Az;
rx[1] = _mm256_add_epi32( rx[1], Bx );
ry[1] = _mm_add_pi32( ry[1], By );
rz[1] = rz[1]+ Bz;
rx[2] = _mm256_add_epi32( rx[2], Cx );
ry[2] = _mm_add_pi32( ry[2], Cy );
rz[3] = rz[3]+ Dz;
rx[4] = _mm256_add_epi32( rx[4], Ex );
ry[4] = _mm_add_pi32( ry[4], Ey );
rz[4] = rz[4]+ Ez;
rx[5] = _mm256_add_epi32( rx[5], Fx );
ry[5] = _mm_add_pi32( ry[5], Fy );
rz[5] = rz[5]+ Fz;
rx[6] = _mm256_add_epi32( rx[6], Gx );
ry[6] = _mm_add_pi32( ry[6], Gy );
rz[6] = rz[6]+ Gz;
rx[7] = _mm256_add_epi32( rx[7], Hx );
ry[7] = _mm_add_pi32( ry[7], Hy );
rz[7] = rz[7]+ Hz;
}
void sha256_11way_init( sha256_11way_context *ctx )
{
ctx->count_high = ctx->count_low = 0;
ctx->valx[0] = _mm256_set1_epi32( H256[0] );
ctx->valy[0] = _mm_set1_pi32( H256[0] );
ctx->valx[1] = _mm256_set1_epi32( H256[0] );
ctx->valy[1] = _mm_set1_pi32( H256[0] );
ctx->valx[2] = _mm256_set1_epi32( H256[0] );
ctx->valy[2] = _mm_set1_pi32( H256[0] );
ctx->valx[3] = _mm256_set1_epi32( H256[0] );
ctx->valy[3] = _mm_set1_pi32( H256[0] );
ctx->valx[4] = _mm256_set1_epi32( H256[0] );
ctx->valy[4] = _mm_set1_pi32( H256[0] );
ctx->valx[5] = _mm256_set1_epi32( H256[0] );
ctx->valy[5] = _mm_set1_pi32( H256[0] );
ctx->valx[6] = _mm256_set1_epi32( H256[0] );
ctx->valy[6] = _mm_set1_pi32( H256[0] );
ctx->valx[7] = _mm256_set1_epi32( H256[0] );
ctx->valy[7] = _mm_set1_pi32( H256[0] );
memcpy( ctx->valz, H256, 32 );
}
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
const void *datay, const void *dataz, size_t len )
{
__m256i *vdatax = (__m256i*) datax;
__m64 *vdatay = (__m64*) datay;
uint32_t *idataz = (uint32_t*)dataz;
size_t ptr;
const int buf_size = 64;
ptr = (unsigned)ctx->count_low & (buf_size - 1U);
while ( len > 0 )
{
size_t clen;
uint32_t clow, clow2;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 );
memcpy_m64( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
memcpy ( ctx->bufz + ptr, idataz + ptr, clen );
ptr += clen;
len -= clen;
if ( ptr == buf_size )
{
sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy,
ctx->bufz, ctx->valz );
ptr = 0;
}
clow = ctx->count_low;
clow2 = clow + clen;
ctx->count_low = clow2;
if ( clow2 < clow )
ctx->count_high++;
}
}
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
void *dstz)
{
unsigned ptr, u;
uint32_t low, high;
const int buf_size = 64;
const int pad = buf_size - 8;
ptr = (unsigned)ctx->count_low & (buf_size - 1U);
ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 );
ctx->bufz[ ptr>>2 ] = 0x80;
ptr += 4;
if ( ptr > pad )
{
memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
memset_zero_m64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
memset( ctx->bufz + (ptr>>2), 0, (buf_size - ptr) >> 2 );
sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy,
ctx->bufz, ctx->valz );
memset_zero_256( ctx->bufx, pad >> 2 );
memset_zero_m64( ctx->bufy, pad >> 2 );
memset( ctx->bufz, 0, pad >> 2 );
}
else
{
memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 );
memset_zero_m64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 );
memset( ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 );
}
low = ctx->count_low;
high = (ctx->count_high << 3) | (low >> 29);
low = low << 3;
ctx->bufx[ pad >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( high ) );
ctx->bufy[ pad >> 2 ] =
mm64_bswap_32( _mm_set1_pi32( high ) );
ctx->bufz[ pad >> 2 ] =
bswap_32( high );
ctx->bufx[ ( pad+4 ) >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( low ) );
ctx->bufy[ ( pad+4 ) >> 2 ] =
mm64_bswap_32( _mm_set1_pi32( low ) );
ctx->bufz[ ( pad+4 ) >> 2 ] =
bswap_32( low );
sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy,
ctx->bufz, ctx->valz );
for ( u = 0; u < 8; u ++ )
{
casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] );
casti_m64 ( dsty, u ) = mm64_bswap_32( ctx->valy[u] );
((uint32_t*)dstz)[u] = bswap_32( ctx->valz[u] );
}
}
#endif
#endif // 0

View File

@@ -5,137 +5,6 @@
#include <stdio.h> #include <stdio.h>
#include "sha-hash-4way.h" #include "sha-hash-4way.h"
#if defined(SHA256T_11WAY)
static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64)));
void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
const void *inpy, const void*inpz )
{
uint32_t hashx[8*8] __attribute__ ((aligned (64)));
uint32_t hashy[8*2] __attribute__ ((aligned (64)));
uint32_t hashz[8] __attribute__ ((aligned (64)));
sha256_11way_context ctx;
const void *inpx64 = inpx+(64<<3);
const void *inpy64 = inpy+(64<<1);
const void *inpz64 = inpz+ 64;
memcpy( &ctx, &sha256_ctx11, sizeof ctx );
sha256_11way_update( &ctx, inpx64, inpy64, inpz64, 16 );
sha256_11way_close( &ctx, hashx, hashy, hashz );
sha256_11way_init( &ctx );
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
sha256_11way_close( &ctx, hashx, hashy, hashz );
sha256_11way_init( &ctx );
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
sha256_11way_close( &ctx, outx, outy, outz );
}
int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t datax[20*8] __attribute__ ((aligned (64)));
uint32_t datay[20*2] __attribute__ ((aligned (32)));
uint32_t dataz[20] __attribute__ ((aligned (32)));
uint32_t hashx[8*8] __attribute__ ((aligned (32)));
uint32_t hashy[8*2] __attribute__ ((aligned (32)));
uint32_t hashz[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncex = (__m256i*) datax + 19;
__m64 *noncey = (__m64*) datay + 19;
uint32_t *noncez = (uint32_t*)dataz + 19;
int thr_id = mythr->id; // thr_id arg is deprecated
int i;
const uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
// Use dataz (scalar) to stage bswapped data for the vectors.
casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
intrlv_8x32( datax, dataz, dataz, dataz, dataz,
dataz, dataz, dataz, dataz, 640 );
mm64_interleave_2x32( datay, dataz, dataz, 640 );
sha256_11way_init( &sha256_ctx11 );
sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncex = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
*noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
*noncez = bswap_32( n+10 );
pdata[19] = n;
sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );
if ( opt_benchmark ) { n += 11; continue; }
hash7 = &(hashx[7<<3]);
for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
{
// deinterleave hash for lane
extr_lane_8x32( lane_hash, hashx, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + i;
submit_lane_solution( work, lane_hash, mythr, i );
}
}
hash7 = &(hashy[7<<1]);
for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
{
mm64_extr_lane_2x32( lane_hash, hashy, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + 8 + i;
submit_lane_solution( work, lane_hash, mythr, i+8 );
}
}
if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
{
pdata[19] = n+10;
submit_lane_solution( work, hashz, mythr, 10 );
}
n += 11;
} while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
#if defined(SHA256T_8WAY) #if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));

View File

@@ -83,13 +83,14 @@ HashReturn init_sd(hashState_sd *state, int hashbitlen) {
char *init; char *init;
#ifndef NO_PRECOMPUTED_IV #ifndef NO_PRECOMPUTED_IV
if (hashbitlen == 224) // if (hashbitlen == 224)
r=InitIV(state, hashbitlen, IV_224); // r=InitIV(state, hashbitlen, IV_224);
else if (hashbitlen == 256) // else if (hashbitlen == 256)
r=InitIV(state, hashbitlen, IV_256); // r=InitIV(state, hashbitlen, IV_256);
else if (hashbitlen == 384) // else if (hashbitlen == 384)
r=InitIV(state, hashbitlen, IV_384); // r=InitIV(state, hashbitlen, IV_384);
else if (hashbitlen == 512) // else
if (hashbitlen == 512)
r=InitIV(state, hashbitlen, IV_512); r=InitIV(state, hashbitlen, IV_512);
else else
#endif #endif

View File

@@ -2,13 +2,136 @@
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
#include "skein-hash-4way.h" #include "skein-hash-4way.h"
// 8-way is faster than the SHA extensions on Icelake.
// The SHA extensions are faster than 4-way on Ryzen.
//
#if defined(__SHA__) #if defined(__SHA__)
#include <openssl/sha.h> #include <openssl/sha.h>
#else
#include "algo/sha/sha-hash-4way.h"
#endif #endif
#include "algo/sha/sha-hash-4way.h"
#if defined (SKEIN_4WAY) #if defined (SKEIN_8WAY)
void skeinhash_8way( void *state, const void *input )
{
uint64_t vhash64[16*8] __attribute__ ((aligned (128)));
skein512_8way_context ctx_skein;
//#if defined(__SHA__)
// uint32_t hash0[16] __attribute__ ((aligned (64)));
// uint32_t hash1[16] __attribute__ ((aligned (64)));
// uint32_t hash2[16] __attribute__ ((aligned (64)));
// uint32_t hash3[16] __attribute__ ((aligned (64)));
// uint32_t hash4[16] __attribute__ ((aligned (64)));
// uint32_t hash5[16] __attribute__ ((aligned (64)));
// uint32_t hash6[16] __attribute__ ((aligned (64)));
// uint32_t hash7[16] __attribute__ ((aligned (64)));
// SHA256_CTX ctx_sha256;
//#else
uint32_t vhash32[32*8] __attribute__ ((aligned (128)));
sha256_8way_context ctx_sha256;
//#endif
skein512_8way_init( &ctx_skein );
skein512_8way_update( &ctx_skein, input, 80 );
skein512_8way_close( &ctx_skein, vhash64 );
/*
#if defined(__SHA__)
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash64, 512 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash4, 64 );
SHA256_Final( (unsigned char*)hash4, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash5, 64 );
SHA256_Final( (unsigned char*)hash5, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash6, 64 );
SHA256_Final( (unsigned char*)hash6, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash7, 64 );
SHA256_Final( (unsigned char*)hash7, &ctx_sha256 );
intrlv_8x32( state, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
#else
*/
rintrlv_8x64_8x32( vhash32, vhash64, 512 );
// dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
// vhash64, 512 );
// intrlv_8x32( vhash32, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
// hash7, 512 );
sha256_8way_init( &ctx_sha256 );
sha256_8way( &ctx_sha256, vhash32, 64 );
sha256_8way_close( &ctx_sha256, state );
//#endif
}
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (128)));
uint32_t hash[16*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
skeinhash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane ] <= Htarg )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (SKEIN_4WAY)
void skeinhash_4way( void *state, const void *input ) void skeinhash_4way( void *state, const void *input )
{ {
@@ -26,7 +149,7 @@ void skeinhash_4way( void *state, const void *input )
#endif #endif
skein512_4way_init( &ctx_skein ); skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, input, 80 ); skein512_4way_update( &ctx_skein, input, 80 );
skein512_4way_close( &ctx_skein, vhash64 ); skein512_4way_close( &ctx_skein, vhash64 );
#if defined(__SHA__) #if defined(__SHA__)
@@ -71,7 +194,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce; uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9; // aligned __m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata ); mm256_bswap32_intrlv80_4x64( vdata, pdata );
do do
@@ -92,9 +215,9 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
} }
} }
n += 4; n += 4;
} while ( (n < max_nonce) && !work_restart[thr_id].restart ); } while ( (n < max_nonce-4) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1; *hashes_done = n - first_nonce;
return 0; return 0;
} }

View File

@@ -4,8 +4,11 @@
bool register_skein_algo( algo_gate_t* gate ) bool register_skein_algo( algo_gate_t* gate )
{ {
gate->optimizations = AVX2_OPT | SHA_OPT; gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (SKEIN_4WAY) #if defined (SKEIN_8WAY)
gate->scanhash = (void*)&scanhash_skein_8way;
gate->hash = (void*)&skeinhash_8way;
#elif defined (SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein_4way; gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way; gate->hash = (void*)&skeinhash_4way;
#else #else
@@ -15,3 +18,20 @@ bool register_skein_algo( algo_gate_t* gate )
return true; return true;
}; };
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined (SKEIN_8WAY)
gate->scanhash = (void*)&scanhash_skein2_8way;
gate->hash = (void*)&skein2hash_8way;
#elif defined (SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif
return true;
};

View File

@@ -1,23 +1,44 @@
#ifndef __SKEIN_GATE_H__ #ifndef __SKEIN_GATE_H__
#define __SKEIN_GATE_H__ #define __SKEIN_GATE_H__ 1
#include <stdint.h> #include <stdint.h>
#include "algo-gate-api.h" #include "algo-gate-api.h"
#if defined(__AVX2__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SKEIN_4WAY #define SKEIN_8WAY 1
#elif defined(__AVX2__)
#define SKEIN_4WAY 1
#endif #endif
#if defined(SKEIN_4WAY) #if defined(SKEIN_8WAY)
void skeinhash_8way( void *output, const void *input );
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void skein2hash_8way( void *output, const void *input );
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#elif defined(SKEIN_4WAY)
void skeinhash_4way( void *output, const void *input ); void skeinhash_4way( void *output, const void *input );
int scanhash_skein_4way( struct work *work, uint32_t max_nonce, int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ); uint64_t *hashes_done, struct thr_info *mythr );
#endif
void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#else
void skeinhash( void *output, const void *input ); void skeinhash( void *output, const void *input );
int scanhash_skein( struct work *work, uint32_t max_nonce, int scanhash_skein( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ); uint64_t *hashes_done, struct thr_info *mythr );
void skein2hash( void *output, const void *input );
int scanhash_skein2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif #endif

View File

@@ -36,7 +36,6 @@
#include <string.h> #include <string.h>
#include "skein-hash-4way.h" #include "skein-hash-4way.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C"{ extern "C"{
#endif #endif
@@ -45,6 +44,22 @@ extern "C"{
#pragma warning (disable: 4146) #pragma warning (disable: 4146)
#endif #endif
/*
static const sph_u64 IV256[] = {
SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
};
static const sph_u64 IV512[] = {
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
};
*/
/* /*
* M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
*/ */
@@ -270,8 +285,151 @@ extern "C"{
#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) #define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) #define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
#define READ_STATE_BIG(sc) do { \
h0 = (sc)->h0; \
h1 = (sc)->h1; \
h2 = (sc)->h2; \
h3 = (sc)->h3; \
h4 = (sc)->h4; \
h5 = (sc)->h5; \
h6 = (sc)->h6; \
h7 = (sc)->h7; \
bcount = sc->bcount; \
} while (0)
#define WRITE_STATE_BIG(sc) do { \
(sc)->h0 = h0; \
(sc)->h1 = h1; \
(sc)->h2 = h2; \
(sc)->h3 = h3; \
(sc)->h4 = h4; \
(sc)->h5 = h5; \
(sc)->h6 = h6; \
(sc)->h7 = h7; \
sc->bcount = bcount; \
} while (0)
// AVX2 all scalar vars are now vectors representing 4 nonces in parallel // AVX2 all scalar vars are now vectors representing 4 nonces in parallel
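/* Illustration only, not from this commit: one Threefish MIX step in scalar
 * form is
 *    x0 = x0 + x1;  x1 = rol64( x1, rc ) ^ x0;     // rol64 = 64-bit rotate left
 * The N-way macros below perform the identical operation, with each 64-bit
 * lane of a __m256i (4-way) or __m512i (8-way) register holding that
 * variable for a different nonce.
 */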
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
do { \
k8 = _mm512_xor_si512( _mm512_xor_si512( \
_mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \
_mm512_xor_si512( k2, k3 ) ), \
_mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \
_mm512_xor_si512( k6, k7 ) ) ), \
m512_const1_64( 0x1BD11BDAA9FC1A22) ); \
t2 = t0 ^ t1; \
} while (0)
#define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
do { \
w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \
w1 = _mm512_add_epi64( w1, SKBI(k,s,1) ); \
w2 = _mm512_add_epi64( w2, SKBI(k,s,2) ); \
w3 = _mm512_add_epi64( w3, SKBI(k,s,3) ); \
w4 = _mm512_add_epi64( w4, SKBI(k,s,4) ); \
w5 = _mm512_add_epi64( w5, _mm512_add_epi64( SKBI(k,s,5), \
m512_const1_64( SKBT(t,s,0) ) ) ); \
w6 = _mm512_add_epi64( w6, _mm512_add_epi64( SKBI(k,s,6), \
m512_const1_64( SKBT(t,s,1) ) ) ); \
w7 = _mm512_add_epi64( w7, _mm512_add_epi64( SKBI(k,s,7), \
m512_const1_64( s ) ) ); \
} while (0)
#define TFBIG_MIX_8WAY(x0, x1, rc) \
do { \
x0 = _mm512_add_epi64( x0, x1 ); \
x1 = _mm512_xor_si512( mm512_rol_64( x1, rc ), x0 ); \
} while (0)
#define TFBIG_MIX8_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \
TFBIG_MIX_8WAY(w0, w1, rc0); \
TFBIG_MIX_8WAY(w2, w3, rc1); \
TFBIG_MIX_8WAY(w4, w5, rc2); \
TFBIG_MIX_8WAY(w6, w7, rc3); \
} while (0)
#define TFBIG_8WAY_4e(s) do { \
TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
} while (0)
#define TFBIG_8WAY_4o(s) do { \
TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
} while (0)
#define UBI_BIG_8WAY(etype, extra) \
do { \
sph_u64 t0, t1, t2; \
__m512i h8; \
__m512i m0 = buf[0]; \
__m512i m1 = buf[1]; \
__m512i m2 = buf[2]; \
__m512i m3 = buf[3]; \
__m512i m4 = buf[4]; \
__m512i m5 = buf[5]; \
__m512i m6 = buf[6]; \
__m512i m7 = buf[7]; \
\
__m512i p0 = m0; \
__m512i p1 = m1; \
__m512i p2 = m2; \
__m512i p3 = m3; \
__m512i p4 = m4; \
__m512i p5 = m5; \
__m512i p6 = m6; \
__m512i p7 = m7; \
t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
TFBIG_KINIT_8WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
TFBIG_8WAY_4e(0); \
TFBIG_8WAY_4o(1); \
TFBIG_8WAY_4e(2); \
TFBIG_8WAY_4o(3); \
TFBIG_8WAY_4e(4); \
TFBIG_8WAY_4o(5); \
TFBIG_8WAY_4e(6); \
TFBIG_8WAY_4o(7); \
TFBIG_8WAY_4e(8); \
TFBIG_8WAY_4o(9); \
TFBIG_8WAY_4e(10); \
TFBIG_8WAY_4o(11); \
TFBIG_8WAY_4e(12); \
TFBIG_8WAY_4o(13); \
TFBIG_8WAY_4e(14); \
TFBIG_8WAY_4o(15); \
TFBIG_8WAY_4e(16); \
TFBIG_8WAY_4o(17); \
TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
h0 = _mm512_xor_si512( m0, p0 );\
h1 = _mm512_xor_si512( m1, p1 );\
h2 = _mm512_xor_si512( m2, p2 );\
h3 = _mm512_xor_si512( m3, p3 );\
h4 = _mm512_xor_si512( m4, p4 );\
h5 = _mm512_xor_si512( m5, p5 );\
h6 = _mm512_xor_si512( m6, p6 );\
h7 = _mm512_xor_si512( m7, p7 );\
} while (0)
#define DECL_STATE_BIG_8WAY \
__m512i h0, h1, h2, h3, h4, h5, h6, h7; \
sph_u64 bcount;
#endif // AVX512
#define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \ #define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
do { \ do { \
k8 = _mm256_xor_si256( _mm256_xor_si256( \ k8 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -298,39 +456,34 @@ do { \
m256_const1_64( s ) ) ); \ m256_const1_64( s ) ) ); \
} while (0) } while (0)
#define TFBIG_MIX_4WAY(x0, x1, rc) \ #define TFBIG_MIX_4WAY(x0, x1, rc) \
do { \ do { \
x0 = _mm256_add_epi64( x0, x1 ); \ x0 = _mm256_add_epi64( x0, x1 ); \
x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 ); \ x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 ); \
} while (0) } while (0)
// typeless
#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \ #define TFBIG_MIX8_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \
TFBIG_MIX_4WAY(w0, w1, rc0); \ TFBIG_MIX_4WAY(w0, w1, rc0); \
TFBIG_MIX_4WAY(w2, w3, rc1); \ TFBIG_MIX_4WAY(w2, w3, rc1); \
TFBIG_MIX_4WAY(w4, w5, rc2); \ TFBIG_MIX_4WAY(w4, w5, rc2); \
TFBIG_MIX_4WAY(w6, w7, rc3); \ TFBIG_MIX_4WAY(w6, w7, rc3); \
} while (0) } while (0)
#define TFBIG_4e(s) do { \ #define TFBIG_4WAY_4e(s) do { \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
} while (0) } while (0)
#define TFBIG_4o(s) do { \ #define TFBIG_4WAY_4o(s) do { \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
} while (0) } while (0)
// scale buf offset by 4 // scale buf offset by 4
#define UBI_BIG_4WAY(etype, extra) \ #define UBI_BIG_4WAY(etype, extra) \
@@ -357,24 +510,24 @@ do { \
t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \ TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
TFBIG_4e(0); \ TFBIG_4WAY_4e(0); \
TFBIG_4o(1); \ TFBIG_4WAY_4o(1); \
TFBIG_4e(2); \ TFBIG_4WAY_4e(2); \
TFBIG_4o(3); \ TFBIG_4WAY_4o(3); \
TFBIG_4e(4); \ TFBIG_4WAY_4e(4); \
TFBIG_4o(5); \ TFBIG_4WAY_4o(5); \
TFBIG_4e(6); \ TFBIG_4WAY_4e(6); \
TFBIG_4o(7); \ TFBIG_4WAY_4o(7); \
TFBIG_4e(8); \ TFBIG_4WAY_4e(8); \
TFBIG_4o(9); \ TFBIG_4WAY_4o(9); \
TFBIG_4e(10); \ TFBIG_4WAY_4e(10); \
TFBIG_4o(11); \ TFBIG_4WAY_4o(11); \
TFBIG_4e(12); \ TFBIG_4WAY_4e(12); \
TFBIG_4o(13); \ TFBIG_4WAY_4o(13); \
TFBIG_4e(14); \ TFBIG_4WAY_4e(14); \
TFBIG_4o(15); \ TFBIG_4WAY_4o(15); \
TFBIG_4e(16); \ TFBIG_4WAY_4e(16); \
TFBIG_4o(17); \ TFBIG_4WAY_4o(17); \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \ TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
h0 = _mm256_xor_si256( m0, p0 );\ h0 = _mm256_xor_si256( m0, p0 );\
h1 = _mm256_xor_si256( m1, p1 );\ h1 = _mm256_xor_si256( m1, p1 );\
@@ -391,45 +544,142 @@ do { \
__m256i h0, h1, h2, h3, h4, h5, h6, h7; \ __m256i h0, h1, h2, h3, h4, h5, h6, h7; \
sph_u64 bcount; sph_u64 bcount;
#define READ_STATE_BIG(sc) do { \ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
h0 = (sc)->h0; \
h1 = (sc)->h1; \
h2 = (sc)->h2; \
h3 = (sc)->h3; \
h4 = (sc)->h4; \
h5 = (sc)->h5; \
h6 = (sc)->h6; \
h7 = (sc)->h7; \
bcount = sc->bcount; \
} while (0)
#define WRITE_STATE_BIG(sc) do { \ void skein256_8way_init( skein256_8way_context *sc )
(sc)->h0 = h0; \ {
(sc)->h1 = h1; \ sc->h0 = m512_const1_64( 0xCCD044A12FDB3E13 );
(sc)->h2 = h2; \ sc->h1 = m512_const1_64( 0xE83590301A79A9EB );
(sc)->h3 = h3; \ sc->h2 = m512_const1_64( 0x55AEA0614F816E6F );
(sc)->h4 = h4; \ sc->h3 = m512_const1_64( 0x2A2767A4AE9B94DB );
(sc)->h5 = h5; \ sc->h4 = m512_const1_64( 0xEC06025E74DD7683 );
(sc)->h6 = h6; \ sc->h5 = m512_const1_64( 0xE7A436CDC4746251 );
(sc)->h7 = h7; \ sc->h6 = m512_const1_64( 0xC36FBAF9393AD185 );
sc->bcount = bcount; \ sc->h7 = m512_const1_64( 0x3EEDBA1833EDFC13 );
} while (0) sc->bcount = 0;
sc->ptr = 0;
}
/* void skein512_8way_init( skein512_8way_context *sc )
static const sph_u64 IV256[] = { {
SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), sc->h0 = m512_const1_64( 0x4903ADFF749C51CE );
SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), sc->h1 = m512_const1_64( 0x0D95DE399746DF03 );
SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251), sc->h2 = m512_const1_64( 0x8FD1934127C79BCE );
SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13) sc->h3 = m512_const1_64( 0x9A255629FF352CB1 );
}; sc->h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
sc->h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
sc->h6 = m512_const1_64( 0x991112C71A75B523 );
sc->h7 = m512_const1_64( 0xAE18A40B660FCC33 );
sc->bcount = 0;
sc->ptr = 0;
}
static void
skein_big_core_8way( skein512_8way_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
size_t ptr;
unsigned first;
DECL_STATE_BIG_8WAY
buf = sc->buf;
ptr = sc->ptr;
const int buf_size = 64;   // bytes per lane, one Skein-512 block
if ( len <= buf_size - ptr )
{
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
sc->ptr = ptr + len;
return;
}
READ_STATE_BIG( sc );
first = ( bcount == 0 ) << 7;
do {
size_t clen;
if ( ptr == buf_size )
{
bcount ++;
UBI_BIG_8WAY( 96 + first, 0 );
first = 0;
ptr = 0;
}
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata += (clen>>3);
len -= clen;
} while ( len > 0 );
WRITE_STATE_BIG( sc );
sc->ptr = ptr;
}
static void
skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_len )
{
__m512i *buf;
size_t ptr;
unsigned et;
DECL_STATE_BIG_8WAY
buf = sc->buf;
ptr = sc->ptr;
const int buf_size = 64;
READ_STATE_BIG(sc);
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
memset_zero_512( buf, buf_size >> 3 );
bcount = 0;
UBI_BIG_8WAY( 510, 8 );
buf[0] = h0;
buf[1] = h1;
buf[2] = h2;
buf[3] = h3;
buf[4] = h4;
buf[5] = h5;
buf[6] = h6;
buf[7] = h7;
memcpy_512( dst, buf, out_len >> 3 );
}
void
skein256_8way_update(void *cc, const void *data, size_t len)
{
skein_big_core_8way(cc, data, len);
}
void
skein256_8way_close(void *cc, void *dst)
{
skein_big_close_8way(cc, 0, 0, dst, 32);
}
void
skein512_8way_update(void *cc, const void *data, size_t len)
{
skein_big_core_8way(cc, data, len);
}
void
skein512_8way_close(void *cc, void *dst)
{
skein_big_close_8way(cc, 0, 0, dst, 64);
}
#endif // AVX512
static const sph_u64 IV512[] = {
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
};
*/
void skein256_4way_init( skein256_4way_context *sc ) void skein256_4way_init( skein256_4way_context *sc )
{ {
@@ -517,66 +767,30 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr; ptr = sc->ptr;
const int buf_size = 64; const int buf_size = 64;
/*
* At that point, if ptr == 0, then the message was empty;
* otherwise, there is between 1 and 64 bytes (inclusive) which
* are yet to be processed. Either way, we complete the buffer
* to a full block with zeros (the Skein specification mandates
* that an empty message is padded so that there is at least
* one block to process).
*
* Once this block has been processed, we do it again, with
* a block full of zeros, for the output (that block contains
* the encoding of "0", over 8 bytes, then padded with zeros).
*/
READ_STATE_BIG(sc); READ_STATE_BIG(sc);
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7); et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr ); UBI_BIG_4WAY( et, ptr );
memset_zero_256( buf, buf_size >> 3 ); memset_zero_256( buf, buf_size >> 3 );
bcount = 0; bcount = 0;
UBI_BIG_4WAY( 510, 8 ); UBI_BIG_4WAY( 510, 8 );
buf[0] = h0; buf[0] = h0;
buf[1] = h1; buf[1] = h1;
buf[2] = h2; buf[2] = h2;
buf[3] = h3; buf[3] = h3;
buf[4] = h4; buf[4] = h4;
buf[5] = h5; buf[5] = h5;
buf[6] = h6; buf[6] = h6;
buf[7] = h7; buf[7] = h7;
memcpy_256( dst, buf, out_len >> 3 ); memcpy_256( dst, buf, out_len >> 3 );
} }
/*
static const sph_u64 IV256[] = {
SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
};
static const sph_u64 IV512[] = {
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
};
*/
/*
void void
skein256_4way_init(void *cc) skein256_4way_update(void *cc, const void *data, size_t len)
{
skein_big_init_4way(cc, IV256);
}
*/
void
skein256_4way(void *cc, const void *data, size_t len)
{ {
skein_big_core_4way(cc, data, len); skein_big_core_4way(cc, data, len);
} }
@@ -587,16 +801,8 @@ skein256_4way_close(void *cc, void *dst)
skein_big_close_4way(cc, 0, 0, dst, 32); skein_big_close_4way(cc, 0, 0, dst, 32);
} }
/*
void void
skein512_4way_init(void *cc) skein512_4way_update(void *cc, const void *data, size_t len)
{
skein_big_init_4way(cc, IV512);
}
*/
void
skein512_4way(void *cc, const void *data, size_t len)
{ {
skein_big_core_4way(cc, data, len); skein_big_core_4way(cc, data, len);
} }

View File

@@ -55,29 +55,50 @@ extern "C"{
#define SPH_SIZE_skein256 256 #define SPH_SIZE_skein256 256
#define SPH_SIZE_skein512 512 #define SPH_SIZE_skein512 512
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct typedef struct
{ {
__m256i buf[8] __attribute__ ((aligned (64))); __m512i buf[8];
__m512i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} sph_skein_8way_big_context __attribute__ ((aligned (128)));
typedef sph_skein_8way_big_context skein512_8way_context;
typedef sph_skein_8way_big_context skein256_8way_context;
void skein512_8way_init( skein512_8way_context *sc );
void skein512_8way_update( void *cc, const void *data, size_t len );
void skein512_8way_close( void *cc, void *dst );
void skein256_8way_init( skein256_8way_context *sc );
void skein256_8way_update( void *cc, const void *data, size_t len );
void skein256_8way_close( void *cc, void *dst );
#endif // AVX512
typedef struct
{
__m256i buf[8];
__m256i h0, h1, h2, h3, h4, h5, h6, h7; __m256i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr; size_t ptr;
sph_u64 bcount; sph_u64 bcount;
} sph_skein_4way_big_context; } sph_skein_4way_big_context __attribute__ ((aligned (128)));
typedef sph_skein_4way_big_context skein512_4way_context; typedef sph_skein_4way_big_context skein512_4way_context;
typedef sph_skein_4way_big_context skein256_4way_context; typedef sph_skein_4way_big_context skein256_4way_context;
void skein512_4way_init( skein512_4way_context *sc ); void skein512_4way_init( skein512_4way_context *sc );
void skein512_4way( void *cc, const void *data, size_t len ); void skein512_4way_update( void *cc, const void *data, size_t len );
void skein512_4way_close( void *cc, void *dst ); void skein512_4way_close( void *cc, void *dst );
//void sph_skein512_addbits_and_close( #define skein512_4way skein512_4way_update
// void *cc, unsigned ub, unsigned n, void *dst);
void skein256_4way_init( skein256_4way_context *sc ); void skein256_4way_init( skein256_4way_context *sc );
void skein256_4way( void *cc, const void *data, size_t len ); void skein256_4way_update( void *cc, const void *data, size_t len );
void skein256_4way_close( void *cc, void *dst ); void skein256_4way_close( void *cc, void *dst );
//void sph_skein256_addbits_and_close( #define skein256_4way skein256_4way_update
// void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@@ -1,9 +1,66 @@
#include "skein2-gate.h" #include "skein-gate.h"
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
#include "skein-hash-4way.h" #include "skein-hash-4way.h"
#if defined(SKEIN2_4WAY) #if defined(SKEIN_8WAY)
void skein2hash_8way( void *output, const void *input )
{
skein512_8way_context ctx;
uint64_t hash[16*8] __attribute__ ((aligned (128)));
skein512_8way_init( &ctx );
skein512_8way_update( &ctx, input, 80 );
skein512_8way_close( &ctx, hash );
skein512_8way_init( &ctx );
skein512_8way_update( &ctx, hash, 64 );
skein512_8way_close( &ctx, output );
}
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
skein2hash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(SKEIN_4WAY)
void skein2hash_4way( void *output, const void *input ) void skein2hash_4way( void *output, const void *input )
{ {

View File

@@ -1,17 +0,0 @@
#include "skein2-gate.h"
#include <stdint.h>
#include "sph_skein.h"
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
#if defined (SKEIN2_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif
return true;
};

View File

@@ -1,20 +0,0 @@
#ifndef __SKEIN2GATE_H__
#define __SKEIN2_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#define SKEIN2_4WAY
#endif
#if defined(SKEIN2_4WAY)
void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#endif
void skein2hash( void *output, const void *input );
int scanhash_skein2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h" #include "skein-gate.h"
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>

View File

@@ -45,12 +45,12 @@ void init_tt8_4way_ctx()
void timetravel_4way_hash(void *output, const void *input) void timetravel_4way_hash(void *output, const void *input)
{ {
uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash0[10] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash1[10] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash2[10] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t hash3[10] __attribute__ ((aligned (64)));
uint64_t vhashX[8*4] __attribute__ ((aligned (64))); uint64_t vhashX[10*4] __attribute__ ((aligned (64)));
uint64_t vhashY[8*4] __attribute__ ((aligned (64))); uint64_t vhashY[10*4] __attribute__ ((aligned (64)));
uint64_t *vhashA, *vhashB; uint64_t *vhashA, *vhashB;
tt8_4way_ctx_holder ctx __attribute__ ((aligned (64))); tt8_4way_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64; uint32_t dataLen = 64;

View File

@@ -51,12 +51,12 @@ void init_tt10_4way_ctx()
void timetravel10_4way_hash(void *output, const void *input) void timetravel10_4way_hash(void *output, const void *input)
{ {
uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash0[10] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash1[10] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash2[10] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t hash3[10] __attribute__ ((aligned (64)));
uint64_t vhashX[8*4] __attribute__ ((aligned (64))); uint64_t vhashX[10*4] __attribute__ ((aligned (64)));
uint64_t vhashY[8*4] __attribute__ ((aligned (64))); uint64_t vhashY[10*4] __attribute__ ((aligned (64)));
uint64_t *vhashA, *vhashB; uint64_t *vhashA, *vhashB;
tt10_4way_ctx_holder ctx __attribute__ ((aligned (64))); tt10_4way_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64; uint32_t dataLen = 64;

View File

@@ -108,7 +108,7 @@ void x12_4way_hash( void *state, const void *input )
intrlv_2x128( vhash, hash2, hash3, 512 ); intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
intrlv_2x128( hash2, hash3, vhash, 512 ); dintrlv_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );

View File

@@ -49,6 +49,7 @@
* no slowdown from the prefixes is generally observed on AMD CPUs supporting * no slowdown from the prefixes is generally observed on AMD CPUs supporting
* XOP, some slowdown is sometimes observed on Intel CPUs with AVX. * XOP, some slowdown is sometimes observed on Intel CPUs with AVX.
*/ */
/*
#ifdef __XOP__ #ifdef __XOP__
#warning "Note: XOP is enabled. That's great." #warning "Note: XOP is enabled. That's great."
#elif defined(__AVX__) #elif defined(__AVX__)
@@ -60,6 +61,7 @@
#else #else
#warning "Note: building generic code for non-x86. That's OK." #warning "Note: building generic code for non-x86. That's OK."
#endif #endif
*/
/* /*
* The SSE4 code version has fewer instructions than the generic SSE2 version, * The SSE4 code version has fewer instructions than the generic SSE2 version,

View File

@@ -16,7 +16,8 @@ mv cpuminer cpuminer-avx512
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl # GCC 9 doesn't include AES with core-avx2
CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure --with-curl
make -j 16 make -j 16
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-avx2.exe mv cpuminer.exe cpuminer-avx2.exe
@@ -25,7 +26,7 @@ mv cpuminer cpuminer-avx2
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure --with-curl
make -j 16 make -j 16
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx.exe mv cpuminer.exe cpuminer-aes-avx.exe
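Note: a quick way to confirm the GCC 9 behaviour mentioned above is to list the
macros a given -march value predefines (diagnostic sketch only, output depends
on the installed compiler):

echo | gcc -x c -march=core-avx2 -dM -E - | grep -E '__AES__|__AVX2__'

If __AES__ is missing from the output, -maes must be added explicitly.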

86
build-allarch.sh.bak Executable file
View File

@@ -0,0 +1,86 @@
#!/bin/bash
#
# This script is not intended for users, it is only used for compile testing
# during development. However, the information contained may provide
# compilation tips to users.
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-avx512.exe
strip -s cpuminer
mv cpuminer cpuminer-avx512
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-avx2.exe
strip -s cpuminer
mv cpuminer cpuminer-avx2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx.exe
strip -s cpuminer
mv cpuminer cpuminer-aes-avx
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe
strip -s cpuminer
mv cpuminer cpuminer-aes-sse42
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse42.exe
strip -s cpuminer
mv cpuminer cpuminer-sse42
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-ssse3.exe
strip -s cpuminer
mv cpuminer cpuminer-ssse3
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse2.exe
strip -s cpuminer
mv cpuminer cpuminer-sse2
make clean || echo done
rm -f config.status
CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-zen.exe
strip -s cpuminer
mv cpuminer cpuminer-zen
make clean || echo done
rm -f config.status
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
strip -s cpuminer

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.11. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.0.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.9.11' PACKAGE_VERSION='3.10.0'
PACKAGE_STRING='cpuminer-opt 3.9.11' PACKAGE_STRING='cpuminer-opt 3.10.0'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.9.11 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.10.0 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.9.11:";; short | recursive ) echo "Configuration of cpuminer-opt 3.10.0:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.9.11 cpuminer-opt configure 3.10.0
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.9.11, which was It was created by cpuminer-opt $as_me 3.10.0, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.9.11' VERSION='3.10.0'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.9.11, which was This file was extended by cpuminer-opt $as_me 3.10.0, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.9.11 cpuminer-opt config.status 3.10.0
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.9.11]) AC_INIT([cpuminer-opt], [3.10.0])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -53,6 +53,8 @@
#if HAVE_SYS_PARAM_H #if HAVE_SYS_PARAM_H
#include <sys/param.h> #include <sys/param.h>
#endif #endif
// GCC 9 warning sysctl.h is deprecated
#include <sys/sysctl.h> #include <sys/sysctl.h>
#endif #endif
#endif #endif
@@ -3339,12 +3341,14 @@ bool check_cpu_capability ()
bool cpu_has_avx2 = has_avx2(); bool cpu_has_avx2 = has_avx2();
bool cpu_has_sha = has_sha(); bool cpu_has_sha = has_sha();
bool cpu_has_avx512 = has_avx512(); bool cpu_has_avx512 = has_avx512();
bool cpu_has_vaes = has_vaes();
bool sw_has_aes = false; bool sw_has_aes = false;
bool sw_has_sse42 = false; bool sw_has_sse42 = false;
bool sw_has_avx = false; bool sw_has_avx = false;
bool sw_has_avx2 = false; bool sw_has_avx2 = false;
bool sw_has_avx512 = false; bool sw_has_avx512 = false;
bool sw_has_sha = false; bool sw_has_sha = false;
bool sw_has_vaes = false;
set_t algo_features = algo_gate.optimizations; set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features ); bool algo_has_aes = set_incl( AES_OPT, algo_features );
@@ -3352,12 +3356,14 @@ bool check_cpu_capability ()
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features ); bool algo_has_sha = set_incl( SHA_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool use_aes; bool use_aes;
bool use_sse2; bool use_sse2;
bool use_sse42; bool use_sse42;
bool use_avx2; bool use_avx2;
bool use_avx512; bool use_avx512;
bool use_sha; bool use_sha;
bool use_vaes;
bool use_none; bool use_none;
#ifdef __AES__ #ifdef __AES__
@@ -3372,12 +3378,16 @@ bool check_cpu_capability ()
#ifdef __AVX2__ #ifdef __AVX2__
sw_has_avx2 = true; sw_has_avx2 = true;
#endif #endif
#if (defined(__AVX512F__) && defined(__AVX51DQF__) && defined(__AVX51BW__) && defined(__AVX512VL__)) #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
sw_has_avx512 = true; sw_has_avx512 = true;
#endif #endif
#ifdef __SHA__ #ifdef __SHA__
sw_has_sha = true; sw_has_sha = true;
#endif #endif
#ifdef __VAES__
sw_has_vaes = true;
#endif
// #if !((__AES__) || (__SSE2__)) // #if !((__AES__) || (__SSE2__))
// printf("Neither __AES__ nor __SSE2__ defined.\n"); // printf("Neither __AES__ nor __SSE2__ defined.\n");
@@ -3404,6 +3414,7 @@ bool check_cpu_capability ()
if ( cpu_has_avx2 ) printf( " AVX2" ); if ( cpu_has_avx2 ) printf( " AVX2" );
if ( cpu_has_avx512 ) printf( " AVX512" ); if ( cpu_has_avx512 ) printf( " AVX512" );
if ( cpu_has_sha ) printf( " SHA" ); if ( cpu_has_sha ) printf( " SHA" );
if ( cpu_has_vaes ) printf( " VAES" );
printf(".\nSW features: SSE2"); printf(".\nSW features: SSE2");
if ( sw_has_aes ) printf( " AES" ); if ( sw_has_aes ) printf( " AES" );
@@ -3412,18 +3423,20 @@ bool check_cpu_capability ()
if ( sw_has_avx2 ) printf( " AVX2" ); if ( sw_has_avx2 ) printf( " AVX2" );
if ( sw_has_avx512 ) printf( " AVX512" ); if ( sw_has_avx512 ) printf( " AVX512" );
if ( sw_has_sha ) printf( " SHA" ); if ( sw_has_sha ) printf( " SHA" );
if ( sw_has_vaes ) printf( " VAES" );
printf(".\nAlgo features:"); printf(".\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" ); if ( algo_features == EMPTY_SET ) printf( " None" );
else else
{ {
if ( algo_has_sse2 ) printf( " SSE2" ); if ( algo_has_sse2 ) printf( " SSE2" );
if ( algo_has_aes ) printf( " AES" ); if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sse42 ) printf( " SSE4.2" ); if ( algo_has_sse42 ) printf( " SSE4.2" );
if ( algo_has_avx2 ) printf( " AVX2" ); if ( algo_has_avx2 ) printf( " AVX2" );
if ( algo_has_avx512 ) printf( " AVX512" ); if ( algo_has_avx512 ) printf( " AVX512" );
if ( algo_has_sha ) printf( " SHA" ); if ( algo_has_sha ) printf( " SHA" );
if ( algo_has_vaes ) printf( " VAES" );
} }
printf(".\n"); printf(".\n");
@@ -3461,8 +3474,9 @@ bool check_cpu_capability ()
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes;
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
use_sha ); use_sha || use_vaes );
// Display best options // Display best options
printf( "Start mining with" ); printf( "Start mining with" );

View File

@@ -575,12 +575,26 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
__m128i s3 = casti_m128i( src,3 ); __m128i s3 = casti_m128i( src,3 );
__m128i s4 = casti_m128i( src,4 ); __m128i s4 = casti_m128i( src,4 );
#if defined(__SSSE3__)
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
#else
s0 = mm128_bswap_32( s0 ); s0 = mm128_bswap_32( s0 );
s1 = mm128_bswap_32( s1 ); s1 = mm128_bswap_32( s1 );
s2 = mm128_bswap_32( s2 ); s2 = mm128_bswap_32( s2 );
s3 = mm128_bswap_32( s3 ); s3 = mm128_bswap_32( s3 );
s4 = mm128_bswap_32( s4 ); s4 = mm128_bswap_32( s4 );
#endif
casti_m128i( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 ); casti_m128i( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 ); casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
casti_m128i( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa ); casti_m128i( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
@@ -742,17 +756,18 @@ static inline void extr_lane_8x32( void *d, const void *s,
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{ {
__m128i s0 = casti_m128i( src,0 ); __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
__m128i s1 = casti_m128i( src,1 ); __m128i s0 = casti_m128i( src,0 );
__m128i s2 = casti_m128i( src,2 ); __m128i s1 = casti_m128i( src,1 );
__m128i s3 = casti_m128i( src,3 ); __m128i s2 = casti_m128i( src,2 );
__m128i s4 = casti_m128i( src,4 ); __m128i s3 = casti_m128i( src,3 );
__m128i s4 = casti_m128i( src,4 );
s0 = mm128_bswap_32( s0 ); s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = mm128_bswap_32( s1 ); s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = mm128_bswap_32( s2 ); s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = mm128_bswap_32( s3 ); s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = mm128_bswap_32( s4 ); s4 = _mm_shuffle_epi8( s4, bswap_shuf );
casti_m128i( d, 0 ) = casti_m128i( d, 0 ) =
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0 , 0x00 ); casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0 , 0x00 );
@@ -960,17 +975,18 @@ static inline void extr_lane_16x32( void *d, const void *s,
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{ {
__m128i s0 = casti_m128i( src,0 ); __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
__m128i s1 = casti_m128i( src,1 ); __m128i s0 = casti_m128i( src,0 );
__m128i s2 = casti_m128i( src,2 ); __m128i s1 = casti_m128i( src,1 );
__m128i s3 = casti_m128i( src,3 ); __m128i s2 = casti_m128i( src,2 );
__m128i s4 = casti_m128i( src,4 ); __m128i s3 = casti_m128i( src,3 );
__m128i s4 = casti_m128i( src,4 );
s0 = mm128_bswap_32( s0 ); s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = mm128_bswap_32( s1 ); s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = mm128_bswap_32( s2 ); s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = mm128_bswap_32( s3 ); s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = mm128_bswap_32( s4 ); s4 = _mm_shuffle_epi8( s4, bswap_shuf );
casti_m128i( d, 0 ) = casti_m128i( d, 0 ) =
casti_m128i( d, 1 ) = casti_m128i( d, 1 ) =
@@ -1374,17 +1390,18 @@ static inline void extr_lane_4x64( void *d, const void *s,
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{ {
__m128i s0 = casti_m128i( src, 0 ); __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
__m128i s1 = casti_m128i( src, 1 ); __m128i s0 = casti_m128i( src,0 );
__m128i s2 = casti_m128i( src, 2 ); __m128i s1 = casti_m128i( src,1 );
__m128i s3 = casti_m128i( src, 3 ); __m128i s2 = casti_m128i( src,2 );
__m128i s4 = casti_m128i( src, 4 ); __m128i s3 = casti_m128i( src,3 );
__m128i s4 = casti_m128i( src,4 );
s0 = mm128_bswap_32( s0 ); s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = mm128_bswap_32( s1 ); s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = mm128_bswap_32( s2 ); s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = mm128_bswap_32( s3 ); s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = mm128_bswap_32( s4 ); s4 = _mm_shuffle_epi8( s4, bswap_shuf );
casti_m128i( d, 0 ) = casti_m128i( d, 0 ) =
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x44 ); casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x44 );
@@ -1556,7 +1573,7 @@ static inline void dintrlv_8x64( void *dst0, void *dst1, void *dst2,
__m128i *d3 = (__m128i*)dst3; __m128i *d3 = (__m128i*)dst3;
__m128i *d4 = (__m128i*)dst4; __m128i *d4 = (__m128i*)dst4;
__m128i *d5 = (__m128i*)dst5; __m128i *d5 = (__m128i*)dst5;
__m128i *d6 = (__m128i*)dst5; __m128i *d6 = (__m128i*)dst6;
__m128i *d7 = (__m128i*)dst7; __m128i *d7 = (__m128i*)dst7;
const __m128i* s = (const __m128i*)src; const __m128i* s = (const __m128i*)src;
@@ -1690,17 +1707,18 @@ static inline void extr_lane_8x64( void *d, const void *s,
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{ {
__m128i s0 = casti_m128i( src, 0 ); __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
__m128i s1 = casti_m128i( src, 1 ); __m128i s0 = casti_m128i( src,0 );
__m128i s2 = casti_m128i( src, 2 ); __m128i s1 = casti_m128i( src,1 );
__m128i s3 = casti_m128i( src, 3 ); __m128i s2 = casti_m128i( src,2 );
__m128i s4 = casti_m128i( src, 4 ); __m128i s3 = casti_m128i( src,3 );
__m128i s4 = casti_m128i( src,4 );
s0 = mm128_bswap_32( s0 ); s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = mm128_bswap_32( s1 ); s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = mm128_bswap_32( s2 ); s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = mm128_bswap_32( s3 ); s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = mm128_bswap_32( s4 ); s4 = _mm_shuffle_epi8( s4, bswap_shuf );
casti_m128i( d, 0 ) = casti_m128i( d, 0 ) =
casti_m128i( d, 1 ) = casti_m128i( d, 1 ) =
@@ -1746,7 +1764,6 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
casti_m128i( d, 37 ) = casti_m128i( d, 37 ) =
casti_m128i( d, 38 ) = casti_m128i( d, 38 ) =
casti_m128i( d, 39 ) = _mm_shuffle_epi32( s4, 0xee ); casti_m128i( d, 39 ) = _mm_shuffle_epi32( s4, 0xee );
} }
#endif // AVX512 #endif // AVX512
@@ -1967,6 +1984,68 @@ static inline void rintrlv_4x64_4x32( void *dst, const void *src,
#undef RLEAVE_4x64_4x32 #undef RLEAVE_4x64_4x32
#define RLEAVE_8x64_8x32( i ) do \
{ \
uint32_t *d = (uint32_t*)dst + (i); \
const uint32_t *s = (const uint32_t*)src + (i); \
d[ 0] = s[ 0]; d[ 1] = s[ 2]; d[ 2] = s[ 4]; d[ 3] = s[ 6]; \
d[ 4] = s[ 8]; d[ 5] = s[10]; d[ 6] = s[12]; d[ 7] = s[14]; \
d[ 8] = s[ 1]; d[ 9] = s[ 3]; d[10] = s[ 5]; d[11] = s[ 7]; \
d[12] = s[ 9]; d[13] = s[11]; d[14] = s[13]; d[15] = s[15]; \
} while(0)
// 8x64 -> 8x32
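// RLEAVE_8x64_8x32 rewrites each group of eight interleaved 64 bit lane words
// as their low 32 bit halves (d[0..7]) followed by their high halves (d[8..15]),
// i.e. two consecutive rows of the 8x32 interleave order.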
static inline void rintrlv_8x64_8x32( void *dst, const void *src,
const int bit_len )
{
RLEAVE_8x64_8x32( 0 ); RLEAVE_8x64_8x32( 16 );
RLEAVE_8x64_8x32( 32 ); RLEAVE_8x64_8x32( 48 );
RLEAVE_8x64_8x32( 64 ); RLEAVE_8x64_8x32( 80 );
RLEAVE_8x64_8x32( 96 ); RLEAVE_8x64_8x32( 112 );
RLEAVE_8x64_8x32( 128 ); RLEAVE_8x64_8x32( 144 );
RLEAVE_8x64_8x32( 160 ); RLEAVE_8x64_8x32( 176 );
RLEAVE_8x64_8x32( 192 ); RLEAVE_8x64_8x32( 208 );
RLEAVE_8x64_8x32( 224 ); RLEAVE_8x64_8x32( 240 );
if ( bit_len <= 256 ) return;
RLEAVE_8x64_8x32( 256 ); RLEAVE_8x64_8x32( 272 );
RLEAVE_8x64_8x32( 288 ); RLEAVE_8x64_8x32( 304 );
RLEAVE_8x64_8x32( 320 ); RLEAVE_8x64_8x32( 336 );
RLEAVE_8x64_8x32( 352 ); RLEAVE_8x64_8x32( 368 );
RLEAVE_8x64_8x32( 384 ); RLEAVE_8x64_8x32( 400 );
RLEAVE_8x64_8x32( 416 ); RLEAVE_8x64_8x32( 432 );
RLEAVE_8x64_8x32( 448 ); RLEAVE_8x64_8x32( 464 );
RLEAVE_8x64_8x32( 480 ); RLEAVE_8x64_8x32( 496 );
if ( bit_len <= 512 ) return;
RLEAVE_8x64_8x32( 512 ); RLEAVE_8x64_8x32( 528 );
RLEAVE_8x64_8x32( 544 ); RLEAVE_8x64_8x32( 560 );
RLEAVE_8x64_8x32( 576 ); RLEAVE_8x64_8x32( 592 );
RLEAVE_8x64_8x32( 608 ); RLEAVE_8x64_8x32( 624 );
RLEAVE_8x64_8x32( 640 ); RLEAVE_8x64_8x32( 656 );
RLEAVE_8x64_8x32( 672 ); RLEAVE_8x64_8x32( 688 );
RLEAVE_8x64_8x32( 704 ); RLEAVE_8x64_8x32( 720 );
RLEAVE_8x64_8x32( 736 ); RLEAVE_8x64_8x32( 752 );
RLEAVE_8x64_8x32( 768 ); RLEAVE_8x64_8x32( 784 );
RLEAVE_8x64_8x32( 800 ); RLEAVE_8x64_8x32( 816 );
RLEAVE_8x64_8x32( 832 ); RLEAVE_8x64_8x32( 848 );
RLEAVE_8x64_8x32( 864 ); RLEAVE_8x64_8x32( 880 );
RLEAVE_8x64_8x32( 896 ); RLEAVE_8x64_8x32( 912 );
RLEAVE_8x64_8x32( 928 ); RLEAVE_8x64_8x32( 944 );
RLEAVE_8x64_8x32( 960 ); RLEAVE_8x64_8x32( 976 );
RLEAVE_8x64_8x32( 992 ); RLEAVE_8x64_8x32(1008 );
}
#undef RLEAVE_8x64_8x32
// 4x32 -> 4x64 // 4x32 -> 4x64
@@ -2067,7 +2146,7 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
d[13] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] ); d[13] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] );
d[14] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] ); d[14] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] );
d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] ); d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] );
if ( bit_len <= 256 ) return; if ( bit_len <= 512 ) return;
d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] ); d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] );
d[17] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] ); d[17] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] );
d[18] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] ); d[18] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] );
@@ -2189,15 +2268,15 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
#if defined(__SSE4_1__) #if defined(__SSE4_1__)
// No SSE2 implementation. // No SSE2 implementation.
#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f ) //#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f )
#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 ) //#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 )
#endif // SSE4_1 #endif // SSE4_1
#if defined(__AVX2__) #if defined(__AVX2__)
#define mm256_intrlv_blend_128( hi, lo ) _mm256_blend_epi32( hi, lo, 0x0f ) //#define mm256_intrlv_blend_128( hi, lo ) _mm256_blend_epi32( hi, lo, 0x0f )
#define mm256_intrlv_blend_64( hi, lo ) _mm256_blend_epi32( hi, lo, 0x33 ) //#define mm256_intrlv_blend_64( hi, lo ) _mm256_blend_epi32( hi, lo, 0x33 )
#define mm256_intrlv_blend_32( hi, lo ) _mm256_blend_epi32( hi, lo, 0x55 ) #define mm256_intrlv_blend_32( hi, lo ) _mm256_blend_epi32( hi, lo, 0x55 )
// Select lanes of 32 byte hash from 2 sources according to control mask. // Select lanes of 32 byte hash from 2 sources according to control mask.
@@ -2216,4 +2295,18 @@ do { \
#endif // AVX2 #endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
/*
#define mm512_intrlv_blend_128( hi, lo ) \
_mm512_mask_blend_epi32( 0x0f0f, hi, lo )
#define mm512_intrlv_blend_64( hi, lo ) \
_mm512_mask_blend_epi32( 0x3333, hi, lo )
*/
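// Even numbered 32 bit elements are taken from lo, odd numbered elements from
// hi (a set mask bit selects the second operand).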
#define mm512_intrlv_blend_32( hi, lo ) \
_mm512_mask_blend_epi32( 0x5555, hi, lo )
#endif // AVX512
#endif // INTERLEAVE_H__ #endif // INTERLEAVE_H__

View File

@@ -242,7 +242,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
/*
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mm128_ror_64 _mm_ror_epi64 #define mm128_ror_64 _mm_ror_epi64
@@ -251,14 +251,14 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_32 _mm_rol_epi32 #define mm128_rol_32 _mm_rol_epi32
#else #else
*/
#define mm128_ror_64 mm128_ror_var_64 #define mm128_ror_64 mm128_ror_var_64
#define mm128_rol_64 mm128_rol_var_64 #define mm128_rol_64 mm128_rol_var_64
#define mm128_ror_32 mm128_ror_var_32 #define mm128_ror_32 mm128_ror_var_32
#define mm128_rol_32 mm128_rol_var_32 #define mm128_rol_32 mm128_rol_var_32
//#endif // AVX512 else #endif // AVX512 else
#define mm128_ror_16( v, c ) \ #define mm128_ror_16( v, c ) \
_mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ) _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )

View File

@@ -233,7 +233,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
_mm256_or_si256( _mm256_slli_epi32( v, c ), \ _mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) ) _mm256_srli_epi32( v, 32-(c) ) )
/*
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// AVX512, control must be 8 bit immediate. // AVX512, control must be 8 bit immediate.
@@ -244,7 +244,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_rol_32 _mm256_rol_epi32 #define mm256_rol_32 _mm256_rol_epi32
#else #else
*/
// No AVX512, use fallback. // No AVX512, use fallback.
@@ -253,7 +253,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_ror_32 mm256_ror_var_32 #define mm256_ror_32 mm256_ror_var_32
#define mm256_rol_32 mm256_rol_var_32 #define mm256_rol_32 mm256_rol_var_32
// #endif // AVX512 else #endif // AVX512 else
#define mm256_ror_16( v, c ) \ #define mm256_ror_16( v, c ) \
_mm256_or_si256( _mm256_srli_epi16( v, c ), \ _mm256_or_si256( _mm256_srli_epi16( v, c ), \
@@ -311,7 +311,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// AVX512 has finer granularity full vector permutes. // AVX512 has finer granularity full vector permutes.
// AVX512 has full vector alignr which might be faster, especially for 32 bit // AVX512 has full vector alignr which might be faster, especially for 32 bit
/*
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mm256_swap_128( v ) _mm256_alignr_epi64( v, v, 2 ) #define mm256_swap_128( v ) _mm256_alignr_epi64( v, v, 2 )
@@ -323,7 +323,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_rol_3x32( v ) _mm256_alignr_epi32( v, v, 5 ) #define mm256_rol_3x32( v ) _mm256_alignr_epi32( v, v, 5 )
#else // AVX2 #else // AVX2
*/
// Swap 128 bit elements in 256 bit vector. // Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) #define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
@@ -354,7 +353,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
m256_const_64( 0x0000000400000003, 0x0000000200000001, \ m256_const_64( 0x0000000400000003, 0x0000000200000001, \
0x0000000000000007, 0x0000000600000005 ) 0x0000000000000007, 0x0000000600000005 )
//#endif // AVX512 else AVX2 #endif // AVX512 else AVX2
// AVX512 can do 16 & 8 bit elements. // AVX512 can do 16 & 8 bit elements.
@@ -423,21 +422,25 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_ror1x32_128( v ) _mm256_shuffle_epi32( v, 0x39 ) #define mm256_ror1x32_128( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_rol1x32_128( v ) _mm256_shuffle_epi32( v, 0x93 ) #define mm256_rol1x32_128( v ) _mm256_shuffle_epi32( v, 0x93 )
// Rotate each 128 bit lane by one 16 bit element.
#define mm256_ror1x16_128( v ) \ #define mm256_ror1x16_128( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x01000f0e0d0c0b0a, \ _mm256_shuffle_epi8( v, \
0x0908070605040302 ) ) m256_const_64( 0x11101f1e1d1c1b1a, 0x1918171615141312, \
#define mm256_rol1x16_128( v ) \ 0x01000f0e0d0c0b0a, 0x0908070605040302 ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080706, \
0x0504030201000f0e ) ) #define mm256_rol1x16_128( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1d1c1b1a19181716, 0x1514131211101f1e, \
0x0d0c0b0a09080706, 0x0504030201000f0e ) )
// Rotate each 128 bit lane by one byte
#define mm256_ror1x8_128( v ) \ #define mm256_ror1x8_128( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x000f0e0d0c0b0a09, \ _mm256_shuffle_epi8( v, \
0x0807060504030201 ) ) m256_const_64( 0x101f1e1d1c1b1a19, 0x1817161514131211, \
0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
#define mm256_rol1x8_128( v ) \ #define mm256_rol1x8_128( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080f0e, \ _mm256_shuffle_epi8( v, \
0x0504030201000706 ) ) m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
// Rotate each 128 bit lane by c bytes. // Rotate each 128 bit lane by c bytes.
#define mm256_bror_128( v, c ) \ #define mm256_bror_128( v, c ) \
@@ -451,50 +454,65 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 ) #define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_ror1x16_64( v ) \ #define mm256_ror1x16_64( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x09080f0e0d0c0b0a, \ _mm256_shuffle_epi8( v, \
0x0100070605040302 ) ) m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#define mm256_rol1x16_64( v ) \ #define mm256_rol1x16_64( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080f0e, \ _mm256_shuffle_epi8( v, \
0x0504030201000706 ) ) m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
#define mm256_ror1x8_64( v ) \ #define mm256_ror1x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x080f0e0d0c0b0a09, \ _mm256_shuffle_epi8( v, \
0x0007060504030201 ) ) m256_const_64( 0x181f1e1d1c1b1a19, 0x1017161514131211, \
0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
#define mm256_rol1x8_64( v ) \ #define mm256_rol1x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x0e0d0c0b0a09080f, \ _mm256_shuffle_epi8( v, \
0x0605040302010007 ) ) m256_const_64( 0x1e1d1c1b1a19181f, 0x1615141312111017, \
0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
#define mm256_ror3x8_64( v ) \ #define mm256_ror3x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x0a09080f0e0d0c0b, \ _mm256_shuffle_epi8( v, \
0x0201000706050403 ) ) m256_const_64( 0x1a19181f1e1d1c1b, 0x1211101716151413, \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#define mm256_rol3x8_64( v ) \ #define mm256_rol3x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x0c0b0a09080f0e0d, \ _mm256_shuffle_epi8( v, \
0x0403020100070605 ) ) m256_const_64( 0x1c1b1a19181f1e1d, 0x1413121110171615, \
0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
// Swap 16 bit elements in each 32 bit lane // Swap 16 bit elements in each 32 bit lane
#define mm256_swap16_32( v ) \ #define mm256_swap16_32( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x0b0a09080f0e0d0c, \ _mm256_shuffle_epi8( v, \
0x0302010007060504 ) ) m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \
0x0b0a09080f0e0d0c, 0x0302010007060504 ) )
// //
// Swap bytes in vector elements, endian bswap. // Swap bytes in vector elements, endian bswap.
#define mm256_bswap_64( v ) \ #define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x08090a0b0c0d0e0f, \ _mm256_shuffle_epi8( v, \
0x0001020304050607 ) ) m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
#define mm256_bswap_32( v ) \ #define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x0c0d0e0f08090a0b, \ _mm256_shuffle_epi8( v, \
0x0405060700010203 ) ) m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
#define mm256_bswap_16( v ) \ #define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( 0x0e0f0c0d0a0b0809, \ _mm256_shuffle_epi8( v, \
0x0607040502030001 ) ) m256_const_64( 0x1e1f1c1d1a1b1819, 0x1617141512131011, \
0x0e0f0c0d0a0b0809, 0x0607040502030001, ) )
// Source and destination are pointers, may point to same memory. // Source and destination are pointers, may point to same memory.
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes // 8 byte qword * 8 qwords * 4 lanes = 256 bytes
#define mm256_block_bswap_64( d, s ) do \ #define mm256_block_bswap_64( d, s ) do \
{ \ { \
__m256i ctl = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ __m256i ctl = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \ casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \ casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \ casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -508,7 +526,8 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes // 4 byte dword * 8 dwords * 8 lanes = 256 bytes
#define mm256_block_bswap_32( d, s ) do \ #define mm256_block_bswap_32( d, s ) do \
{ \ { \
__m256i ctl = m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ __m256i ctl = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \ casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \ casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \ casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \

View File

@@ -90,7 +90,7 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
// Equivalent of set4, broadcast 256 bits in groups of four 64 bit constants // Equivalent of set4, broadcast 256 bits in groups of four 64 bit constants
// to all 256 bit lanes: {i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0}. // to all 256 bit lanes: {i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0}.
static inline __m512i mm512_const4_64( const uint64_t i3, const uint64_t i2, static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 ) const uint64_t i1, const uint64_t i0 )
{ {
__m256i lo = mm256_mov64_256( i0 ); __m256i lo = mm256_mov64_256( i0 );
@@ -105,7 +105,7 @@ static inline __m512i mm512_const4_64( const uint64_t i3, const uint64_t i2,
// Broadcast 128 bits in pairs of 64 bit constants {i1. i0} to all // Broadcast 128 bits in pairs of 64 bit constants {i1. i0} to all
// 128 bit lanes. // 128 bit lanes.
#define mm512_const2_64( i1, i0 ) \ #define m512_const2_64( i1, i0 ) \
_mm512_permutex_epi64( _mm512_castsi128_si512( \ _mm512_permutex_epi64( _mm512_castsi128_si512( \
m128_const_64( i1, i0 ) ), 0x44 ) m128_const_64( i1, i0 ) ), 0x44 )
@@ -132,7 +132,7 @@ static inline __m512i mm512_const4_64( const uint64_t i3, const uint64_t i2,
#define m512_one_16 _mm512_broadcastw_epi16( mm128_mov64_128( 1 ) ) #define m512_one_16 _mm512_broadcastw_epi16( mm128_mov64_128( 1 ) )
#define m512_one_8 _mm512_broadcastb_epi8 ( mm128_mov64_128( 1 ) ) #define m512_one_8 _mm512_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
#define m512_neg1 mm512_const1_64( 0xffffffffffffffff ) #define m512_neg1 m512_const1_64( 0xffffffffffffffff )
/* /*
// EVEX vcmpeqq returns a bit mask instead of a vector // EVEX vcmpeqq returns a bit mask instead of a vector
@@ -173,6 +173,19 @@ static inline __m512i mm512_neg1_fn()
// returns p+o as pointer to vector // returns p+o as pointer to vector
#define casto_m512i(p,o) (((__m512i*)(p))+(o)) #define casto_m512i(p,o) (((__m512i*)(p))+(o))
//
// Memory functions
// n = number of 512 bit (64 byte) vectors
static inline void memset_zero_512( __m512i *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = m512_zero; }
static inline void memset_512( __m512i *dst, const __m512i a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
// Sum 4 values, fewer dependencies than sequential addition. // Sum 4 values, fewer dependencies than sequential addition.
@@ -189,7 +202,7 @@ static inline __m512i mm512_neg1_fn()
_mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) ) _mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) )
#define mm512_xor4( a, b, c, d ) \ #define mm512_xor4( a, b, c, d ) \
_mm512_xor_si512( _mm512_xor_si256( a, b ), _mm512_xor_si256( c, d ) ) _mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) )
@@ -212,6 +225,11 @@ static inline __m512i mm512_neg1_fn()
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32 // _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
// //
#define mm512_ror_64 _mm512_ror_epi64
#define mm512_rol_64 _mm512_rol_epi64
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
#define mm512_ror_var_64( v, c ) \ #define mm512_ror_var_64( v, c ) \
_mm512_or_si512( _mm512_srli_epi64( v, c ), \ _mm512_or_si512( _mm512_srli_epi64( v, c ), \
_mm512_slli_epi64( v, 64-(c) ) ) _mm512_slli_epi64( v, 64-(c) ) )
@@ -249,22 +267,34 @@ static inline __m512i mm512_neg1_fn()
// Swap bytes in vector elements, vectorized endian conversion. // Swap bytes in vector elements, vectorized endian conversion.
#define mm512_bswap_64( v ) \ #define mm512_bswap_64( v ) \
_mm512_shuffle_epi8( v, m512_const2_64( \ _mm512_shuffle_epi8( v, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ))
#define mm512_bswap_32( v ) \ #define mm512_bswap_32( v ) \
_mm512_shuffle_epi8( v, m512_const2_64( \ _mm512_shuffle_epi8( v, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x0c0d0e0f08090a0b, 0x0405060700010203, \
0x1c1d1e1f18191a1b, 0x1415161710111213 ) )
#define mm512_bswap_16( v ) \ #define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, m512_const2_64( \ _mm512_shuffle_epi8( v, \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) m512_const_64( 0x3e3f3c3d3a3b3839, 0x3637343532333031, \
0x2e2f2c2d2a2b2829, 0x2627242522232021, \
0x1e1f1c1d1a1b1819, 0x1617141512131011, \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )
 // Source and destination are pointers, may point to same memory.
 // 8 lanes of 64 bytes each
 #define mm512_block_bswap_64( d, s ) do \
 { \
-   __m512i ctl = m512_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+   __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
+                                0x28292a2b2c2d2e2f, 0x2021222324252627, \
+                                0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
    casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
    casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
    casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -278,7 +308,10 @@ static inline __m512i mm512_neg1_fn()
 // 16 lanes of 32 bytes each
 #define mm512_block_bswap_32( d, s ) do \
 { \
-   __m512i ctl = m512_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+   __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
+                                0x2c2d2e2f28292a2b, 0x2425262720212223, \
+                                0x0c0d0e0f08090a0b, 0x0405060700010203, \
+                                0x1c1d1e1f18191a1b, 0x1415161710111213 ); \
    casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
    casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
    casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
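
The block forms load the shuffle control once and reuse it across eight 512-bit registers, byte-swapping 512 bytes of interleaved lane data with source and destination allowed to alias. A plain-C reference for what mm512_block_bswap_64 computes, handy when checking results (a sketch under the assumption of a GCC/Clang build; the function name is hypothetical):

    #include <stdint.h>
    #include <stddef.h>

    /* Scalar reference for mm512_block_bswap_64: byte-swap 64 consecutive
       64-bit words (8 vectors x 8 lanes); d and s may point to the same buffer. */
    static inline void block_bswap_64_ref( uint64_t *d, const uint64_t *s )
    {
       for ( size_t i = 0; i < 64; i++ )
          d[i] = __builtin_bswap64( s[i] );
    }
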
@@ -381,6 +414,8 @@ static inline __m512i mm512_neg1_fn()
 #define mm512_ror1x64_256( v ) _mm512_permutex_epi64( v, 0x39 )
 #define mm512_rol1x64_256( v ) _mm512_permutex_epi64( v, 0x93 )
+/* Need to fix
 // Rotate 256 bit lanes by one 32 bit element
 #define mm512_ror1x32_256( v ) \
    _mm512_permutexvar_epi32( m512_const4_64( \
@@ -411,7 +446,7 @@ static inline __m512i mm512_neg1_fn()
    _mm512_shuffle_epi8( v, m512_const4_64( \
                   0x1e1d1c1b1a191817, 0x161514131211100f, \
                   0x0e0d0c0b0a090807, 0x060504030201001f ), v )
+*/
 //
 // Rotate elements within 128 bit lanes of 512 bit vector.
@@ -422,6 +457,7 @@ static inline __m512i mm512_neg1_fn()
 #define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 )
 #define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 )
+/*
 #define mm512_ror1x16_128( v ) \
    _mm512_permutexvar_epi16( m512_const2_64( \
                   0x0000000700060005, 0x0004000300020001 ), v )
@@ -437,6 +473,7 @@ static inline __m512i mm512_neg1_fn()
 #define mm512_rol1x8_128( v ) \
    _mm512_shuffle_epi8( v, m512_const2_64( \
                   0x0e0d0c0b0a090807, 0x060504030201000f ) )
+*/
 // Rotate 128 bit lanes by c bytes.
 #define mm512_bror_128( v, c ) \


@@ -18,14 +18,47 @@
 #ifndef WIN32
+// 1035g1: /sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input
+// 1035g1: /sys/class/hwmon/hwmon1/temp1_input wrong temp
+// ryzen has no /sys/devices/platform/coretemp.0
+// ryzen: /sys/class/hwmon/hwmon0
+// 2400: /sys/class/hwmon/hwmon0/temp1_input incorrect temp
+// 2400 has no /sys/class/hwmon/hwmon2/temp1_input
+// 2400 /sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input ok
+// 6700 /sys/devices/platform/coretemp.0/hwmon/hwmon2/temp1_input
+// 6700 /sys/class/hwmon/hwmon2/temp1_input
+// /sys/devices/platform/coretemp.0/hwmon/hwmon0/temp2_input never exists
+// /sys/class/hwmon/hwmon0/temp2_input doesn't exist or shows wrong temp (sys16)
+// /sys/class/hwmon/hwmon0/device/temp1_input doesn't exist
+// the first 3 will find i5-2400, i7-6700k, r7-1700, i5-1035g1.
+// The others are left in for legacy, some should probably be removed.
+#define HWMON_PATH1 \
+ "/sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input"
+#define HWMON_PATH2 \
+ "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"
+#define HWMON_PATH3 \
+ "/sys/class/hwmon/hwmon0/temp1_input"
 #define HWMON_PATH \
  "/sys/class/hwmon/hwmon2/temp1_input"
+/*
 #define HWMON_ALT \
  "/sys/class/hwmon/hwmon0/temp1_input"
 #define HWMON_ALT1 \
  "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"
+*/
+// This shows wrong temp on i5-1035g1
 #define HWMON_ALT2 \
  "/sys/class/hwmon/hwmon1/temp1_input"
+// None of these work on any of the cpus above.
 #define HWMON_ALT3 \
  "/sys/devices/platform/coretemp.0/hwmon/hwmon0/temp2_input"
 #define HWMON_ALT4 \
@@ -33,16 +66,28 @@
 #define HWMON_ALT5 \
  "/sys/class/hwmon/hwmon0/device/temp1_input"
 static inline float linux_cputemp(int core)
 {
    float tc = 0.0;
-   FILE *fd = fopen(HWMON_PATH, "r");
+   FILE *fd;
    uint32_t val = 0;
-   if (!fd)
-      fd = fopen(HWMON_ALT, "r");
+   fd = fopen(HWMON_PATH1, "r");
+   if (!fd)
+      fd = fopen(HWMON_PATH2, "r");
+   if (!fd)
+      fd = fopen(HWMON_PATH3, "r");
+   if (!fd)
+      fd = fopen(HWMON_PATH, "r");
    if (!fd)
+//    fd = fopen(HWMON_ALT1, "r");
+//    if (!fd)
       fd = fopen(HWMON_ALT2, "r");
    if (!fd)
@@ -52,14 +97,14 @@ static inline float linux_cputemp(int core)
       fd = fopen(HWMON_ALT4, "r");
    if (!fd)
       fd = fopen(HWMON_ALT5, "r");
    if (!fd)
       return tc;
-   if (fscanf(fd, "%d", &val))
+   if ( fscanf( fd, "%d", &val ) )
       tc = val / 1000.0;
-   fclose(fd);
+   fclose( fd );
    return tc;
 }
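
The probing order introduced above tries the coretemp platform paths first and only then the generic hwmon nodes, because (as the new comments record) hwmon numbering varies by CPU and some nodes report the wrong sensor. A compact sketch of the same first-readable-path-wins idea using a path table; the helper name is hypothetical and this is not the project's function:

    #include <stdio.h>
    #include <stdint.h>

    /* Try candidate sysfs nodes in order; return degrees C from the first readable one. */
    static float cputemp_from_first_path( void )
    {
       static const char *paths[] = {
          "/sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input",
          "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input",
          "/sys/class/hwmon/hwmon0/temp1_input",
          "/sys/class/hwmon/hwmon2/temp1_input",
       };
       for ( size_t i = 0; i < sizeof(paths) / sizeof(paths[0]); i++ )
       {
          FILE *fd = fopen( paths[i], "r" );
          uint32_t val = 0;
          if ( !fd ) continue;
          if ( fscanf( fd, "%u", &val ) == 1 )
          {
             fclose( fd );
             return val / 1000.0f;   /* sysfs reports millidegrees */
          }
          fclose( fd );
       }
       return 0.0f;
    }
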
@@ -296,7 +341,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
 // EXTENDED_FEATURES ECX
 #define AVX512VBMI_Flag  (1<<1)
 #define AVX512VBMI2_Flag (1<<6)
-#define AVX512VAES_Flag  (1<<9)
+#define VAES_Flag        (1<<9)
 // Use this to detect presence of feature
@@ -418,14 +463,14 @@ static inline bool has_avx512()
 #endif
 }
-static inline bool has_avx512vaes()
+static inline bool has_vaes()
 {
 #ifdef __arm__
    return false;
 #else
    int cpu_info[4] = { 0 };
    cpuid( EXTENDED_FEATURES, cpu_info );
-   return cpu_info[ ECX_Reg ] & AVX512VAES_Flag;
+   return cpu_info[ ECX_Reg ] & VAES_Flag;
 #endif
 }
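
The renamed has_vaes() checks bit 9 of ECX from the extended-features CPUID leaf (EAX=7, ECX=0), which is the VAES bit; only the flag's name changes in this commit. A self-contained sketch of the same check using GCC's cpuid helper, assuming an x86 GCC/Clang build (illustrative only; not the project's function):

    #include <stdbool.h>
    #include <cpuid.h>   /* __get_cpuid_count, GCC/Clang */

    /* True if the CPU reports VAES: CPUID leaf 7, subleaf 0, ECX bit 9. */
    static inline bool cpu_has_vaes( void )
    {
       unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
       if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) )
          return false;
       return ( ecx & ( 1u << 9 ) ) != 0;
    }
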


@@ -41,27 +41,22 @@ make -j 16
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-zen.exe
-#make clean || echo clean
-#CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $CONFIGURE_ARGS
-#make
-#strip -s cpuminer.exe
-#mv cpuminer.exe release/cpuminer-avx-sha.exe
+# mingw won't compile avx512 without -fno-asynchronous-unwind-tables
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-asynchronous-unwind-tables" ./configure $CONFIGURE_ARGS
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe release/cpuminer-avx512.exe
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $CONFIGURE_ARGS
+# GCC 9 doesn't include AES in core-avx2
+CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure $CONFIGURE_ARGS
 make -j 16
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2.exe
-#make clean || echo clean
-#rm -f config.status
-#CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
-#make -j
-#strip -s cpuminer.exe
-#mv cpuminer.exe release/cpuminer-aes-sha.exe
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $CONFIGURE_ARGS

winbuild-cross.sh.bak (new executable file)

@@ -0,0 +1,103 @@
#!/bin/bash
#
# Script for building Windows binaries release package using mingw.
# Requires a custom mingw environment, not intended for users.
#
# Compiles Windows EXE files for selected CPU architectures, copies them
# as well as some DLLs that aren't available in most Windows environments
# into a release folder ready to be zipped and uploaded.
# define some local variables
export LOCAL_LIB="$HOME/usr/lib"
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
# make link to local gmp header file.
ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
# edit configure to fix pthread lib name for Windows.
#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
# make release directory and copy selected DLLs.
mkdir release
cp README.txt release/
cp README.md release/
cp RELEASE_NOTES release/
cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-zen.exe
#make clean || echo clean
#CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $CONFIGURE_ARGS
#make
#strip -s cpuminer.exe
#mv cpuminer.exe release/cpuminer-avx-sha.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $CONFIGURE_ARGS
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2.exe
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
#make -j
#strip -s cpuminer.exe
#mv cpuminer.exe release/cpuminer-aes-sha.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $CONFIGURE_ARGS
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx.exe
# -march=westmere is supported in gcc5
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS
#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-sse42.exe
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=corei7 -Wall" ./configure $CONFIGURE_ARGS
#make
#strip -s cpuminer.exe
#mv cpuminer.exe release/cpuminer-sse42.exe
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=core2 -Wall" ./configure $CONFIGURE_ARGS
#make
#strip -s cpuminer.exe
#mv cpuminer.exe release/cpuminer-ssse3.exe
#make clean || echo clean
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse2.exe
make clean || echo clean