v3.10.0

2026-02-22 16:33:08 +00:00 · 2019-12-03 12:26:11 -05:00
parent 91ec6f1771
commit 40039386a0
58 changed files with 3372 additions and 1920 deletions
--- a/14
+++ b/14
@@ -24,18 +24,10 @@ be installed manually. There may be others, read the error messages they
 will give a clue as to the missing package.

 The following command should install everything you need on Debian based
-distributions such as Ubuntu:
+distributions such as Ubuntu. Fedora and other distributions may use
+different names for the equivalent packages.

-sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev
-
-build-essential  (Development Tools package group on Fedora)
-automake
-libjansson-dev
-libgmp-dev
-libcurl4-openssl-dev
-libssl-dev
-lib-thread
-zlib1g-dev
+sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev

 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
 openssl 1.1.0e or higher. Add one of the following, depending on the
--- a/5
+++ b/5
@@ -22,14 +22,13 @@ Step by step...

 Refer to Linux compile instructions and install required packages.

-Additionally, install mingw-64.
+Additionally, install mingw-w64.

 sudo apt-get install mingw-w64


 2. Create a local library directory for packages to be compiled in the next
-   step. Recommended location is $HOME/usr/lib/
-
+   step. Suggested location is $HOME/usr/lib/

 3. Download and build other packages for mingw that don't have a mingw64
   version available in the repositories.
--- a/Makefile.am
+++ b/Makefile.am
@@ -174,7 +174,6 @@ cpuminer_SOURCES = \
  algo/sha/sph_sha2big.c \
  algo/sha/sha256-hash-4way.c \
  algo/sha/sha512-hash-4way.c \
-  algo/sha/sha256_hash_11way.c \
  algo/sha/sha2.c \
  algo/sha/sha256t-gate.c \
  algo/sha/sha256t-4way.c \
@@ -198,7 +197,6 @@ cpuminer_SOURCES = \
  algo/skein/skein-gate.c \
  algo/skein/skein2.c \
  algo/skein/skein2-4way.c \
-  algo/skein/skein2-gate.c \
  algo/sm3/sm3.c \
  algo/sm3/sm3-hash-4way.c \
  algo/swifftx/swifftx.c \
--- a/README.txt
+++ b/README.txt
@@ -29,6 +29,7 @@ cpuminer-sse2.exe      "-msse2"                  Core2, Nehalem
 cpuminer-aes-sse42.exe "-march=westmere"         Westmere
 cpuminer-avx.exe       "-march=corei7-avx"       Sandy-Ivybridge
 cpuminer-avx2.exe      "-march=core-avx2"        Haswell, Sky-Kaby-Coffeelake
+cpuminer-avx512.exe    "-march=skylake-avx512"   Skylake-X, Cascadelake-X
 cpuminer-zen           "-march=znver1"           AMD Ryzen, Threadripper

 If you like this software feel free to donate:
--- a/20
+++ b/20
@@ -31,6 +31,26 @@ FreeBSD YMMV.
 Change Log
 ----------

+v3.10.0
+
+AVX-512 is now supported on selected algos; a Windows binary is now available.
+AVX-512 optimizations are available for argon2d, blake2s, keccak, keccakc,
+skein & skein2.
+
+Fixed CPU temperature for some CPU models (Linux only).
+
+Fixed a bug that caused some lanes not to submit shares.
+
+Fixed some previously undetected buffer overflows.
+
+Lyra2rev2 3% faster SSE2 and AVX2.
+
+Added "-fno-asynchronous-unwind-tables" to AVX512 build script for Windows
+to fix a known mingw issue.
+
+Changed AVX2 build script to explicitly add AES to address a change in
+behaviour in GCC 9.
+
 v3.9.11

 Added x22i & x25x algos.
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -59,7 +59,6 @@ extern "C"{
 typedef struct {
   unsigned char buf[64<<2];
   uint32_t H[8<<2];
-   uint32_t S[4<<2];
 //   __m128i buf[16] __attribute__ ((aligned (64)));
 //   __m128i H[8];
 //   __m128i S[4];    
@@ -93,7 +92,6 @@ void blake256r8_4way_close(void *cc, void *dst);
 typedef struct {
   __m256i buf[16] __attribute__ ((aligned (64)));
   __m256i H[8];
-   __m256i S[4];
   size_t ptr;
   sph_u32 T0, T1;
   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -304,16 +304,17 @@ static const sph_u32 CS[16] = {

 #endif

+// Blake-256 4 way

 #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
-   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                                   _mm_set1_epi32( c1 ), m0 ), b ), a ); \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), \
+                      _mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \
   d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
-   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                                   _mm_set1_epi32( c0 ), m1 ), b ), a ); \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), \
+                      _mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \
   d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
@@ -321,7 +322,8 @@ do { \

 #if SPH_COMPACT_BLAKE_32

-// Blake-256 4 way
+// Not used
+#if 0

 #define ROUND_S_4WAY(r)   do { \
 	GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
@@ -342,6 +344,8 @@ do { \
 		CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
 } while (0)

+#endif
+
 #else

 #define ROUND_S_4WAY(r)   do { \
@@ -359,7 +363,6 @@ do { \

 #define DECL_STATE32_4WAY \
 	__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
-	__m128i S0, S1, S2, S3; \
        uint32_t T0, T1;

 #define READ_STATE32_4WAY(state)   do { \
@@ -371,10 +374,6 @@ do { \
 		H5 = casti_m128i( state->H, 5 ); \
 		H6 = casti_m128i( state->H, 6 ); \
 		H7 = casti_m128i( state->H, 7 ); \
-		S0 = casti_m128i( state->S, 0 ); \
-		S1 = casti_m128i( state->S, 1 ); \
-		S2 = casti_m128i( state->S, 2 ); \
-		S3 = casti_m128i( state->S, 3 ); \
 		T0 = (state)->T0; \
 		T1 = (state)->T1; \
 	} while (0)
@@ -388,17 +387,13 @@ do { \
 		casti_m128i( state->H, 5 ) = H5; \
 		casti_m128i( state->H, 6 ) = H6; \
 		casti_m128i( state->H, 7 ) = H7; \
-		casti_m128i( state->S, 0 ) = S0; \
-		casti_m128i( state->S, 1 ) = S1; \
-		casti_m128i( state->S, 2 ) = S2; \
-		casti_m128i( state->S, 3 ) = S3; \
 		(state)->T0 = T0; \
 		(state)->T1 = T1; \
 	} while (0)

 #if SPH_COMPACT_BLAKE_32
 // not used
-
+#if 0
 #define COMPRESS32_4WAY( rounds )   do { \
 	__m128i M[16]; \
 	__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -441,6 +436,7 @@ do { \
        H7 = _mm_xor_si128( _mm_xor_si128( \
                                   _mm_xor_si128( S3, V7 ), VF ), H7 ); \
 	} while (0)
+#endif

 #else

@@ -508,10 +504,10 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = _mm_xor_si128( S0, m128_const1_64( 0x243F6A88243F6A88 ) ); \
-   V9 = _mm_xor_si128( S1, m128_const1_64( 0x85A308D385A308D3 ) ); \
-   VA = _mm_xor_si128( S2, m128_const1_64( 0x13198A2E13198A2E ) ); \
-   VB = _mm_xor_si128( S3, m128_const1_64( 0x0370734403707344 ) ); \
+   V8 = m128_const1_64( 0x243F6A88243F6A88 ); \
+   V9 = m128_const1_64( 0x85A308D385A308D3 ); \
+   VA = m128_const1_64( 0x13198A2E13198A2E ); \
+   VB = m128_const1_64( 0x0370734403707344 ); \
   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
                           m128_const1_64( 0xA4093822A4093822 ) ); \
   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
@@ -538,14 +534,14 @@ do { \
      ROUND_S_4WAY(2); \
      ROUND_S_4WAY(3); \
   } \
-   H0 = mm128_xor4( V8, V0, S0, H0 ); \
-   H1 = mm128_xor4( V9, V1, S1, H1 ); \
-   H2 = mm128_xor4( VA, V2, S2, H2 ); \
-   H3 = mm128_xor4( VB, V3, S3, H3 ); \
-   H4 = mm128_xor4( VC, V4, S0, H4 ); \
-   H5 = mm128_xor4( VD, V5, S1, H5 ); \
-   H6 = mm128_xor4( VE, V6, S2, H6 ); \
-   H7 = mm128_xor4( VF, V7, S3, H7 ); \
+   H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \
+   H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \
+   H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \
+   H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \
+   H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \
+   H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
+   H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
+   H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
 } while (0)

 #endif
@@ -556,13 +552,13 @@ do { \

 #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-                 _mm256_set1_epi32( c1 ), m0 ), b ), a ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                         _mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-                 _mm256_set1_epi32( c0 ), m1 ), b ), a ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                         _mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
@@ -581,7 +577,6 @@ do { \

 #define DECL_STATE32_8WAY \
   __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
-   __m256i S0, S1, S2, S3; \
   sph_u32 T0, T1;

 #define READ_STATE32_8WAY(state) \
@@ -594,10 +589,6 @@ do { \
   H5 = (state)->H[5]; \
   H6 = (state)->H[6]; \
   H7 = (state)->H[7]; \
-   S0 = (state)->S[0]; \
-   S1 = (state)->S[1]; \
-   S2 = (state)->S[2]; \
-   S3 = (state)->S[3]; \
   T0 = (state)->T0; \
   T1 = (state)->T1; \
 } while (0)
@@ -612,10 +603,6 @@ do { \
   (state)->H[5] = H5; \
   (state)->H[6] = H6; \
   (state)->H[7] = H7; \
-   (state)->S[0] = S0; \
-   (state)->S[1] = S1; \
-   (state)->S[2] = S2; \
-   (state)->S[3] = S3; \
   (state)->T0 = T0; \
   (state)->T1 = T1; \
 } while (0)
@@ -635,10 +622,10 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = _mm256_xor_si256( S0, m256_const1_64( 0x243F6A88243F6A88 ) ); \
-   V9 = _mm256_xor_si256( S1, m256_const1_64( 0x85A308D385A308D3 ) ); \
-   VA = _mm256_xor_si256( S2, m256_const1_64( 0x13198A2E13198A2E ) ); \
-   VB = _mm256_xor_si256( S3, m256_const1_64( 0x0370734403707344 ) ); \
+   V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
+   V9 = m256_const1_64( 0x85A308D385A308D3 ); \
+   VA = m256_const1_64( 0x13198A2E13198A2E ); \
+   VB = m256_const1_64( 0x0370734403707344 ); \
   VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
                              m256_const1_64( 0xA4093822A4093822 ) ); \
   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
@@ -682,14 +669,14 @@ do { \
      ROUND_S_8WAY(2); \
      ROUND_S_8WAY(3); \
   } \
-   H0 = mm256_xor4( V8, V0, S0, H0 ); \
-   H1 = mm256_xor4( V9, V1, S1, H1 ); \
-   H2 = mm256_xor4( VA, V2, S2, H2 ); \
-   H3 = mm256_xor4( VB, V3, S3, H3 ); \
-   H4 = mm256_xor4( VC, V4, S0, H4 ); \
-   H5 = mm256_xor4( VD, V5, S1, H5 ); \
-   H6 = mm256_xor4( VE, V6, S2, H6 ); \
-   H7 = mm256_xor4( VF, V7, S3, H7 ); \
+   H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
+   H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
+   H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
+   H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
+   H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
+   H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
+   H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
+   H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
 } while (0)


@@ -703,7 +690,6 @@ static void
 blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
                   const uint32_t *salt, int rounds )
 {
-   __m128i zero = m128_zero;
   casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
   casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
   casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
@@ -712,11 +698,6 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
   casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
   casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
   casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );
-
-   casti_m128i( ctx->S, 0 ) = zero;
-   casti_m128i( ctx->S, 1 ) = zero;
-   casti_m128i( ctx->S, 2 ) = zero;
-   casti_m128i( ctx->S, 3 ) = zero;
   ctx->T0 = ctx->T1 = 0;
   ctx->ptr = 0;
   ctx->rounds = rounds;
@@ -824,7 +805,6 @@ static void
 blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
                   const sph_u32 *salt, int rounds )
 {
-   __m256i zero = m256_zero;
   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
@@ -833,10 +813,6 @@ blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
-   casti_m256i( sc->S, 0 ) = zero;
-   casti_m256i( sc->S, 1 ) = zero;
-   casti_m256i( sc->S, 2 ) = zero;
-   casti_m256i( sc->S, 3 ) = zero;
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
   sc->rounds = rounds;
--- a/algo/blake/blake2b-4way.c
+++ b/algo/blake/blake2b-4way.c
@@ -4,13 +4,59 @@
 */

 #include "blake2b-gate.h"
-
-#if defined(BLAKE2B_4WAY)
-
 #include <string.h>
 #include <stdint.h>
 #include "blake2b-hash-4way.h"

+#if defined(BLAKE2B_8WAY)
+
+int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{  // Hashes 8 consecutive nonces per pass and submits every lane that passes fulltest.
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));   // 8x64 interleaved: 4 words x 8 lanes
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[49]);   // 3*16+1: high 32 bits of 64-bit word 3, lane 0
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+
+   uint32_t n = first_nonce;
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+   do {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+
+      blake2b_8way_init( &ctx );
+      blake2b_8way_update( &ctx, vdata, 80 );
+      blake2b_8way_final( &ctx, hash );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( hash7[ lane<<1 ] < Htarg )   // stride 2: one 64-bit word per lane
+      {
+          extr_lane_8x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+      }
+      n += 8;
+   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#elif defined(BLAKE2B_4WAY)
+
 // Function not used, code inlined.
 void blake2b_4way_hash(void *output, const void *input)
 {
--- a/algo/blake/blake2b-gate.c
+++ b/algo/blake/blake2b-gate.c
@@ -1,15 +1,19 @@
 #include "blake2b-gate.h"

+
 bool register_blake2b_algo( algo_gate_t* gate )
 {
-#if defined(BLAKE2B_4WAY)
+#if defined(BLAKE2B_8WAY)
+  gate->scanhash  = (void*)&scanhash_blake2b_8way;
+//  gate->hash      = (void*)&blake2b_8way_hash;
+#elif defined(BLAKE2B_4WAY)
  gate->scanhash  = (void*)&scanhash_blake2b_4way;
  gate->hash      = (void*)&blake2b_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_blake2b;
  gate->hash      = (void*)&blake2b_hash;
 #endif
-  gate->optimizations =  AVX2_OPT;
+  gate->optimizations =  AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/blake/blake2b-gate.h
+++ b/algo/blake/blake2b-gate.h
@@ -4,13 +4,21 @@
 #include <stdint.h>
 #include "algo-gate-api.h"

-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define BLAKE2B_8WAY
+#elif defined(__AVX2__)
  #define BLAKE2B_4WAY
 #endif

 bool register_blake2b_algo( algo_gate_t* gate );

-#if defined(BLAKE2B_4WAY)
+#if defined(BLAKE2B_8WAY)
+
+//void blake2b_8way_hash( void *state, const void *input );
+int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(BLAKE2B_4WAY)

 void blake2b_4way_hash( void *state, const void *input );
 int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
--- a/algo/blake/blake2b-hash-4way.c
+++ b/algo/blake/blake2b-hash-4way.c
@@ -33,6 +33,178 @@

 #include "blake2b-hash-4way.h"

+static const uint8_t sigma[12][16] =   // message word order, one row per round; rows 10-11 repeat rows 0-1
+{
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+      { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+      { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+      { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+      { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+      { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+      { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+      { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+      { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+};
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define B2B8W_G(a, b, c, d, x, y) /* mixes v[a],v[b],v[c],v[d] of the caller's v[] with message words x, y */ \
+{ \
+   v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), x ); \
+   v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 32 ); \
+   v[c] = _mm512_add_epi64( v[c], v[d] ); \
+   v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 24 ); \
+   v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), y ); \
+   v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 16 ); \
+   v[c] = _mm512_add_epi64( v[c], v[d] ); \
+   v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
+}
+
+static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )   // compress ctx->b into ctx->h; last != 0 marks the final block
+{  
+   __m512i v[16], m[16];   // v: working state, m: local copy of the message block
+
+   v[ 0] = ctx->h[0];   // first half of v is the current chained state
+   v[ 1] = ctx->h[1];
+   v[ 2] = ctx->h[2];
+   v[ 3] = ctx->h[3];
+   v[ 4] = ctx->h[4];
+   v[ 5] = ctx->h[5];
+   v[ 6] = ctx->h[6];
+   v[ 7] = ctx->h[7];
+   v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );   // second half: same constants blake2b_8way_init loads into h[]
+   v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
+   v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
+   v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
+   v[12] = m512_const1_64( 0x510E527FADE682D1 );
+   v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
+   v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
+   v[15] = m512_const1_64( 0x5BE0CD19137E2179 );
+
+   v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );   // fold in the byte counter, low word
+   v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );   // byte counter, high word
+
+   if ( last )
+      v[14] = mm512_not( v[14] );   // final block: invert v[14]
+
+   m[ 0] = ctx->b[ 0];
+   m[ 1] = ctx->b[ 1];
+   m[ 2] = ctx->b[ 2];
+   m[ 3] = ctx->b[ 3];
+   m[ 4] = ctx->b[ 4];
+   m[ 5] = ctx->b[ 5];
+   m[ 6] = ctx->b[ 6];
+   m[ 7] = ctx->b[ 7];
+   m[ 8] = ctx->b[ 8];
+   m[ 9] = ctx->b[ 9];
+   m[10] = ctx->b[10];
+   m[11] = ctx->b[11];
+   m[12] = ctx->b[12];
+   m[13] = ctx->b[13];
+   m[14] = ctx->b[14];
+   m[15] = ctx->b[15];
+
+   for ( int i = 0; i < 12; i++ )   // 12 rounds, message words selected by sigma[i]
+   {
+      B2B8W_G( 0, 4,  8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
+      B2B8W_G( 1, 5,  9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
+      B2B8W_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
+      B2B8W_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
+      B2B8W_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
+      B2B8W_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
+      B2B8W_G( 2, 7,  8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
+      B2B8W_G( 3, 4,  9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
+   }
+
+   ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );   // h[i] ^= v[i] ^ v[i+8]
+   ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
+   ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
+   ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
+   ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
+   ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
+   ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
+   ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
+}
+
+int blake2b_8way_init( blake2b_8way_ctx *ctx )   // reset ctx for a new hash with outlen = 32; always returns 0
+{
+   size_t i;
+
+   ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );   // same constant broadcast to every 64-bit element
+   ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
+   ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
+   ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
+   ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
+   ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
+   ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
+   ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
+
+   ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );   // low byte 0x20 matches outlen below; presumably the BLAKE2b parameter word - TODO confirm
+
+   ctx->t[0] = 0;
+   ctx->t[1] = 0;
+   ctx->c = 0;
+   ctx->outlen = 32;
+
+   for ( i = 0; i < 16; i++ )   // clear the whole input buffer
+     ctx->b[i] = m512_zero;
+
+   return 0;
+}
+
+
+void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,   // absorb input, read as one __m512i per 8 bytes of inlen
+                          size_t inlen )
+{
+   __m512i* in =(__m512i*)input;
+
+   size_t i, c;
+   c = ctx->c >> 3;   // buffer index: one __m512i per 8 buffered bytes
+
+   for ( i = 0; i < (inlen >> 3); i++ )   // whole 64-bit words only; a trailing partial word is ignored
+   {
+      if ( ctx->c == 128 )   // buffer full: account for it and compress before storing more
+      {
+         ctx->t[0] += ctx->c;
+         if ( ctx->t[0] < ctx->c )   // low counter word wrapped, carry into the high word
+            ctx->t[1]++;
+         blake2b_8way_compress( ctx, 0 );
+         ctx->c = c = 0;   // reset byte count AND buffer index; a stale index would write past b[15]
+      }
+      ctx->b[ c++ ] = in[i];
+      ctx->c += 8;
+   }
+}
+
+void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )   // pad, compress the last block, write h[0..3] to out
+{
+   size_t c;
+   c = ctx->c >> 3;   // index of the first unused buffer word
+
+   ctx->t[0] += ctx->c;   // add the bytes still buffered to the counter
+   if ( ctx->t[0] < ctx->c )
+      ctx->t[1]++;
+
+   while ( ctx->c < 128 )   // zero-fill the rest of the block
+   {
+      ctx->b[c++] = m512_zero;
+      ctx->c += 8;
+   }
+
+   blake2b_8way_compress( ctx, 1 );           // final block flag = 1
+
+   casti_m512i( out, 0 ) = ctx->h[0];   // out receives 4 x __m512i (256 bytes), still interleaved
+   casti_m512i( out, 1 ) = ctx->h[1];
+   casti_m512i( out, 2 ) = ctx->h[2];
+   casti_m512i( out, 3 ) = ctx->h[3];
+}
+
+#endif
+
 #if defined(__AVX2__)

 // G Mixing function.
@@ -61,21 +233,6 @@ static const uint64_t blake2b_iv[8] = {

 static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
 {
-	const uint8_t sigma[12][16] = {
-		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-		{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-		{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-		{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-		{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-		{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-		{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-		{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-		{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-		{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-		{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
-	};
-	int i;
 	__m256i v[16], m[16];

   v[ 0] = ctx->h[0];
@@ -118,7 +275,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
   m[14] = ctx->b[14];
   m[15] = ctx->b[15];
   
-	for ( i = 0; i < 12; i++ )
+	for ( int i = 0; i < 12; i++ )
   { 
 		B2B_G( 0, 4,  8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
 		B2B_G( 1, 5,  9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
--- a/algo/blake/blake2b-hash-4way.h
+++ b/algo/blake/blake2b-hash-4way.h
@@ -2,8 +2,6 @@
 #ifndef __BLAKE2B_HASH_4WAY_H__
 #define __BLAKE2B_HASH_4WAY_H__

-#if defined(__AVX2__)
-
 #include "simd-utils.h"
 #include <stddef.h>
 #include <stdint.h>
@@ -16,14 +14,34 @@
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+ALIGN(128) typedef struct {   // state for the 8-way (__m512i) blake2b functions
+   __m512i b[16]; // input buffer, one 128 byte block per lane
+   __m512i h[8];  // chained state
+   uint64_t t[2];  // total number of bytes, low word then high word
+   size_t c;       // bytes currently buffered in b[], advanced by 8 per __m512i stored
+   size_t outlen;  // digest size, set to 32 by blake2b_8way_init
+} blake2b_8way_ctx;
+
+int blake2b_8way_init( blake2b_8way_ctx *ctx );
+void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+                          size_t inlen );
+void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
+
+#endif
+
+#if defined(__AVX2__)
+
 // state context
-ALIGN(64) typedef struct {
+ALIGN(128) typedef struct {
 	__m256i b[16]; // input buffer
 	__m256i h[8];  // chained state
 	uint64_t t[2];  // total number of bytes
 	size_t c;       // pointer for b[]
 	size_t outlen;  // digest size
-} blake2b_4way_ctx __attribute__((aligned(64)));
+} blake2b_4way_ctx;

 int blake2b_4way_init( blake2b_4way_ctx *ctx );
 void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -3,22 +3,72 @@
 #include <string.h>
 #include <stdint.h>

-#if defined(BLAKE2S_8WAY)
+#if defined(BLAKE2S_16WAY)
+
+static __thread blake2s_16way_state blake2s_16w_ctx;
+
+void blake2s_16way_hash( void *output, const void *input )   // finish the hash from the saved thread-local state
+{
+   blake2s_16way_state ctx;
+   memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );   // work on a copy so the saved state can be reused
+   blake2s_16way_update( &ctx, input + (64<<4), 16 );   // skip the 64 bytes x 16 lanes already absorbed by the scanhash
+   blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );   // output is left interleaved for the caller
+}
+
+int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,   // hashes 16 consecutive nonces per pass
+                            uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[20*16] __attribute__ ((aligned (128)));   // 20 words x 16 lanes
+   uint32_t hash[8*16] __attribute__ ((aligned (64)));   // 8 words x 16 lanes
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<4]);   // start of word 7, indexed by lane below
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
+   uint32_t n = first_nonce;
+   int thr_id = mythr->id;  
+
+   mm512_bswap32_intrlv80_16x32( vdata, pdata );
+   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
+   blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );   // absorb the first 64 bytes once, outside the nonce loop
+
+   do {
+      *noncev = mm512_bswap_32( _mm512_set_epi32(
+	                  n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+	                  n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
+      pdata[19] = n;
+
+      blake2s_16way_hash( hash, vdata );
+
+      for ( int lane = 0; lane < 16; lane++ )
+      if ( unlikely( hash7[lane] <= Htarg ) )   // cheap check on word 7 before the full test
+      {
+         extr_lane_16x32( lane_hash, hash, lane, 256 );
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 16;
+   } while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#elif defined(BLAKE2S_8WAY)

 static __thread blake2s_8way_state blake2s_8w_ctx;

 void blake2s_8way_hash( void *output, const void *input )
 {
-   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
   blake2s_8way_state ctx;
   memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
-
   blake2s_8way_update( &ctx, input + (64<<3), 16 );
-   blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
-
-   dintrlv_8x32( output,     output+ 32, output+ 64, output+ 96,
-                 output+128, output+160, output+192, output+224,
-                 vhash, 256 );
+   blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
 }

 int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
@@ -26,13 +76,15 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id; 

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
@@ -45,16 +97,17 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,

      blake2s_8way_hash( hash, vdata );

-
-      for ( int i = 0; i < 8; i++ )
-      if (  (hash+(i<<3))[7] <= Htarg )
-      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( hash7[lane] <= Htarg ) )
      {
-          pdata[19] = n+i;
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+         }
      }
      n += 8;
-
   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
@@ -67,15 +120,10 @@ static __thread blake2s_4way_state blake2s_4w_ctx;

 void blake2s_4way_hash( void *output, const void *input )
 {
-   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
   blake2s_4way_state ctx;
   memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
-
   blake2s_4way_update( &ctx, input + (64<<2), 16 );
-   blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
-
-   dintrlv_4x32( output, output+32, output+64, output+96,
-		            vhash, 256 );
+   blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
 }

 int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
@@ -83,13 +131,15 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id; 

   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
@@ -101,15 +151,16 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,

      blake2s_4way_hash( hash, vdata );

-      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg )
-      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
      {
-          pdata[19] = n+i;
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+              }
      }
      n += 4;
-
   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -2,7 +2,11 @@

 bool register_blake2s_algo( algo_gate_t* gate )
 {
-#if defined(BLAKE2S_8WAY)
+#if defined(BLAKE2S_16WAY)
+  gate->scanhash  = (void*)&scanhash_blake2s_16way;
+  gate->hash      = (void*)&blake2s_16way_hash;
+#elif defined(BLAKE2S_8WAY)
+//#if defined(BLAKE2S_8WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_8way;
  gate->hash      = (void*)&blake2s_8way_hash;
 #elif defined(BLAKE2S_4WAY)
@@ -12,7 +16,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_blake2s;
  gate->hash      = (void*)&blake2s_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -8,13 +8,26 @@
 #if defined(__SSE2__)
  #define BLAKE2S_4WAY
 #endif
+
 #if defined(__AVX2__)
  #define BLAKE2S_8WAY
 #endif

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define BLAKE2S_16WAY
+#endif
+
 bool register_blake2s_algo( algo_gate_t* gate );

-#if defined(BLAKE2S_8WAY)
+#if defined(BLAKE2S_16WAY)
+
+void blake2s_16way_hash( void *state, const void *input );
+int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined (BLAKE2S_8WAY)
+
+//#if defined(BLAKE2S_8WAY)

 void blake2s_8way_hash( void *state, const void *input );
 int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -165,13 +165,13 @@ do { \
 // 
 // Supported:
 //    64 + 16 bytes  (blake2s with midstate optimization)
-//    80 bytes without midstate (blake2s without midstate optimization)
+//    80 bytes       (blake2s without midstate optimization)
 //    Any multiple of 64 bytes in one shot (x25x)
 //
 // Unsupported:
-//    Stream of 64 byte blocks one at a time.   
-//
-// use for part blocks or when streaming more data
+//    Stream of full 64 byte blocks one at a time.   
+
+// use only when streaming more data or final block not full.
 int blake2s_4way_update( blake2s_4way_state *S, const void *in,
                         uint64_t inlen )
 {
@@ -466,6 +466,168 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )

 #endif // __AVX2__

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// Blake2s-256 16 way
+
+int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
+{  // Mixes one message block (16 vectors, 32-bit lanes) into S->h; returns 0.
+   __m512i m[16];   // message words, one vector per word
+   __m512i v[16];   // working state
+
+   memcpy_512( m, block, 16 );
+   memcpy_512( v, S->h, 8 );   // v[0..7] start as the current chaining value
+   // v[8..15]: constants, with S->t and S->f xored into v[12..15].
+   v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
+   v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
+   v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
+   v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
+   v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
+                          m512_const1_64( 0x510E527F510E527FULL ) );   // counter word 0
+
+   v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
+                          m512_const1_64( 0x9B05688C9B05688CULL ) );   // counter word 1
+
+   v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
+                          m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );   // flag word 0
+
+   v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
+                          m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );   // flag word 1
+   // NOTE(review): m512_const1_64 presumably broadcasts its argument - confirm.
+// G16W: one mixing step on (a,b,c,d) using message words m[sigma0], m[sigma1].
+#define G16W( sigma0, sigma1, a, b, c, d) \
+do { \
+   uint8_t s0 = sigma0; \
+   uint8_t s1 = sigma1; \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s1 ] ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ),  8 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ),  7 ); \
+} while(0)
+// ROUND16W: eight G16W steps, message word order taken from blake2s_sigma[r].
+#define ROUND16W(r)  \
+do { \
+   uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
+   G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
+   G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
+   G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
+   G16W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
+   G16W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
+   G16W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
+   G16W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
+   G16W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
+} while(0)
+   // Ten rounds, each using its own row of blake2s_sigma.
+   ROUND16W( 0 );
+   ROUND16W( 1 );
+   ROUND16W( 2 );
+   ROUND16W( 3 );
+   ROUND16W( 4 );
+   ROUND16W( 5 );
+   ROUND16W( 6 );
+   ROUND16W( 7 );
+   ROUND16W( 8 );
+   ROUND16W( 9 );
+   // Fold both halves of the working state back into the chaining value.
+   for( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
+
+#undef G16W
+#undef ROUND16W
+   return 0;
+}
+
+int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
+{  // Resets S for a new hash with digest_length = outlen; returns 0.
+   blake2s_nway_param P[1];
+   // Parameter block: fanout 1, depth 1, key length 0, zero salt and personal.
+   P->digest_length = outlen;
+   P->key_length    = 0;
+   P->fanout        = 1;
+   P->depth         = 1;
+   P->leaf_length   = 0;
+   *((uint64_t*)(P->node_offset)) = 0;   // NOTE(review): 8-byte store - confirm field layout
+   P->node_depth    = 0;
+   P->inner_length  = 0;
+   memset( P->salt,     0, sizeof( P->salt ) );
+   memset( P->personal, 0, sizeof( P->personal ) );
+   // Clear counters, flags and buffer, then load the initial h constants.
+   memset( S, 0, sizeof( blake2s_16way_state ) );
+   S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
+   S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
+   S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
+   S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
+   S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
+   S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
+   S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
+   S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );
+
+   uint32_t *p = ( uint32_t * )( P );   // parameter block read as 32-bit words
+
+   /* IV XOR ParamBlock */
+   for ( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) );
+   return 0;
+}
+
+int blake2s_16way_update( blake2s_16way_state *S, const void *in,
+                         uint64_t inlen )
+{  // Absorbs inlen bytes per lane of 16-way interleaved input; returns 0.
+   const __m512i *input = (const __m512i*)in;
+   __m512i *buf = (__m512i*)S->buf;
+   const size_t bsize = BLAKE2S_BLOCKBYTES;
+   // Lengths are bytes per lane; one vector holds 4 bytes of every lane.
+   while( inlen > 0 )
+   {
+      const size_t left = S->buflen;
+      const size_t fill = bsize - left;   // bytes needed to complete the block
+      if( inlen >= fill )   // block complete: it is compressed here, not in final
+      {
+         memcpy_512( buf + ( left>>2 ), input, fill>>2 );
+         S->t[0] += BLAKE2S_BLOCKBYTES;
+         S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );   // carry into high word
+         blake2s_16way_compress( S, buf );
+         S->buflen = 0;
+         input += ( fill>>2 );   // advance by what was consumed, not a whole block
+         inlen -= fill;          // fill <= inlen here, so this cannot wrap
+      }
+      else
+      {
+         memcpy_512( buf + ( left>>2 ), input, inlen>>2 );
+         S->buflen += (size_t) inlen;
+         input += ( inlen>>2 );
+         inlen = 0;
+      }
+   }
+   return 0;
+}
+
+int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen )
+{  // Compresses the buffered tail, then stores S->h[0..7] to out; returns 0.
+   __m512i *buf = (__m512i*)S->buf;
+   // NOTE(review): outlen is not read here; all 8 state vectors are stored.
+   S->t[0] += S->buflen;
+   S->t[1] += ( S->t[0] < S->buflen );   // carry into the high counter word
+   if ( S->last_node )
+      S->f[1] = ~0U;
+   S->f[0] = ~0U;   // set before the closing compression
+   // Presumably zeroes the unused remainder of the block buffer - confirm.
+   memset_zero_512( buf + ( S->buflen>>2 ),
+                    ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
+   blake2s_16way_compress( S, buf );
+
+   for ( int i = 0; i < 8; ++i )
+      casti_m512i( out, i ) = S->h[ i ];
+   return 0;
+}
+
+#endif   // AVX512
+
+
 #if 0
 int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
 {
--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -64,7 +64,7 @@ typedef struct __blake2s_nway_param
 ALIGN( 64 ) typedef struct __blake2s_4way_state
 {
   __m128i h[8];
-   uint8_t  buf[ 2 * BLAKE2S_BLOCKBYTES * 4 ];
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 4 ];
   uint32_t t[2];
   uint32_t f[2];
   size_t   buflen;
@@ -75,13 +75,16 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
 int blake2s_4way_update( blake2s_4way_state *S, const void *in,
                         uint64_t inlen );
 int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
+int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
+                              const void *input, uint64_t inlen );
+

 #if defined(__AVX2__)

 ALIGN( 64 ) typedef struct __blake2s_8way_state
 {
   __m256i h[8];
-   uint8_t  buf[ 2 * BLAKE2S_BLOCKBYTES * 8 ];
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 8 ];
   uint32_t t[2];
   uint32_t f[2];
   size_t   buflen;
@@ -92,9 +95,27 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
 int blake2s_8way_update( blake2s_8way_state *S, const void *in,
                         uint64_t inlen );
 int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
-int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
-                              const void *input, uint64_t inlen );
+//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
+//                              const void *input, uint64_t inlen );

+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+ALIGN( 128 ) typedef struct __blake2s_16way_state
+{
+   __m512i h[8];   // chaining value, one vector per state word
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 16 ];   // one pending block for each of 16 lanes
+   uint32_t t[2];   // byte counter, low and high word, shared by all lanes
+   uint32_t f[2];   // flag words, set by blake2s_16way_final
+   size_t   buflen;   // bytes per lane currently held in buf
+   uint8_t  last_node;   // when nonzero, final also sets f[1]
+} blake2s_16way_state ;
+
+int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen );
+int blake2s_16way_update( blake2s_16way_state *S, const void *in,
+                         uint64_t inlen );
+int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );

 #endif

--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -78,7 +78,7 @@ void bmw256_4way_addbits_and_close(
 // BMW-256 8 way 32

 typedef struct {
-   __m256i buf[64];
+   __m256i buf[16];
   __m256i H[16];
   size_t ptr;
   uint32_t bit_count;  // assume bit_count fits in 32 bits
@@ -121,7 +121,7 @@ typedef struct {
   __m256i H[16];
   size_t ptr;
   sph_u64 bit_count;
-} bmw_4way_big_context;
+} bmw_4way_big_context __attribute__((aligned(128)));

 typedef bmw_4way_big_context bmw512_4way_context;

@@ -137,6 +137,22 @@ void bmw512_4way_addbits_and_close(

 #endif  // __AVX2__

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+typedef struct {   // state for the bmw512_8way_init/update/close functions
+   __m512i buf[16];   // NOTE(review): presumably the pending message block - confirm
+   __m512i H[16];     // NOTE(review): presumably the chaining value - confirm
+   size_t ptr;        // NOTE(review): presumably the fill position in buf - confirm
+   uint64_t bit_count;
+} bmw512_8way_context __attribute__((aligned(128)));
+
+void bmw512_8way_init( bmw512_8way_context *ctx );
+void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
+                         size_t len );
+void bmw512_8way_close( bmw512_8way_context *ctx, void *dst );
+
+#endif // AVX512
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -137,165 +137,151 @@ static const uint32_t IV256[] = {
                           ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
      add_elt_s( M, H, (i)-16 ) )

+// Expressions are grouped using associativity to reduce CPU dependencies,
+// resulting in some sign changes compared to the reference code.
+
 #define Ws0 \
   _mm_add_epi32( \
-       _mm_add_epi32( \
-          _mm_add_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
-                            _mm_xor_si128( M[ 7], H[ 7] ) ), \
-             _mm_xor_si128( M[10], H[10] ) ), \
-          _mm_xor_si128( M[13], H[13] ) ), \
-       _mm_xor_si128( M[14], H[14] ) )
+      _mm_add_epi32( \
+         _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
+                        _mm_xor_si128( M[ 7], H[ 7] ) ), \
+         _mm_xor_si128( M[10], H[10] ) ), \
+      _mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
+                     _mm_xor_si128( M[14], H[14] ) ) )

 #define Ws1 \
-   _mm_sub_epi32( \
+   _mm_add_epi32( \
       _mm_add_epi32( \
-          _mm_add_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
-                            _mm_xor_si128( M[ 8], H[ 8] ) ), \
-             _mm_xor_si128( M[11], H[11] ) ), \
-          _mm_xor_si128( M[14], H[14] ) ), \
-       _mm_xor_si128( M[15], H[15] ) )
+          _mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
+                         _mm_xor_si128( M[ 8], H[ 8] ) ), \
+          _mm_xor_si128( M[11], H[11] ) ), \
+       _mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
+                      _mm_xor_si128( M[15], H[15] ) ) )

 #define Ws2 \
-   _mm_add_epi32( \
-       _mm_sub_epi32( \
-          _mm_add_epi32( \
-             _mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
-                            _mm_xor_si128( M[ 7], H[ 7] ) ), \
-             _mm_xor_si128( M[ 9], H[ 9] ) ), \
-          _mm_xor_si128( M[12], H[12] ) ), \
-       _mm_xor_si128( M[15], H[15] ) )
+   _mm_sub_epi32( \
+      _mm_add_epi32( \
+         _mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
+                        _mm_xor_si128( M[ 7], H[ 7] ) ), \
+         _mm_xor_si128( M[ 9], H[ 9] ) ), \
+      _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
+                     _mm_xor_si128( M[15], H[15] ) ) )

 #define Ws3 \
-   _mm_add_epi32( \
-       _mm_sub_epi32( \
-          _mm_add_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
-                            _mm_xor_si128( M[ 1], H[ 1] ) ), \
-             _mm_xor_si128( M[ 8], H[ 8] ) ), \
-          _mm_xor_si128( M[10], H[10] ) ), \
-       _mm_xor_si128( M[13], H[13] ) )
+   _mm_sub_epi32( \
+      _mm_add_epi32( \
+         _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
+                        _mm_xor_si128( M[ 1], H[ 1] ) ), \
+         _mm_xor_si128( M[ 8], H[ 8] ) ), \
+      _mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
+                     _mm_xor_si128( M[13], H[13] ) ) )

 #define Ws4 \
   _mm_sub_epi32( \
-       _mm_sub_epi32( \
-          _mm_add_epi32( \
-             _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
-                            _mm_xor_si128( M[ 2], H[ 2] ) ), \
-             _mm_xor_si128( M[ 9], H[ 9] ) ), \
-          _mm_xor_si128( M[11], H[11] ) ), \
-       _mm_xor_si128( M[14], H[14] ) )
+      _mm_add_epi32( \
+         _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+                        _mm_xor_si128( M[ 2], H[ 2] ) ), \
+         _mm_xor_si128( M[ 9], H[ 9] ) ), \
+      _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
+                     _mm_xor_si128( M[14], H[14] ) ) )

 #define Ws5 \
-   _mm_add_epi32( \
-       _mm_sub_epi32( \
-          _mm_add_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
-                            _mm_xor_si128( M[ 2], H[ 2] ) ), \
-             _mm_xor_si128( M[10], H[10] ) ), \
-          _mm_xor_si128( M[12], H[12] ) ), \
-       _mm_xor_si128( M[15], H[15] ) )
+   _mm_sub_epi32( \
+      _mm_add_epi32( \
+         _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
+                        _mm_xor_si128( M[ 2], H[ 2] ) ), \
+         _mm_xor_si128( M[10], H[10] ) ), \
+      _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
+                     _mm_xor_si128( M[15], H[15] ) ) )

 #define Ws6 \
-   _mm_add_epi32( \
-       _mm_sub_epi32( \
-          _mm_sub_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
-                            _mm_xor_si128( M[ 0], H[ 0] ) ), \
-             _mm_xor_si128( M[ 3], H[ 3] ) ), \
-          _mm_xor_si128( M[11], H[11] ) ), \
-       _mm_xor_si128( M[13], H[13] ) )
+   _mm_sub_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
+                        _mm_xor_si128( M[ 0], H[ 0] ) ), \
+         _mm_xor_si128( M[ 3], H[ 3] ) ), \
+      _mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
+                     _mm_xor_si128( M[13], H[13] ) ) )

 #define Ws7 \
   _mm_sub_epi32( \
-       _mm_sub_epi32( \
-          _mm_sub_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
-                            _mm_xor_si128( M[ 4], H[ 4] ) ), \
-             _mm_xor_si128( M[ 5], H[ 5] ) ), \
-          _mm_xor_si128( M[12], H[12] ) ), \
-       _mm_xor_si128( M[14], H[14] ) )
+      _mm_sub_epi32( \
+         _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+                        _mm_xor_si128( M[ 4], H[ 4] ) ), \
+         _mm_xor_si128( M[ 5], H[ 5] ) ), \
+      _mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
+                     _mm_xor_si128( M[14], H[14] ) ) )

 #define Ws8 \
-   _mm_sub_epi32( \
-       _mm_add_epi32( \
-          _mm_sub_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
-                            _mm_xor_si128( M[ 5], H[ 5] ) ), \
-             _mm_xor_si128( M[ 6], H[ 6] ) ), \
-          _mm_xor_si128( M[13], H[13] ) ), \
-       _mm_xor_si128( M[15], H[15] ) )
-
-#define Ws9 \
   _mm_add_epi32( \
-       _mm_sub_epi32( \
-          _mm_add_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
-                            _mm_xor_si128( M[ 3], H[ 3] ) ), \
-             _mm_xor_si128( M[ 6], H[ 6] ) ), \
-          _mm_xor_si128( M[ 7], H[ 7] ) ), \
-       _mm_xor_si128( M[14], H[14] ) )
+      _mm_sub_epi32( \
+         _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
+                        _mm_xor_si128( M[ 5], H[ 5] ) ), \
+         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+      _mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
+                     _mm_xor_si128( M[15], H[15] ) ) )
+#define Ws9 \
+   _mm_sub_epi32( \
+      _mm_add_epi32( \
+         _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
+                        _mm_xor_si128( M[ 3], H[ 3] ) ), \
+         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+      _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
+                     _mm_xor_si128( M[14], H[14] ) ) )

 #define Ws10 \
-   _mm_add_epi32( \
-       _mm_sub_epi32( \
-          _mm_sub_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
-                            _mm_xor_si128( M[ 1], H[ 1] ) ), \
-             _mm_xor_si128( M[ 4], H[ 4] ) ), \
-          _mm_xor_si128( M[ 7], H[ 7] ) ), \
-       _mm_xor_si128( M[15], H[15] ) )
+   _mm_sub_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
+                        _mm_xor_si128( M[ 1], H[ 1] ) ), \
+         _mm_xor_si128( M[ 4], H[ 4] ) ), \
+      _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
+                     _mm_xor_si128( M[15], H[15] ) ) )

 #define Ws11 \
-   _mm_add_epi32( \
-       _mm_sub_epi32( \
-          _mm_sub_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
-                            _mm_xor_si128( M[ 0], H[ 0] ) ), \
-             _mm_xor_si128( M[ 2], H[ 2] ) ), \
-          _mm_xor_si128( M[ 5], H[ 5] ) ), \
-       _mm_xor_si128( M[ 9], H[ 9] ) )
+   _mm_sub_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
+                        _mm_xor_si128( M[ 0], H[ 0] ) ), \
+         _mm_xor_si128( M[ 2], H[ 2] ) ), \
+      _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
+                     _mm_xor_si128( M[ 9], H[ 9] ) ) )

 #define Ws12 \
-   _mm_add_epi32( \
-       _mm_sub_epi32( \
-          _mm_sub_epi32( \
-             _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
-                            _mm_xor_si128( M[ 3], H[ 3] ) ), \
-             _mm_xor_si128( M[ 6], H[ 6] ) ), \
-          _mm_xor_si128( M[ 9], H[ 9] ) ), \
-       _mm_xor_si128( M[10], H[10] ) )
+   _mm_sub_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+                        _mm_xor_si128( M[ 3], H[ 3] ) ), \
+         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+      _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
+                     _mm_xor_si128( M[10], H[10] ) ) )

 #define Ws13 \
   _mm_add_epi32( \
-       _mm_add_epi32( \
-          _mm_add_epi32( \
-             _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
-                            _mm_xor_si128( M[ 4], H[ 4] ) ), \
-             _mm_xor_si128( M[ 7], H[ 7] ) ), \
-          _mm_xor_si128( M[10], H[10] ) ), \
-       _mm_xor_si128( M[11], H[11] ) )
+      _mm_add_epi32( \
+         _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
+                        _mm_xor_si128( M[ 4], H[ 4] ) ), \
+         _mm_xor_si128( M[ 7], H[ 7] ) ), \
+      _mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
+                     _mm_xor_si128( M[11], H[11] ) ) )

 #define Ws14 \
   _mm_sub_epi32( \
-       _mm_sub_epi32( \
-          _mm_add_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
-                               _mm_xor_si128( M[ 5], H[ 5] ) ), \
-             _mm_xor_si128( M[ 8], H[ 8] ) ), \
-          _mm_xor_si128( M[11], H[11] ) ), \
-       _mm_xor_si128( M[12], H[12] ) )
+      _mm_add_epi32( \
+         _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
+                        _mm_xor_si128( M[ 5], H[ 5] ) ), \
+         _mm_xor_si128( M[ 8], H[ 8] ) ), \
+      _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
+                     _mm_xor_si128( M[12], H[12] ) ) )

 #define Ws15 \
-   _mm_add_epi32( \
-       _mm_sub_epi32( \
-          _mm_sub_epi32( \
-             _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
-                            _mm_xor_si128( M[ 4], H[ 4] ) ), \
-             _mm_xor_si128( M[ 6], H[ 6] ) ), \
-          _mm_xor_si128( M[ 9], H[ 9] ) ), \
-       _mm_xor_si128( M[13], H[13] ) )
+   _mm_sub_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
+                        _mm_xor_si128( M[ 4], H[4] ) ), \
+         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+      _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
+                     _mm_xor_si128( M[13], H[13] ) ) )


 void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
@@ -700,163 +686,148 @@ bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 #define W8s0 \
   _mm256_add_epi32( \
-       _mm256_add_epi32( \
-          _mm256_add_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \
-                               _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-             _mm256_xor_si256( M[10], H[10] ) ), \
-          _mm256_xor_si256( M[13], H[13] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+      _mm256_add_epi32( \
+         _mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \
+                           _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+         _mm256_xor_si256( M[10], H[10] ) ), \
+      _mm256_add_epi32( _mm256_xor_si256( M[13], H[13] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define W8s1 \
-   _mm256_sub_epi32( \
+   _mm256_add_epi32( \
       _mm256_add_epi32( \
-          _mm256_add_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[ 6], H[ 6] ), \
-                               _mm256_xor_si256( M[ 8], H[ 8] ) ), \
-             _mm256_xor_si256( M[11], H[11] ) ), \
-          _mm256_xor_si256( M[14], H[14] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+          _mm256_sub_epi32( _mm256_xor_si256( M[ 6], H[ 6] ), \
+                            _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+          _mm256_xor_si256( M[11], H[11] ) ), \
+       _mm256_sub_epi32( _mm256_xor_si256( M[14], H[14] ), \
+                         _mm256_xor_si256( M[15], H[15] ) ) )

 #define W8s2 \
-   _mm256_add_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_add_epi32( \
-             _mm256_add_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
-                               _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-             _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-          _mm256_xor_si256( M[12], H[12] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_sub_epi32( \
+      _mm256_add_epi32( \
+         _mm256_add_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                           _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+         _mm256_xor_si256( M[ 9], H[ 9] ) ), \
+      _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define W8s3 \
-   _mm256_add_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_add_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
-                               _mm256_xor_si256( M[ 1], H[ 1] ) ), \
-             _mm256_xor_si256( M[ 8], H[ 8] ) ), \
-          _mm256_xor_si256( M[10], H[10] ) ), \
-       _mm256_xor_si256( M[13], H[13] ) )
+   _mm256_sub_epi32( \
+      _mm256_add_epi32( \
+         _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                           _mm256_xor_si256( M[ 1], H[ 1] ) ), \
+         _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+      _mm256_sub_epi32( _mm256_xor_si256( M[10], H[10] ), \
+                        _mm256_xor_si256( M[13], H[13] ) ) )

 #define W8s4 \
   _mm256_sub_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_add_epi32( \
-             _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
-                               _mm256_xor_si256( M[ 2], H[ 2] ) ), \
-             _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-          _mm256_xor_si256( M[11], H[11] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+      _mm256_add_epi32( \
+         _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                           _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+         _mm256_xor_si256( M[ 9], H[ 9] ) ), \
+      _mm256_add_epi32( _mm256_xor_si256( M[11], H[11] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define W8s5 \
-   _mm256_add_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_add_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \
-                               _mm256_xor_si256( M[ 2], H[ 2] ) ), \
-             _mm256_xor_si256( M[10], H[10] ) ), \
-          _mm256_xor_si256( M[12], H[12] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_sub_epi32( \
+      _mm256_add_epi32( \
+         _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \
+                           _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+         _mm256_xor_si256( M[10], H[10] ) ), \
+      _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define W8s6 \
-   _mm256_add_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_sub_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[ 4], H[ 4] ), \
-                               _mm256_xor_si256( M[ 0], H[ 0] ) ), \
-             _mm256_xor_si256( M[ 3], H[ 3] ) ), \
-          _mm256_xor_si256( M[11], H[11] ) ), \
-       _mm256_xor_si256( M[13], H[13] ) )
+   _mm256_sub_epi32( \
+      _mm256_sub_epi32( \
+         _mm256_sub_epi32( _mm256_xor_si256( M[ 4], H[ 4] ), \
+                           _mm256_xor_si256( M[ 0], H[ 0] ) ), \
+         _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+      _mm256_sub_epi32( _mm256_xor_si256( M[11], H[11] ), \
+                        _mm256_xor_si256( M[13], H[13] ) ) )

 #define W8s7 \
   _mm256_sub_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_sub_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
-                               _mm256_xor_si256( M[ 4], H[ 4] ) ), \
-             _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-          _mm256_xor_si256( M[12], H[12] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+      _mm256_sub_epi32( \
+         _mm256_sub_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                           _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+         _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+      _mm256_add_epi32( _mm256_xor_si256( M[12], H[12] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define W8s8 \
-   _mm256_sub_epi32( \
-       _mm256_add_epi32( \
-          _mm256_sub_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \
-                               _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[13], H[13] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_add_epi32( \
+      _mm256_sub_epi32( \
+         _mm256_sub_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \
+                           _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi32( _mm256_xor_si256( M[13], H[13] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define W8s9 \
-   _mm256_add_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_add_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
-                               _mm256_xor_si256( M[ 3], H[ 3] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+   _mm256_sub_epi32( \
+      _mm256_add_epi32( \
+         _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                           _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi32( _mm256_xor_si256( M[ 7], H[ 7] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define W8s10 \
-   _mm256_add_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_sub_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \
-                               _mm256_xor_si256( M[ 1], H[ 1] ) ), \
-             _mm256_xor_si256( M[ 4], H[ 4] ) ), \
-          _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_sub_epi32( \
+      _mm256_sub_epi32( \
+         _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \
+                           _mm256_xor_si256( M[ 1], H[ 1] ) ), \
+         _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+      _mm256_sub_epi32( _mm256_xor_si256( M[ 7], H[ 7] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define W8s11 \
-   _mm256_add_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_sub_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \
-                               _mm256_xor_si256( M[ 0], H[ 0] ) ), \
-             _mm256_xor_si256( M[ 2], H[ 2] ) ), \
-          _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-       _mm256_xor_si256( M[ 9], H[ 9] ) )
+   _mm256_sub_epi32( \
+      _mm256_sub_epi32( \
+         _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \
+                           _mm256_xor_si256( M[ 0], H[ 0] ) ), \
+         _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+      _mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \
+                        _mm256_xor_si256( M[ 9], H[ 9] ) ) )

 #define W8s12 \
-   _mm256_add_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_sub_epi32( \
-             _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
-                               _mm256_xor_si256( M[ 3], H[ 3] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-       _mm256_xor_si256( M[10], H[10] ) )
+   _mm256_sub_epi32( \
+      _mm256_sub_epi32( \
+         _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                           _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi32( _mm256_xor_si256( M[ 9], H[ 9] ), \
+                        _mm256_xor_si256( M[10], H[10] ) ) )

 #define W8s13 \
   _mm256_add_epi32( \
-       _mm256_add_epi32( \
-          _mm256_add_epi32( \
-             _mm256_add_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \
-                               _mm256_xor_si256( M[ 4], H[ 4] ) ), \
-             _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-          _mm256_xor_si256( M[10], H[10] ) ), \
-       _mm256_xor_si256( M[11], H[11] ) )
+      _mm256_add_epi32( \
+         _mm256_add_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \
+                           _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+         _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+      _mm256_add_epi32( _mm256_xor_si256( M[10], H[10] ), \
+                        _mm256_xor_si256( M[11], H[11] ) ) )

 #define W8s14 \
   _mm256_sub_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_add_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \
-                               _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-             _mm256_xor_si256( M[ 8], H[ 8] ) ), \
-          _mm256_xor_si256( M[11], H[11] ) ), \
-       _mm256_xor_si256( M[12], H[12] ) )
+      _mm256_add_epi32( \
+         _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \
+                           _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+         _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+      _mm256_add_epi32( _mm256_xor_si256( M[11], H[11] ), \
+                        _mm256_xor_si256( M[12], H[12] ) ) )

 #define W8s15 \
-   _mm256_add_epi32( \
-       _mm256_sub_epi32( \
-          _mm256_sub_epi32( \
-             _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \
-                               _mm256_xor_si256( M[ 4], H[4] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-       _mm256_xor_si256( M[13], H[13] ) )
+   _mm256_sub_epi32( \
+      _mm256_sub_epi32( \
+         _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \
+                           _mm256_xor_si256( M[ 4], H[4] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi32( _mm256_xor_si256( M[ 9], H[ 9] ), \
+                        _mm256_xor_si256( M[13], H[13] ) ) )
+

 void compress_small_8way( const __m256i *M, const __m256i H[16],
 	                  __m256i dH[16] )
--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -1,13 +1,66 @@
 #include "bmw512-gate.h"
-
-#ifdef BMW512_4WAY
-
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
 //#include "sph_keccak.h"
 #include "bmw-hash-4way.h"

+#if defined(BMW512_8WAY)
+// 8-way BMW-512 of 80 input bytes per lane: init, one update, close.
+void bmw512hash_8way( void *state, const void *input )
+{
+    bmw512_8way_context hasher;
+    bmw512_8way_init( &hasher );
+    bmw512_8way_update( &hasher, input, 80 );
+    bmw512_8way_close( &hasher, state );
+}
+
+int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   // Scan nonces from work->data[19], eight per pass (one per lane), and
+   // submit each lane that passes fulltest().  Always returns 0.
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   // bmw512_8way_close stores whole __m512i vectors into hash: align to 64.
+   uint32_t hash[16*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[49]);   // 3*16+1
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   int thr_id = mythr->id;
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   do {
+       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0 ,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+      bmw512hash_8way( hash, vdata );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
+      {
+          extr_lane_8x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) )
+          {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+      }
+      n += 8;   // nonces n .. n+7 were hashed in this pass
+   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+   
+
+#elif defined(BMW512_4WAY)
+
+//#ifdef BMW512_4WAY
+
 void bmw512hash_4way(void *state, const void *input)
 {
    bmw512_4way_context ctx;
--- a/algo/bmw/bmw512-gate.c
+++ b/algo/bmw/bmw512-gate.c
@@ -2,9 +2,12 @@

 bool register_bmw512_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
  opt_target_factor = 256.0;
-#if defined (BMW512_4WAY)
+#if defined (BMW512_8WAY)
+  gate->scanhash  = (void*)&scanhash_bmw512_8way;
+  gate->hash      = (void*)&bmw512hash_8way;
+#elif defined (BMW512_4WAY)
  gate->scanhash  = (void*)&scanhash_bmw512_4way;
  gate->hash      = (void*)&bmw512hash_4way;
 #else
--- a/algo/bmw/bmw512-gate.h
+++ b/algo/bmw/bmw512-gate.h
@@ -1,23 +1,33 @@
 #ifndef BMW512_GATE_H__
-#define BMW512_GATE_H__
+#define BMW512_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define BMW512_8WAY 1
+#elif defined(__AVX2__)
  #define BMW512_4WAY 1
 #endif

-#if defined(BMW512_4WAY)
+#if defined(BMW512_8WAY)
+
+void bmw512hash_8way( void *state, const void *input );
+int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(BMW512_4WAY)

 void bmw512hash_4way( void *state, const void *input );
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

-#endif
+#else

 void bmw512hash( void *state, const void *input );
 int scanhash_bmw512( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );

 #endif
+
+#endif
--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -556,7 +556,7 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
   compress_big_2way( buf, h, h2 );
   memcpy_128( buf, h2, 16 );
   compress_big_2way( buf, final_b2, h1 );
-   memcpy( (__m128i*)dst, h1+16, 8 );
+   memcpy_128( (__m128i*)dst, h1+8, 8 );   // 8 vectors (h1[8..15]), not 8 bytes
 }

 #endif  // __SSE2__
@@ -636,165 +636,152 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
                     sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
      add_elt_b( M, H, (i)-16 ) )

+
+
 #define Wb0 \
   _mm256_add_epi64( \
-       _mm256_add_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
-                               _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-             _mm256_xor_si256( M[10], H[10] ) ), \
-          _mm256_xor_si256( M[13], H[13] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+      _mm256_add_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
+                           _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+         _mm256_xor_si256( M[10], H[10] ) ), \
+      _mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define Wb1 \
-   _mm256_sub_epi64( \
+   _mm256_add_epi64( \
       _mm256_add_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
-                               _mm256_xor_si256( M[ 8], H[ 8] ) ), \
-             _mm256_xor_si256( M[11], H[11] ) ), \
-          _mm256_xor_si256( M[14], H[14] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+          _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
+                            _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+          _mm256_xor_si256( M[11], H[11] ) ), \
+       _mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \
+                         _mm256_xor_si256( M[15], H[15] ) ) )

 #define Wb2 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
-                               _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-             _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-          _mm256_xor_si256( M[12], H[12] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_sub_epi64( \
+      _mm256_add_epi64( \
+         _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                           _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+         _mm256_xor_si256( M[ 9], H[ 9] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define Wb3 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
-                               _mm256_xor_si256( M[ 1], H[ 1] ) ), \
-             _mm256_xor_si256( M[ 8], H[ 8] ) ), \
-          _mm256_xor_si256( M[10], H[10] ) ), \
-       _mm256_xor_si256( M[13], H[13] ) )
+   _mm256_sub_epi64( \
+      _mm256_add_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                           _mm256_xor_si256( M[ 1], H[ 1] ) ), \
+         _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \
+                        _mm256_xor_si256( M[13], H[13] ) ) )

 #define Wb4 \
   _mm256_sub_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
-                               _mm256_xor_si256( M[ 2], H[ 2] ) ), \
-             _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-          _mm256_xor_si256( M[11], H[11] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+      _mm256_add_epi64( \
+         _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                           _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+         _mm256_xor_si256( M[ 9], H[ 9] ) ), \
+      _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define Wb5 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
-                               _mm256_xor_si256( M[ 2], H[ 2] ) ), \
-             _mm256_xor_si256( M[10], H[10] ) ), \
-          _mm256_xor_si256( M[12], H[12] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_sub_epi64( \
+      _mm256_add_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
+                           _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+         _mm256_xor_si256( M[10], H[10] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define Wb6 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
-                               _mm256_xor_si256( M[ 0], H[ 0] ) ), \
-             _mm256_xor_si256( M[ 3], H[ 3] ) ), \
-          _mm256_xor_si256( M[11], H[11] ) ), \
-       _mm256_xor_si256( M[13], H[13] ) )
+   _mm256_sub_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
+                           _mm256_xor_si256( M[ 0], H[ 0] ) ), \
+         _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \
+                        _mm256_xor_si256( M[13], H[13] ) ) )

 #define Wb7 \
   _mm256_sub_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
-                               _mm256_xor_si256( M[ 4], H[ 4] ) ), \
-             _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-          _mm256_xor_si256( M[12], H[12] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                           _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+         _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+      _mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define Wb8 \
-   _mm256_sub_epi64( \
-       _mm256_add_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
-                               _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[13], H[13] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_add_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
+                           _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define Wb9 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
-                               _mm256_xor_si256( M[ 3], H[ 3] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+   _mm256_sub_epi64( \
+      _mm256_add_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                           _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define Wb10 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
-                               _mm256_xor_si256( M[ 1], H[ 1] ) ), \
-             _mm256_xor_si256( M[ 4], H[ 4] ) ), \
-          _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_sub_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
+                           _mm256_xor_si256( M[ 1], H[ 1] ) ), \
+         _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define Wb11 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
-                               _mm256_xor_si256( M[ 0], H[ 0] ) ), \
-             _mm256_xor_si256( M[ 2], H[ 2] ) ), \
-          _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-       _mm256_xor_si256( M[ 9], H[ 9] ) )
+   _mm256_sub_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
+                           _mm256_xor_si256( M[ 0], H[ 0] ) ), \
+         _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
+                        _mm256_xor_si256( M[ 9], H[ 9] ) ) )

 #define Wb12 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
-                               _mm256_xor_si256( M[ 3], H[ 3] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-       _mm256_xor_si256( M[10], H[10] ) )
+   _mm256_sub_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                           _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
+                        _mm256_xor_si256( M[10], H[10] ) ) )

 #define Wb13 \
   _mm256_add_epi64( \
-       _mm256_add_epi64( \
-          _mm256_add_epi64( \
-             _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
-                               _mm256_xor_si256( M[ 4], H[ 4] ) ), \
-             _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-          _mm256_xor_si256( M[10], H[10] ) ), \
-       _mm256_xor_si256( M[11], H[11] ) )
+      _mm256_add_epi64( \
+         _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
+                           _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+         _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+      _mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \
+                        _mm256_xor_si256( M[11], H[11] ) ) )

 #define Wb14 \
   _mm256_sub_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
-                               _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-             _mm256_xor_si256( M[ 8], H[ 8] ) ), \
-          _mm256_xor_si256( M[11], H[11] ) ), \
-       _mm256_xor_si256( M[12], H[12] ) )
+      _mm256_add_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
+                           _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+         _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+      _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
+                        _mm256_xor_si256( M[12], H[12] ) ) )

 #define Wb15 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
-                               _mm256_xor_si256( M[ 4], H[4] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-       _mm256_xor_si256( M[13], H[13] ) )
+   _mm256_sub_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
+                           _mm256_xor_si256( M[ 4], H[4] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
+                        _mm256_xor_si256( M[13], H[13] ) ) )
+

 void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
 {
@@ -1079,6 +1066,477 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 #endif  // __AVX2__

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// BMW-512 8 WAY
+// s8b0..s8b5: XOR of shifted / mm512_rol_64'd copies of x, per 64-bit element.
+#define s8b0(x) \
+   mm512_xor4( _mm512_srli_epi64( (x), 1), _mm512_slli_epi64( (x), 3), \
+                mm512_rol_64(     (x), 4),  mm512_rol_64(     (x),37) )
+
+#define s8b1(x) \
+   mm512_xor4( _mm512_srli_epi64( (x), 1), _mm512_slli_epi64( (x), 2), \
+                mm512_rol_64(     (x),13),  mm512_rol_64(     (x),43) )
+
+#define s8b2(x) \
+   mm512_xor4( _mm512_srli_epi64( (x), 2), _mm512_slli_epi64( (x), 1), \
+                mm512_rol_64(     (x),19),  mm512_rol_64(     (x),53) )
+
+#define s8b3(x) \
+   mm512_xor4( _mm512_srli_epi64( (x), 2), _mm512_slli_epi64( (x), 2), \
+                mm512_rol_64(     (x),28),  mm512_rol_64(     (x),59) )
+
+#define s8b4(x) \
+  _mm512_xor_si512( (x), _mm512_srli_epi64( (x), 1 ) )
+
+#define s8b5(x) \
+  _mm512_xor_si512( (x), _mm512_srli_epi64( (x), 2 ) )
+// r8b1..r8b7: mm512_rol_64 with the fixed counts 5, 11, 27, 32, 37, 43, 53.
+#define r8b1(x)    mm512_rol_64( x,  5 )
+#define r8b2(x)    mm512_rol_64( x, 11 )
+#define r8b3(x)    mm512_rol_64( x, 27 )
+#define r8b4(x)    mm512_rol_64( x, 32 )
+#define r8b5(x)    mm512_rol_64( x, 37 )
+#define r8b6(x)    mm512_rol_64( x, 43 )
+#define r8b7(x)    mm512_rol_64( x, 53 )
+// Message word at index (j+off) mod 16, rotated by that index plus one.
+#define rol8w_off_64( M, j, off ) \
+   mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
+                  ( ( (j) + (off) ) & 0xF ) + 1 )
+// ( rot M[j] + rot M[j+3] - rot M[j+10] + (j+16)*0x0555555555555555 ) ^ H[j+7].
+#define add_elt_b8( M, H, j ) \
+   _mm512_xor_si512( \
+      _mm512_add_epi64( \
+            _mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \
+                                                rol8w_off_64( M, j, 3 ) ), \
+                             rol8w_off_64( M, j, 10 ) ), \
+            _mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
+       H[ ( (j)+7 ) & 0xF ] )
+// expand1b8: s8b1/s8b2/s8b3/s8b0 applied in turn to qt[i-16..i-1], summed.
+#define expand1b8( qt, M, H, i ) \
+   _mm512_add_epi64( mm512_add4_64( \
+      mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \
+                     s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \
+      mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \
+                     s8b3( qt[ (i)-10 ] ), s8b0( qt[ (i)- 9 ] )), \
+      mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \
+                     s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \
+      mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \
+                     s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \
+      add_elt_b8( M, H, (i)-16 ) )
+// expand2b8: qt[i-16..i-3] alternately plain and r8b1..r8b7, then s8b4, s8b5.
+#define expand2b8( qt, M, H, i) \
+   _mm512_add_epi64( mm512_add4_64( \
+      mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \
+                     qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \
+      mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \
+                     qt[ (i)-10 ], r8b4( qt[ (i)- 9 ] ) ), \
+      mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \
+                     qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \
+      mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \
+                     s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
+      add_elt_b8( M, H, (i)-16 ) )
+
+// W8b0..W8b15: signed sums of five ( M[k] ^ H[k] ) terms, 64-bit elements.
+
+#define W8b0 \
+   _mm512_add_epi64( \
+      _mm512_add_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
+                           _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+         _mm512_xor_si512( M[10], H[10] ) ), \
+      _mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W8b1 \
+   _mm512_add_epi64( \
+       _mm512_add_epi64( \
+          _mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \
+                            _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+          _mm512_xor_si512( M[11], H[11] ) ), \
+       _mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \
+                         _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W8b2 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+         _mm512_xor_si512( M[ 9], H[ 9] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W8b3 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 1], H[ 1] ) ), \
+         _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+#define W8b4 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+         _mm512_xor_si512( M[ 9], H[ 9] ) ), \
+      _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W8b5 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
+                           _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+         _mm512_xor_si512( M[10], H[10] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W8b6 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \
+                           _mm512_xor_si512( M[ 0], H[ 0] ) ), \
+         _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+#define W8b7 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+         _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+      _mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W8b8 \
+   _mm512_add_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
+                           _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W8b9 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W8b10 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
+                           _mm512_xor_si512( M[ 1], H[ 1] ) ), \
+         _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W8b11 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
+                           _mm512_xor_si512( M[ 0], H[ 0] ) ), \
+         _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
+                        _mm512_xor_si512( M[ 9], H[ 9] ) ) )
+
+#define W8b12 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
+                        _mm512_xor_si512( M[10], H[10] ) ) )
+
+#define W8b13 \
+   _mm512_add_epi64( \
+      _mm512_add_epi64( \
+         _mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
+                           _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+         _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+      _mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \
+                        _mm512_xor_si512( M[11], H[11] ) ) )
+
+#define W8b14 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
+                           _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+         _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+      _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[12], H[12] ) ) )
+
+#define W8b15 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
+                           _mm512_xor_si512( M[ 4], H[4] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+void compress_big_8way( const __m512i *M, const __m512i H[16],
+                        __m512i dH[16] )
+{
+   __m512i qt[32], xl, xh;
+   // One BMW-512 compression over 8 lanes: message M, chaining value H -> dH.
+   qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] );
+   qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] );
+   qt[ 2] = _mm512_add_epi64( s8b2( W8b2 ), H[ 3] );
+   qt[ 3] = _mm512_add_epi64( s8b3( W8b3 ), H[ 4] );
+   qt[ 4] = _mm512_add_epi64( s8b4( W8b4 ), H[ 5] );
+   qt[ 5] = _mm512_add_epi64( s8b0( W8b5 ), H[ 6] );
+   qt[ 6] = _mm512_add_epi64( s8b1( W8b6 ), H[ 7] );
+   qt[ 7] = _mm512_add_epi64( s8b2( W8b7 ), H[ 8] );
+   qt[ 8] = _mm512_add_epi64( s8b3( W8b8 ), H[ 9] );
+   qt[ 9] = _mm512_add_epi64( s8b4( W8b9 ), H[10] );
+   qt[10] = _mm512_add_epi64( s8b0( W8b10), H[11] );
+   qt[11] = _mm512_add_epi64( s8b1( W8b11), H[12] );
+   qt[12] = _mm512_add_epi64( s8b2( W8b12), H[13] );
+   qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] );
+   qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
+   qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
+   qt[16] = expand1b8( qt, M, H, 16 );
+   qt[17] = expand1b8( qt, M, H, 17 );
+   qt[18] = expand2b8( qt, M, H, 18 );
+   qt[19] = expand2b8( qt, M, H, 19 );
+   qt[20] = expand2b8( qt, M, H, 20 );
+   qt[21] = expand2b8( qt, M, H, 21 );
+   qt[22] = expand2b8( qt, M, H, 22 );
+   qt[23] = expand2b8( qt, M, H, 23 );
+   qt[24] = expand2b8( qt, M, H, 24 );
+   qt[25] = expand2b8( qt, M, H, 25 );
+   qt[26] = expand2b8( qt, M, H, 26 );
+   qt[27] = expand2b8( qt, M, H, 27 );
+   qt[28] = expand2b8( qt, M, H, 28 );
+   qt[29] = expand2b8( qt, M, H, 29 );
+   qt[30] = expand2b8( qt, M, H, 30 );
+   qt[31] = expand2b8( qt, M, H, 31 );
+   // xl combines qt[16..23]; xh additionally combines qt[24..31].
+   xl = _mm512_xor_si512(
+           mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
+           mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
+   xh = _mm512_xor_si512( xl, _mm512_xor_si512(
+           mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
+           mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+   // dH[0..7]: DH1L = (xh << sl) ^ (qt[a] >> sr); DH1R = (xh >> sr) ^ (qt[a] << sl).
+#define DH1L( m, sl, sr, a, b, c ) \
+   _mm512_add_epi64( \
+      _mm512_xor_si512( M[m], _mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
+                                         _mm512_srli_epi64( qt[a], sr ) ) ), \
+      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+
+#define DH1R( m, sr, sl, a, b, c ) \
+   _mm512_add_epi64( \
+      _mm512_xor_si512( M[m], _mm512_xor_si512( _mm512_srli_epi64( xh, sr ), \
+                                         _mm512_slli_epi64( qt[a], sl ) ) ), \
+      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+   // dH[8..15]: rol(dH[h]) + (xh ^ qt[a] ^ M[m]) + ((xl << sl | xl >> sr) ^ qt[b] ^ qt[c]).
+#define DHL( m, rl, sl, h, a, b, c ) \
+   _mm512_add_epi64( _mm512_add_epi64( mm512_rol_64( dH[h], rl ), \
+         _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] ) ), \
+      _mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
+                        _mm512_xor_si512( qt[b], qt[c] ) ) )
+
+#define DHR( m, rl, sr, h, a, b, c ) \
+   _mm512_add_epi64( _mm512_add_epi64( mm512_rol_64( dH[h], rl ), \
+         _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] ) ), \
+      _mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \
+                        _mm512_xor_si512( qt[b], qt[c] ) ) )
+   dH[ 0] = DH1L(  0,  5,  5, 16, 24, 0 );
+   dH[ 1] = DH1R(  1,  7,  8, 17, 25, 1 );
+   dH[ 2] = DH1R(  2,  5,  5, 18, 26, 2 );
+   dH[ 3] = DH1R(  3,  1,  5, 19, 27, 3 );
+   dH[ 4] = DH1R(  4,  3,  0, 20, 28, 4 );
+   dH[ 5] = DH1L(  5,  6,  6, 21, 29, 5 );
+   dH[ 6] = DH1R(  6,  4,  6, 22, 30, 6 );
+   dH[ 7] = DH1R(  7, 11,  2, 23, 31, 7 );
+   dH[ 8] = DHL(  8,  9,  8,  4, 24, 23,  8 );
+   dH[ 9] = DHR(  9, 10,  6,  5, 25, 16,  9 );
+   dH[10] = DHL( 10, 11,  6,  6, 26, 17, 10 );
+   dH[11] = DHL( 11, 12,  4,  7, 27, 18, 11 );
+   dH[12] = DHR( 12, 13,  3,  0, 28, 19, 12 );
+   dH[13] = DHR( 13, 14,  4,  1, 29, 20, 13 );
+   dH[14] = DHR( 14, 15,  7,  2, 30, 21, 14 );
+   dH[15] = DHR( 15, 16,  2,  3, 31, 22, 15 );
+#undef DH1L
+#undef DH1R
+#undef DHL
+#undef DHR
+}
+
+static const __m512i final_b8[16] =   // H operand of the final compression
+{  // entry i holds 0xaaaaaaaaaaaaaaa0 + i in all eight 64-bit elements
+   { 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
+     0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
+     0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
+     0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
+   { 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
+     0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
+     0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
+     0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
+   { 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
+     0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
+     0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
+     0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
+   { 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
+     0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
+     0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
+     0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
+   { 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
+     0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
+     0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
+     0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
+   { 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
+     0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
+     0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
+     0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
+   { 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
+     0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
+     0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
+     0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
+   { 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
+     0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
+     0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
+     0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 },
+   { 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
+     0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
+     0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
+     0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 },
+   { 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
+     0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
+     0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
+     0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 },
+   { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+     0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+     0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+     0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
+   { 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
+     0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
+     0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
+     0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab },
+   { 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
+     0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
+     0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
+     0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac },
+   { 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
+     0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
+     0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
+     0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad },
+   { 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
+     0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
+     0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
+     0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae },
+   { 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
+     0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
+     0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
+     0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf }
+};
+
+
+void bmw512_8way_init( bmw512_8way_context *ctx )
+// Set H[0..15] to the fixed initial constants below and zero ptr and bit_count.
+{
+   ctx->H[ 0] = m512_const1_64( 0x8081828384858687 );
+   ctx->H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
+   ctx->H[ 2] = m512_const1_64( 0x9091929394959697 );
+   ctx->H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
+   ctx->H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
+   ctx->H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
+   ctx->H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
+   ctx->H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
+   ctx->H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
+   ctx->H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
+   ctx->H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
+   ctx->H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
+   ctx->H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
+   ctx->H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
+   ctx->H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
+   ctx->H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
+   ctx->ptr = 0;
+   ctx->bit_count = 0;
+}
+
+void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
+                                size_t len )
+{
+   // Absorb len bytes per lane from data (one __m512i per 8 lane-bytes),
+   // compressing every time the 128-byte lane buffer fills.  len is shifted
+   // right by 3 to count vectors, so it is assumed to be a multiple of 8.
+   const int buf_size = 128;  // bytes of one lane, compatible with len
+   __m512i *src = (__m512i*)data;
+   __m512i *buf = ctx->buf;
+   __m512i scratch[16];
+   __m512i *cur = ctx->H;      // chaining value going into the next block
+   __m512i *alt = scratch;     // receives the next compression result
+   size_t ptr = ctx->ptr;
+
+   ctx->bit_count += len << 3;
+   while ( len > 0 )
+   {
+      size_t chunk = buf_size - ptr;
+      if ( chunk > len )
+         chunk = len;
+      memcpy_512( buf + (ptr>>3), src, chunk >> 3 );
+      src += chunk >> 3;
+      len -= chunk;
+      ptr += chunk;
+      if ( ptr == buf_size )
+      {
+         // Block complete: compress it and swap the two state buffers.
+         __m512i *swap = cur;
+         compress_big_8way( buf, cur, alt );
+         cur = alt;
+         alt = swap;
+         ptr = 0;
+      }
+   }
+   ctx->ptr = ptr;
+   // After an odd number of blocks the live state is in scratch: copy back.
+   if ( cur != ctx->H )
+        memcpy_512( ctx->H, cur, 16 );
+}
+
+void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
+{
+   // Pad the buffered data, append the bit count, compress, then run the
+   // final compression against final_b8 and store state words 8..15 in dst.
+   const int buf_size = 128;  // bytes of one lane, compatible with len
+   __m512i *buf = ctx->buf;
+   __m512i *h = ctx->H;
+   __m512i h1[16], h2[16];
+   size_t ptr = ctx->ptr;
+   size_t u;
+
+   buf[ ptr>>3 ] = m512_const1_64( 0x80 );   // padding marker, every lane
+   ptr += 8;
+   if (  ptr > (buf_size - 8) )
+   {  // no room for the length word: zero-fill and compress this block first
+      memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+      compress_big_8way( buf, h, h1 );
+      ptr = 0;
+      h = h1;
+   }
+   memset_zero_512( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
+   buf[ (buf_size - 8) >> 3 ] = _mm512_set1_epi64( ctx->bit_count );
+   compress_big_8way( buf, h, h2 );
+   for ( u = 0; u < 16; u++ )
+      buf[ u ] = h2[ u ];
+   compress_big_8way( buf, final_b8, h1 );
+   for ( u = 0; u < 8; u++ )
+      casti_m512i( dst, u ) = h1[ u + 8 ];
+}
+
+#endif // AVX512
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -1,18 +1,68 @@
 #include "keccak-gate.h"
-
-#ifdef KECCAK_4WAY
-
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
 #include "sph_keccak.h"
 #include "keccak-hash-4way.h"

+#if defined(KECCAK_8WAY)
+
+// Run one complete Keccak-256 pass (init, update, close) over 80 bytes per
+// lane of input and leave the result in state.
+void keccakhash_8way(void *state, const void *input)
+{
+    keccak256_8way_context kctx;
+
+    keccak256_8way_init( &kctx );
+    keccak256_8way_update( &kctx, input, 80 );
+    keccak256_8way_close( &kctx, state );
+}
+
+int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,   // Scan nonces 8 at a time starting at work->data[19]; always returns 0.
+                          uint64_t *hashes_done, struct thr_info *mythr )   // hashes_done receives the count derived from the nonces covered
+{
+   uint32_t vdata[24*8] __attribute__ ((aligned (128)));   // interleaved copy of the work data, filled below
+   uint32_t hash[16*8] __attribute__ ((aligned (64)));     // interleaved hash output for all 8 lanes
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));   // one lane's 256-bit hash, extracted for fulltest
+   uint32_t *hash7 = &(hash[49]);   // 3*16+1; presumably the high 32 bits of 64-bit hash word 3 for lane 0 - TODO confirm layout
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];          // current base nonce
+   const uint32_t first_nonce = pdata[19];
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; the vector of vdata that is rewritten with the nonces
+   const uint32_t Htarg = ptarget[7];        // quick-reject threshold, compared before the full test
+   int thr_id = mythr->id;  
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   do {
+       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );   // nonces n..n+7 go in the odd 32-bit elements, one per lane
+
+      keccakhash_8way( hash, vdata );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( hash7[ lane<<1 ] < Htarg )   // cheap 32-bit check; stride 2 steps to the next lane
+      {
+          extr_lane_8x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          {
+              pdata[19] = n + lane;   // nonce that produced this lane's hash
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+      }
+      n += 8;
+
+   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);   // NOTE(review): max_nonce-8 wraps around if max_nonce < 8 - confirm callers never pass that
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#elif defined(KECCAK_4WAY)
+
 void keccakhash_4way(void *state, const void *input)
 {
    keccak256_4way_context ctx;
    keccak256_4way_init( &ctx );
-    keccak256_4way( &ctx, input, 80 );
+    keccak256_4way_update( &ctx, input, 80 );
    keccak256_4way_close( &ctx, state );
 }

@@ -28,8 +78,8 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-//   const uint32_t Htarg = ptarget[7];
-    int thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do {
@@ -39,7 +89,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
      keccakhash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 )
+      if ( hash7[ lane<<1 ] < Htarg )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -3,30 +3,36 @@

 bool register_keccak_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  opt_target_factor = 128.0;
-#if defined (KECCAK_4WAY)
+#if defined (KECCAK_8WAY)
+  gate->scanhash  = (void*)&scanhash_keccak_8way;
+  gate->hash      = (void*)&keccakhash_8way;
+#elif defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
 #else
-  gate->scanhash        = (void*)&scanhash_keccak;
-  gate->hash            = (void*)&keccakhash;
+  gate->scanhash  = (void*)&scanhash_keccak;
+  gate->hash      = (void*)&keccakhash;
 #endif
  return true;
 };

 bool register_keccakc_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
  gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
  opt_target_factor = 256.0;
-#if defined (KECCAK_4WAY)
+#if defined (KECCAK_8WAY)
+  gate->scanhash  = (void*)&scanhash_keccak_8way;
+  gate->hash      = (void*)&keccakhash_8way;
+#elif defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
 #else
-  gate->scanhash        = (void*)&scanhash_keccak;
-  gate->hash            = (void*)&keccakhash;
+  gate->scanhash  = (void*)&scanhash_keccak;
+  gate->hash      = (void*)&keccakhash;
 #endif
  return true;
 };
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -1,23 +1,33 @@
 #ifndef KECCAK_GATE_H__
-#define KECCAK_GATE_H__
+#define KECCAK_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
-  #define KECCAK_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define KECCAK_8WAY 1
+#elif defined(__AVX2__)
+  #define KECCAK_4WAY 1
 #endif

-#if defined(KECCAK_4WAY)
+#if defined(KECCAK_8WAY)
+
+void keccakhash_8way( void *state, const void *input );
+int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(KECCAK_4WAY)

 void keccakhash_4way( void *state, const void *input );
 int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

-#endif
+#else

 void keccakhash( void *state, const void *input );
 int scanhash_keccak( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );

 #endif
+
+#endif
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -1,23 +1,24 @@
 #include <stddef.h>
+#include <stdint.h>
 #include "keccak-hash-4way.h"

-#if defined(__AVX2__)
-
-static const sph_u64 RC[] = {
-        SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
-        SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
-        SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
-        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
-        SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
-        SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
-        SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
-        SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
-        SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
-        SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
-        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
-        SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
+static const uint64_t RC[] = {
+        0x0000000000000001, 0x0000000000008082,
+        0x800000000000808A, 0x8000000080008000,
+        0x000000000000808B, 0x0000000080000001,
+        0x8000000080008081, 0x8000000000008009,
+        0x000000000000008A, 0x0000000000000088,
+        0x0000000080008009, 0x000000008000000A,
+        0x000000008000808B, 0x800000000000008B,
+        0x8000000000008089, 0x8000000000008003,
+        0x8000000000008002, 0x8000000000000080,
+        0x000000000000800A, 0x800000008000000A,
+        0x8000000080008081, 0x8000000000008080,
+        0x0000000080000001, 0x8000000080008008
 };

+// generic macros
+
 #define a00   (kc->w[ 0])
 #define a10   (kc->w[ 1])
 #define a20   (kc->w[ 2])
@@ -48,6 +49,197 @@ static const sph_u64 RC[] = {
 #define READ_STATE(sc)
 #define WRITE_STATE(sc)

+#define MOV64(d, s)      (d = s)     // plain assignment, independent of the vector width
+#define XOR64_IOTA       XOR64       // the iota step uses whichever XOR64 the current target defines
+
+#define LPAR   (                     // parentheses spelled as macros so a macro call can be assembled from pieces
+#define RPAR   )                     // (presumably for KF_ELT in keccak-macros.c - TODO confirm)
+
+#define DO(x)   x                    // expands to its argument; used to define KECCAK_F_1600
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define INPUT_BUF(size)   do { /* XOR the first size/8 vectors of buf into the state words */ \
+    size_t j; \
+    for (j = 0; j < (size>>3); j++ ) /* size is in bytes, one vector per 8 bytes */ \
+        kc->w[j ] = _mm512_xor_si512( kc->w[j], buf[j] ); /* kc and buf come from the expanding scope */ \
+} while (0)
+
+// Targeted macros, keccak-macros.c is included for each target.
+
+#define DECL64(x)        __m512i x                             // declare one 512-bit working variable
+#define XOR64(d, a, b)   (d = _mm512_xor_si512(a,b))            // d = a ^ b
+#define AND64(d, a, b)   (d = _mm512_and_si512(a,b))            // d = a & b
+#define OR64(d, a, b)    (d = _mm512_or_si512(a,b))             // d = a | b
+#define NOT64(d, s)      (d = _mm512_xor_si512(s,m512_neg1))    // d = s ^ m512_neg1 (presumably all ones, i.e. bitwise NOT)
+#define ROL64(d, v, n)   (d = mm512_rol_64(v, n))               // d = mm512_rol_64(v, n); by its name a 64-bit rotate left - TODO confirm
+
+#include "keccak-macros.c"
+
+#define KECCAK_F_1600   DO(KECCAK_F_1600_512)   // the generic name resolves to the 512-bit variant in this section
+
+#define KECCAK_F_1600_512   do { /* 24 KF_ELT steps: 3 loop passes of 8 unrolled steps */ \
+    int j; \
+    for (j = 0; j < 24; j += 8) \
+    { \
+       KF_ELT( 0,  1, _mm512_set1_epi64( RC[j + 0] ) ); /* third argument: RC[j+k] copied to every 64-bit element */ \
+       KF_ELT( 1,  2, _mm512_set1_epi64( RC[j + 1] ) ); \
+       KF_ELT( 2,  3, _mm512_set1_epi64( RC[j + 2] ) ); \
+       KF_ELT( 3,  4, _mm512_set1_epi64( RC[j + 3] ) ); \
+       KF_ELT( 4,  5, _mm512_set1_epi64( RC[j + 4] ) ); \
+       KF_ELT( 5,  6, _mm512_set1_epi64( RC[j + 5] ) ); \
+       KF_ELT( 6,  7, _mm512_set1_epi64( RC[j + 6] ) ); \
+       KF_ELT( 7,  8, _mm512_set1_epi64( RC[j + 7] ) ); \
+       P8_TO_P0; /* presumably moves the words from the P8 arrangement back to P0 for the next pass - TODO confirm */ \
+    } \
+} while (0)
+
+// Reset an 8-way Keccak context. All 25 state words start at zero except the
+// six that the "lane complement" representation keeps inverted, which start
+// as all ones. out_size is the digest size in bits.
+static void keccak64_8way_init( keccak64_ctx_m512i *kc, unsigned out_size )
+{
+   // Indices of the state words that are stored complemented.
+   static const int inverted[6] = { 1, 2, 8, 12, 17, 20 };
+   const __m512i all_zero = m512_zero;
+   const __m512i all_ones = m512_neg1;
+
+   for ( int i = 0; i < 25; i++ )
+      kc->w[i] = all_zero;
+   for ( int i = 0; i < 6; i++ )
+      kc->w[ inverted[i] ] = all_ones;
+
+   kc->ptr = 0;
+   kc->lim = 200 - (out_size >> 2);
+}
+
+static void   // Absorb len bytes per lane of data; the permutation runs each time lim bytes per lane are buffered.
+keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
+               size_t lim )
+{
+    __m512i *buf;
+    __m512i *vdata = (__m512i*)data;   // input viewed as vectors, one vector per 8 bytes of len
+    size_t ptr;
+    DECL_STATE
+
+    buf = kc->buf;
+    ptr = kc->ptr;   // bytes per lane already buffered
+
+    if ( len < (lim - ptr) )   // input does not complete a block: just buffer it
+    {
+        memcpy_512( buf + (ptr>>3), vdata, len>>3 );
+        kc->ptr = ptr + len;
+        return;
+    }
+    READ_STATE( kc );
+    while ( len > 0 )
+    {
+        size_t clen;
+
+        clen = (lim - ptr);   // room left in the current block
+        if ( clen > len )
+             clen = len;
+        memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
+        ptr += clen;
+        vdata = vdata + (clen>>3);
+        len -= clen;
+        if ( ptr == lim )   // block complete
+        {
+            INPUT_BUF( lim );   // XOR the block into the state
+            KECCAK_F_1600;      // and run the permutation
+            ptr = 0;
+        }
+    }
+    WRITE_STATE( kc );
+    kc->ptr = ptr;   // remember how much of the next block is buffered
+}
+
+// Finish an 8-way Keccak hash: absorb the padding, undo the "lane
+// complement" on the six inverted state words and copy the digest out.
+//
+//   kc        context; kc->ptr is the number of bytes per lane still buffered
+//   dst       receives byte_len/8 vectors taken from the start of the state
+//   byte_len  digest size in bytes per lane
+//   lim       rate in bytes per lane (the visible callers pass 136 or 72)
+//
+// The padding puts the value eb (1) in the first free 64-bit word and
+// 0x8000000000000000 in the last word of the block; when only one word is
+// free both are combined in it.
+static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
+                                 size_t byte_len, size_t lim )
+{
+    // The padding is at most one rate block, i.e. lim/8 vectors, and the
+    // rate cannot exceed the 200-byte (25 word) state. A fixed array of 25
+    // vectors therefore always suffices and avoids a variable-length array
+    // inside a union, which is a GNU extension.
+    __m512i pad[25];
+    const unsigned eb = 0x100  >> 8;
+    const size_t m512_len = byte_len >> 3;
+    size_t j;
+
+    if ( kc->ptr == (lim - 8) )
+    {
+        // Exactly one word left: first and last padding marker share it.
+        const uint64_t t = eb | 0x8000000000000000;
+        pad[0] = m512_const1_64( t );
+        j = 8;
+    }
+    else
+    {
+        j = lim - kc->ptr;
+        pad[0] = m512_const1_64( eb );
+        memset_zero_512( pad + 1, (j>>3) - 2 );
+        pad[ (j>>3) - 1] = m512_const1_64( 0x8000000000000000 );
+    }
+    // j bytes per lane complete the current block, so this call runs the
+    // final permutation.
+    keccak64_8way_core( kc, pad, j, lim );
+    /* Finalize the "lane complement" */
+    NOT64( kc->w[ 1], kc->w[ 1] );
+    NOT64( kc->w[ 2], kc->w[ 2] );
+    NOT64( kc->w[ 8], kc->w[ 8] );
+    NOT64( kc->w[12], kc->w[12] );
+    NOT64( kc->w[17], kc->w[17] );
+    NOT64( kc->w[20], kc->w[20] );
+    memcpy_512( dst, kc->w, m512_len );
+}
+
+// Public 8-way entry points. The rate handed to the core and close routines
+// is 200 - digest_bits/4 bytes per lane: 136 for Keccak-256 and 72 for
+// Keccak-512, matching what the init routine stores in the context.
+
+void keccak256_8way_init( void *kc )
+{
+   keccak64_ctx_m512i *ctx = kc;
+   keccak64_8way_init( ctx, 256 );
+}
+
+void keccak256_8way_update( void *cc, const void *data, size_t len )
+{
+   keccak64_ctx_m512i *ctx = cc;
+   keccak64_8way_core( ctx, data, len, 136 );
+}
+
+// Writes 32 bytes per lane to dst.
+void keccak256_8way_close( void *cc, void *dst )
+{
+   keccak64_ctx_m512i *ctx = cc;
+   keccak64_8way_close( ctx, dst, 32, 136 );
+}
+
+void keccak512_8way_init( void *kc )
+{
+   keccak64_ctx_m512i *ctx = kc;
+   keccak64_8way_init( ctx, 512 );
+}
+
+void keccak512_8way_update( void *cc, const void *data, size_t len )
+{
+   keccak64_ctx_m512i *ctx = cc;
+   keccak64_8way_core( ctx, data, len, 72 );
+}
+
+// Writes 64 bytes per lane to dst.
+void keccak512_8way_close( void *cc, void *dst )
+{
+   keccak64_ctx_m512i *ctx = cc;
+   keccak64_8way_close( ctx, dst, 64, 72 );
+}
+
+#undef INPUT_BUF
+#undef DECL64
+#undef XOR64
+#undef AND64
+#undef OR64
+#undef NOT64
+#undef ROL64
+#undef KECCAK_F_1600
+
+#endif  // AVX512
+
+#if defined(__AVX2__)
+
 #define INPUT_BUF(size)   do { \
    size_t j; \
    for (j = 0; j < (size>>3); j++ ) \
@@ -55,314 +247,28 @@ static const sph_u64 RC[] = {
 } while (0)

 #define DECL64(x)        __m256i x
-#define MOV64(d, s)      (d = s)
 #define XOR64(d, a, b)   (d = _mm256_xor_si256(a,b))
 #define AND64(d, a, b)   (d = _mm256_and_si256(a,b))
 #define OR64(d, a, b)    (d = _mm256_or_si256(a,b))
 #define NOT64(d, s)      (d = _mm256_xor_si256(s,m256_neg1))
 #define ROL64(d, v, n)   (d = mm256_rol_64(v, n))
-#define XOR64_IOTA       XOR64

-#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
-                DECL64(tt0); \
-                DECL64(tt1); \
-                DECL64(tt2); \
-                DECL64(tt3); \
-                XOR64(tt0, d0, d1); \
-                XOR64(tt1, d2, d3); \
-                XOR64(tt0, tt0, d4); \
-                XOR64(tt0, tt0, tt1); \
-                ROL64(tt0, tt0, 1); \
-                XOR64(tt2, c0, c1); \
-                XOR64(tt3, c2, c3); \
-                XOR64(tt0, tt0, c4); \
-                XOR64(tt2, tt2, tt3); \
-                XOR64(t, tt0, tt2); \
-        } while (0)
+#include "keccak-macros.c"

-#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
-        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
-        b40, b41, b42, b43, b44) \
-        do { \
-                DECL64(t0); \
-                DECL64(t1); \
-                DECL64(t2); \
-                DECL64(t3); \
-                DECL64(t4); \
-                TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
-                TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
-                TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
-                TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
-                TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
-                XOR64(b00, b00, t0); \
-                XOR64(b01, b01, t0); \
-                XOR64(b02, b02, t0); \
-                XOR64(b03, b03, t0); \
-                XOR64(b04, b04, t0); \
-                XOR64(b10, b10, t1); \
-                XOR64(b11, b11, t1); \
-                XOR64(b12, b12, t1); \
-                XOR64(b13, b13, t1); \
-                XOR64(b14, b14, t1); \
-                XOR64(b20, b20, t2); \
-                XOR64(b21, b21, t2); \
-                XOR64(b22, b22, t2); \
-                XOR64(b23, b23, t2); \
-                XOR64(b24, b24, t2); \
-                XOR64(b30, b30, t3); \
-                XOR64(b31, b31, t3); \
-                XOR64(b32, b32, t3); \
-                XOR64(b33, b33, t3); \
-                XOR64(b34, b34, t3); \
-                XOR64(b40, b40, t4); \
-                XOR64(b41, b41, t4); \
-                XOR64(b42, b42, t4); \
-                XOR64(b43, b43, t4); \
-                XOR64(b44, b44, t4); \
-        } while (0)
+#define KECCAK_F_1600   DO(KECCAK_F_1600_256)

-#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
-        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
-        b40, b41, b42, b43, b44) \
-        do { \
-                /* ROL64(b00, b00,  0); */ \
-                ROL64(b01, b01, 36); \
-                ROL64(b02, b02,  3); \
-                ROL64(b03, b03, 41); \
-                ROL64(b04, b04, 18); \
-                ROL64(b10, b10,  1); \
-                ROL64(b11, b11, 44); \
-                ROL64(b12, b12, 10); \
-                ROL64(b13, b13, 45); \
-                ROL64(b14, b14,  2); \
-                ROL64(b20, b20, 62); \
-                ROL64(b21, b21,  6); \
-                ROL64(b22, b22, 43); \
-                ROL64(b23, b23, 15); \
-                ROL64(b24, b24, 61); \
-                ROL64(b30, b30, 28); \
-                ROL64(b31, b31, 55); \
-                ROL64(b32, b32, 25); \
-                ROL64(b33, b33, 21); \
-                ROL64(b34, b34, 56); \
-                ROL64(b40, b40, 27); \
-                ROL64(b41, b41, 20); \
-                ROL64(b42, b42, 39); \
-                ROL64(b43, b43,  8); \
-                ROL64(b44, b44, 14); \
-        } while (0)
-
-/*
- * The KHI macro integrates the "lane complement" optimization. On input,
- * some words are complemented:
- *    a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
- * On output, the following words are complemented:
- *    a04 a10 a20 a22 a23 a31
- *
- * The (implicit) permutation and the theta expansion will bring back
- * the input mask for the next round.
- */
-
-#define KHI_XO(d, a, b, c)   do { \
-                DECL64(kt); \
-                OR64(kt, b, c); \
-                XOR64(d, a, kt); \
-        } while (0)
-
-#define KHI_XA(d, a, b, c)   do { \
-                DECL64(kt); \
-                AND64(kt, b, c); \
-                XOR64(d, a, kt); \
-        } while (0)
-
-#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
-        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
-        b40, b41, b42, b43, b44) \
-        do { \
-                DECL64(c0); \
-                DECL64(c1); \
-                DECL64(c2); \
-                DECL64(c3); \
-                DECL64(c4); \
-                DECL64(bnn); \
-                NOT64(bnn, b20); \
-                KHI_XO(c0, b00, b10, b20); \
-                KHI_XO(c1, b10, bnn, b30); \
-                KHI_XA(c2, b20, b30, b40); \
-                KHI_XO(c3, b30, b40, b00); \
-                KHI_XA(c4, b40, b00, b10); \
-                MOV64(b00, c0); \
-                MOV64(b10, c1); \
-                MOV64(b20, c2); \
-                MOV64(b30, c3); \
-                MOV64(b40, c4); \
-                NOT64(bnn, b41); \
-                KHI_XO(c0, b01, b11, b21); \
-                KHI_XA(c1, b11, b21, b31); \
-                KHI_XO(c2, b21, b31, bnn); \
-                KHI_XO(c3, b31, b41, b01); \
-                KHI_XA(c4, b41, b01, b11); \
-                MOV64(b01, c0); \
-                MOV64(b11, c1); \
-                MOV64(b21, c2); \
-                MOV64(b31, c3); \
-                MOV64(b41, c4); \
-                NOT64(bnn, b32); \
-                KHI_XO(c0, b02, b12, b22); \
-                KHI_XA(c1, b12, b22, b32); \
-                KHI_XA(c2, b22, bnn, b42); \
-                KHI_XO(c3, bnn, b42, b02); \
-                KHI_XA(c4, b42, b02, b12); \
-                MOV64(b02, c0); \
-                MOV64(b12, c1); \
-                MOV64(b22, c2); \
-                MOV64(b32, c3); \
-                MOV64(b42, c4); \
-                NOT64(bnn, b33); \
-                KHI_XA(c0, b03, b13, b23); \
-                KHI_XO(c1, b13, b23, b33); \
-                KHI_XO(c2, b23, bnn, b43); \
-                KHI_XA(c3, bnn, b43, b03); \
-                KHI_XO(c4, b43, b03, b13); \
-                MOV64(b03, c0); \
-                MOV64(b13, c1); \
-                MOV64(b23, c2); \
-                MOV64(b33, c3); \
-                MOV64(b43, c4); \
-                NOT64(bnn, b14); \
-                KHI_XA(c0, b04, bnn, b24); \
-                KHI_XO(c1, bnn, b24, b34); \
-                KHI_XA(c2, b24, b34, b44); \
-                KHI_XO(c3, b34, b44, b04); \
-                KHI_XA(c4, b44, b04, b14); \
-                MOV64(b04, c0); \
-                MOV64(b14, c1); \
-                MOV64(b24, c2); \
-                MOV64(b34, c3); \
-                MOV64(b44, c4); \
-        } while (0)
-
-#define IOTA(r)   XOR64_IOTA(a00, a00, r)
-
-#define P0    a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
-              a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
-#define P1    a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
-              a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
-#define P2    a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
-              a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
-#define P3    a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
-              a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
-#define P4    a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
-              a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
-#define P5    a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
-              a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
-#define P6    a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
-              a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
-#define P7    a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
-              a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
-#define P8    a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
-              a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
-#define P9    a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
-              a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
-#define P10   a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
-              a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
-#define P11   a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
-              a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
-#define P12   a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
-              a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
-#define P13   a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
-              a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
-#define P14   a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
-              a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
-#define P15   a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
-              a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
-#define P16   a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
-              a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
-#define P17   a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
-              a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
-#define P18   a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
-              a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
-#define P19   a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
-              a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
-#define P20   a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
-              a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
-#define P21   a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
-              a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
-#define P22   a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
-              a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
-#define P23   a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
-              a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
-
-#define P8_TO_P0   do { \
-                DECL64(t); \
-                MOV64(t, a01); \
-                MOV64(a01, a11); \
-                MOV64(a11, a43); \
-                MOV64(a43, t); \
-                MOV64(t, a02); \
-                MOV64(a02, a22); \
-                MOV64(a22, a31); \
-                MOV64(a31, t); \
-                MOV64(t, a03); \
-                MOV64(a03, a33); \
-                MOV64(a33, a24); \
-                MOV64(a24, t); \
-                MOV64(t, a04); \
-                MOV64(a04, a44); \
-                MOV64(a44, a12); \
-                MOV64(a12, t); \
-                MOV64(t, a10); \
-                MOV64(a10, a32); \
-                MOV64(a32, a13); \
-                MOV64(a13, t); \
-                MOV64(t, a14); \
-                MOV64(a14, a21); \
-                MOV64(a21, a20); \
-                MOV64(a20, t); \
-                MOV64(t, a23); \
-                MOV64(a23, a42); \
-                MOV64(a42, a40); \
-                MOV64(a40, t); \
-                MOV64(t, a30); \
-                MOV64(a30, a41); \
-                MOV64(a41, a34); \
-                MOV64(a34, t); \
-        } while (0)
-
-#define LPAR   (
-#define RPAR   )
-
-#define KF_ELT(r, s, k)   do { \
-                THETA LPAR P ## r RPAR; \
-                RHO LPAR P ## r RPAR; \
-                KHI LPAR P ## s RPAR; \
-                IOTA(k); \
-        } while (0)
-
-#define DO(x)   x
-
-#define KECCAK_F_1600   DO(KECCAK_F_1600_)
-
-#define KECCAK_F_1600_   do { \
+#define KECCAK_F_1600_256   do { \
    int j; \
    for (j = 0; j < 24; j += 8) \
    { \
-       KF_ELT( 0,  1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \
-                                       RC[j + 0], RC[j + 0])) ); \
-       KF_ELT( 1,  2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \
-                                       RC[j + 1], RC[j + 1])) ); \
-       KF_ELT( 2,  3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \
-                                       RC[j + 2], RC[j + 2])) ); \
-       KF_ELT( 3,  4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \
-                                       RC[j + 3], RC[j + 3])) ); \
-       KF_ELT( 4,  5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \
-                                       RC[j + 4], RC[j + 4])) ); \
-       KF_ELT( 5,  6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \
-                                       RC[j + 5], RC[j + 5])) ); \
-       KF_ELT( 6,  7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \
-                                       RC[j + 6], RC[j + 6])) ); \
-       KF_ELT( 7,  8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \
-                                       RC[j + 7], RC[j + 7])) ); \
+       KF_ELT( 0,  1, _mm256_set1_epi64x( RC[j + 0] ) ); \
+       KF_ELT( 1,  2, _mm256_set1_epi64x( RC[j + 1] ) ); \
+       KF_ELT( 2,  3, _mm256_set1_epi64x( RC[j + 2] ) ); \
+       KF_ELT( 3,  4, _mm256_set1_epi64x( RC[j + 3] ) ); \
+       KF_ELT( 4,  5, _mm256_set1_epi64x( RC[j + 4] ) ); \
+       KF_ELT( 5,  6, _mm256_set1_epi64x( RC[j + 5] ) ); \
+       KF_ELT( 6,  7, _mm256_set1_epi64x( RC[j + 6] ) ); \
+       KF_ELT( 7,  8, _mm256_set1_epi64x( RC[j + 7] ) ); \
       P8_TO_P0; \
    } \
 } while (0)
@@ -453,7 +359,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    else
    {
        j = lim - kc->ptr;
-        u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
+        u.tmp[0] = m256_const1_64( eb );
        memset_zero_256( u.tmp + 1, (j>>3) - 2 );
        u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 );
    }
@@ -474,7 +380,7 @@ void keccak256_4way_init( void *kc )
 }

 void
-keccak256_4way(void *cc, const void *data, size_t len)
+keccak256_4way_update(void *cc, const void *data, size_t len)
 {
    keccak64_core(cc, data, len, 136);
 }
@@ -491,15 +397,24 @@ void keccak512_4way_init( void *kc )
 }

 void
-keccak512_4way(void *cc, const void *data, size_t len)
+keccak512_4way_update(void *cc, const void *data, size_t len)
 {
-        keccak64_core(cc, data, len, 72);
+   keccak64_core(cc, data, len, 72);
 }

 void
 keccak512_4way_close(void *cc, void *dst)
 {
-        keccak64_close(cc, dst, 64, 72);
+   keccak64_close(cc, dst, 64, 72);
 }

-#endif
+#undef INPUT_BUF
+#undef DECL64
+#undef XOR64
+#undef AND64
+#undef OR64
+#undef NOT64
+#undef ROL64
+#undef KECCAK_F_1600
+
+#endif  // AVX2
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -64,26 +64,49 @@ extern "C"{
 * <code>memcpy()</code>).
 */

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 typedef struct {
-        __m256i buf[144*8];    /* first field, for alignment */
+        __m512i buf[144*8];
+        __m512i w[25];
+        size_t ptr, lim;
+} keccak64_ctx_m512i __attribute__((aligned(128)));
+
+typedef keccak64_ctx_m512i keccak256_8way_context;
+typedef keccak64_ctx_m512i keccak512_8way_context;
+
+void keccak256_8way_init(void *cc);
+void keccak256_8way_update(void *cc, const void *data, size_t len);
+void keccak256_8way_close(void *cc, void *dst);
+
+void keccak512_8way_init(void *cc);
+void keccak512_8way_update(void *cc, const void *data, size_t len);
+void keccak512_8way_close(void *cc, void *dst);
+void keccak512_8way_addbits_and_close(
+        void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif   
+
+typedef struct {
+        __m256i buf[144*8];  
        __m256i w[25];
        size_t ptr, lim;
-//        sph_u64 wide[25];
-} keccak64_ctx_m256i;
+} keccak64_ctx_m256i __attribute__((aligned(128)));

 typedef keccak64_ctx_m256i keccak256_4way_context;
 typedef keccak64_ctx_m256i keccak512_4way_context;

 void keccak256_4way_init(void *cc);
-void keccak256_4way(void *cc, const void *data, size_t len);
+void keccak256_4way_update(void *cc, const void *data, size_t len);
 void keccak256_4way_close(void *cc, void *dst);
-
+#define keccak256_4way keccak256_4way_update

 void keccak512_4way_init(void *cc);
-void keccak512_4way(void *cc, const void *data, size_t len);
+void keccak512_4way_update(void *cc, const void *data, size_t len);
 void keccak512_4way_close(void *cc, void *dst);
 void keccak512_4way_addbits_and_close(
        void *cc, unsigned ub, unsigned n, void *dst);
+#define keccak512_4way keccak512_4way_update

 #endif

--- a/algo/keccak/keccak-macros.c
+++ b/algo/keccak/keccak-macros.c
@@ -0,0 +1,324 @@
+#ifdef TH_ELT   /* TH_ELT: one theta term, t = (c0^c1^c2^c3^c4) ^ ROL(d0^d1^d2^d3^d4, 1) */
+#undef TH_ELT
+#endif
+#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
+                DECL64(tt0); /* DECL64/XOR64/ROL64 are not defined in this file; assumed to write their first argument */ \
+                DECL64(tt1); \
+                DECL64(tt2); \
+                DECL64(tt3); \
+                XOR64(tt0, d0, d1); \
+                XOR64(tt1, d2, d3); \
+                XOR64(tt0, tt0, d4); \
+                XOR64(tt0, tt0, tt1); /* tt0 = d0^d1^d2^d3^d4 */ \
+                ROL64(tt0, tt0, 1); \
+                XOR64(tt2, c0, c1); \
+                XOR64(tt3, c2, c3); \
+                XOR64(tt0, tt0, c4); \
+                XOR64(tt2, tt2, tt3); /* tt2 = c0^c1^c2^c3 (c4 was already folded into tt0) */ \
+                XOR64(t, tt0, tt2); \
+        } while (0)
+
+#ifdef THETA   /* THETA: computes five TH_ELT terms t0..t4, then XORs t<i> into lanes b<i>0..b<i>4 */
+#undef THETA
+#endif
+#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                DECL64(t0); \
+                DECL64(t1); \
+                DECL64(t2); \
+                DECL64(t3); \
+                DECL64(t4); \
+                TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); /* all five terms are taken before any lane is modified */ \
+                TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
+                TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
+                TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
+                TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
+                XOR64(b00, b00, t0); /* t0 -> b00..b04 */ \
+                XOR64(b01, b01, t0); \
+                XOR64(b02, b02, t0); \
+                XOR64(b03, b03, t0); \
+                XOR64(b04, b04, t0); \
+                XOR64(b10, b10, t1); /* t1 -> b10..b14 */ \
+                XOR64(b11, b11, t1); \
+                XOR64(b12, b12, t1); \
+                XOR64(b13, b13, t1); \
+                XOR64(b14, b14, t1); \
+                XOR64(b20, b20, t2); /* t2 -> b20..b24 */ \
+                XOR64(b21, b21, t2); \
+                XOR64(b22, b22, t2); \
+                XOR64(b23, b23, t2); \
+                XOR64(b24, b24, t2); \
+                XOR64(b30, b30, t3); /* t3 -> b30..b34 */ \
+                XOR64(b31, b31, t3); \
+                XOR64(b32, b32, t3); \
+                XOR64(b33, b33, t3); \
+                XOR64(b34, b34, t3); \
+                XOR64(b40, b40, t4); /* t4 -> b40..b44 */ \
+                XOR64(b41, b41, t4); \
+                XOR64(b42, b42, t4); \
+                XOR64(b43, b43, t4); \
+                XOR64(b44, b44, t4); \
+        } while (0)
+
+#ifdef RHO   /* RHO: rotates each lane left in place by its own fixed bit count */
+#undef RHO
+#endif
+#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                /* b00 has rotation count 0, so no ROL64 is issued for it */ \
+                ROL64(b01, b01, 36); \
+                ROL64(b02, b02,  3); \
+                ROL64(b03, b03, 41); \
+                ROL64(b04, b04, 18); \
+                ROL64(b10, b10,  1); \
+                ROL64(b11, b11, 44); \
+                ROL64(b12, b12, 10); \
+                ROL64(b13, b13, 45); \
+                ROL64(b14, b14,  2); \
+                ROL64(b20, b20, 62); \
+                ROL64(b21, b21,  6); \
+                ROL64(b22, b22, 43); \
+                ROL64(b23, b23, 15); \
+                ROL64(b24, b24, 61); \
+                ROL64(b30, b30, 28); \
+                ROL64(b31, b31, 55); \
+                ROL64(b32, b32, 25); \
+                ROL64(b33, b33, 21); \
+                ROL64(b34, b34, 56); \
+                ROL64(b40, b40, 27); \
+                ROL64(b41, b41, 20); \
+                ROL64(b42, b42, 39); \
+                ROL64(b43, b43,  8); \
+                ROL64(b44, b44, 14); \
+        } while (0)
+
+/*
+ * The KHI macro integrates the "lane complement" optimization. On input,
+ * some words are complemented:
+ *    a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
+ * On output, the following words are complemented:
+ *    a04 a10 a20 a22 a23 a31
+ *
+ * The (implicit) permutation and the theta expansion will bring back
+ * the input mask for the next round.
+ */
+
+#ifdef KHI_XO   /* KHI_XO: d = a ^ (b | c), via a scratch value */
+#undef KHI_XO
+#endif
+#define KHI_XO(d, a, b, c)   do { \
+                DECL64(kt); \
+                OR64(kt, b, c); /* OR64/XOR64 are not defined in this file; assumed to write their first argument */ \
+                XOR64(d, a, kt); \
+        } while (0)
+
+#ifdef KHI_XA   /* KHI_XA: d = a ^ (b & c), via a scratch value */
+#undef KHI_XA
+#endif
+#define KHI_XA(d, a, b, c)   do { \
+                DECL64(kt); \
+                AND64(kt, b, c); /* AND64/XOR64 are not defined in this file; assumed to write their first argument */ \
+                XOR64(d, a, kt); \
+        } while (0)
+
+#ifdef KHI   /* KHI: processes the 25 lanes as five groups b0k..b4k (k = 0..4), five outputs per group */
+#undef KHI
+#endif
+#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                DECL64(c0); \
+                DECL64(c1); \
+                DECL64(c2); \
+                DECL64(c3); \
+                DECL64(c4); \
+                DECL64(bnn); /* scratch: complement of one lane per group, substituted for that lane in some terms */ \
+                NOT64(bnn, b20); \
+                KHI_XO(c0, b00, b10, b20); \
+                KHI_XO(c1, b10, bnn, b30); \
+                KHI_XA(c2, b20, b30, b40); \
+                KHI_XO(c3, b30, b40, b00); \
+                KHI_XA(c4, b40, b00, b10); \
+                MOV64(b00, c0); /* write back only after all five outputs were computed from the old lanes */ \
+                MOV64(b10, c1); \
+                MOV64(b20, c2); \
+                MOV64(b30, c3); \
+                MOV64(b40, c4); \
+                NOT64(bnn, b41); \
+                KHI_XO(c0, b01, b11, b21); \
+                KHI_XA(c1, b11, b21, b31); \
+                KHI_XO(c2, b21, b31, bnn); \
+                KHI_XO(c3, b31, b41, b01); \
+                KHI_XA(c4, b41, b01, b11); \
+                MOV64(b01, c0); \
+                MOV64(b11, c1); \
+                MOV64(b21, c2); \
+                MOV64(b31, c3); \
+                MOV64(b41, c4); \
+                NOT64(bnn, b32); \
+                KHI_XO(c0, b02, b12, b22); \
+                KHI_XA(c1, b12, b22, b32); \
+                KHI_XA(c2, b22, bnn, b42); \
+                KHI_XO(c3, bnn, b42, b02); \
+                KHI_XA(c4, b42, b02, b12); \
+                MOV64(b02, c0); \
+                MOV64(b12, c1); \
+                MOV64(b22, c2); \
+                MOV64(b32, c3); \
+                MOV64(b42, c4); \
+                NOT64(bnn, b33); \
+                KHI_XA(c0, b03, b13, b23); \
+                KHI_XO(c1, b13, b23, b33); \
+                KHI_XO(c2, b23, bnn, b43); \
+                KHI_XA(c3, bnn, b43, b03); \
+                KHI_XO(c4, b43, b03, b13); \
+                MOV64(b03, c0); \
+                MOV64(b13, c1); \
+                MOV64(b23, c2); \
+                MOV64(b33, c3); \
+                MOV64(b43, c4); \
+                NOT64(bnn, b14); \
+                KHI_XA(c0, b04, bnn, b24); \
+                KHI_XO(c1, bnn, b24, b34); \
+                KHI_XA(c2, b24, b34, b44); \
+                KHI_XO(c3, b34, b44, b04); \
+                KHI_XA(c4, b44, b04, b14); \
+                MOV64(b04, c0); \
+                MOV64(b14, c1); \
+                MOV64(b24, c2); \
+                MOV64(b34, c3); \
+                MOV64(b44, c4); \
+        } while (0)
+
+#ifdef IOTA
+#undef IOTA
+#endif
+#define IOTA(r)   XOR64_IOTA(a00, a00, r)   /* only lane a00 is involved; XOR64_IOTA is not defined in this file */
+
+/* #undef ignores names that are not defined, so every P<n> is cleared here without an #ifdef guard. */
+#undef P0
+#undef P1
+#undef P2
+#undef P3
+#undef P4
+#undef P5
+#undef P6
+#undef P7
+#undef P8
+#undef P9
+#undef P10
+#undef P11
+#undef P12
+#undef P13
+#undef P14
+#undef P15
+#undef P16
+#undef P17
+#undef P18
+#undef P19
+#undef P20
+#undef P21
+#undef P22
+#undef P23
+
+#define P0    a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
+              a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44   /* P0: lanes in natural order; P1..P23: the 25-lane orderings KF_ELT selects via P##r / P##s */
+#define P1    a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
+              a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
+#define P2    a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
+              a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
+#define P3    a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
+              a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
+#define P4    a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
+              a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
+#define P5    a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
+              a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
+#define P6    a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
+              a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
+#define P7    a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
+              a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
+#define P8    a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
+              a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
+#define P9    a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
+              a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
+#define P10   a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
+              a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
+#define P11   a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
+              a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
+#define P12   a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
+              a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
+#define P13   a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
+              a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
+#define P14   a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
+              a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
+#define P15   a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
+              a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
+#define P16   a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
+              a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
+#define P17   a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
+              a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
+#define P18   a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
+              a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
+#define P19   a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
+              a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
+#define P20   a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
+              a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
+#define P21   a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
+              a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
+#define P22   a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
+              a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
+#define P23   a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
+              a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
+
+#ifdef P8_TO_P0   /* P8_TO_P0: moves each value from its slot in the P8 ordering to the same slot of P0, as eight 3-cycles */
+#undef P8_TO_P0
+#endif
+#define P8_TO_P0   do { \
+                DECL64(t); /* single temporary reused by every cycle; a00 keeps its place */ \
+                MOV64(t, a01); /* cycle: a01 <- a11 <- a43 <- old a01 */ \
+                MOV64(a01, a11); \
+                MOV64(a11, a43); \
+                MOV64(a43, t); \
+                MOV64(t, a02); /* cycle: a02 <- a22 <- a31 <- old a02 */ \
+                MOV64(a02, a22); \
+                MOV64(a22, a31); \
+                MOV64(a31, t); \
+                MOV64(t, a03); /* cycle: a03 <- a33 <- a24 <- old a03 */ \
+                MOV64(a03, a33); \
+                MOV64(a33, a24); \
+                MOV64(a24, t); \
+                MOV64(t, a04); /* cycle: a04 <- a44 <- a12 <- old a04 */ \
+                MOV64(a04, a44); \
+                MOV64(a44, a12); \
+                MOV64(a12, t); \
+                MOV64(t, a10); /* cycle: a10 <- a32 <- a13 <- old a10 */ \
+                MOV64(a10, a32); \
+                MOV64(a32, a13); \
+                MOV64(a13, t); \
+                MOV64(t, a14); /* cycle: a14 <- a21 <- a20 <- old a14 */ \
+                MOV64(a14, a21); \
+                MOV64(a21, a20); \
+                MOV64(a20, t); \
+                MOV64(t, a23); /* cycle: a23 <- a42 <- a40 <- old a23 */ \
+                MOV64(a23, a42); \
+                MOV64(a42, a40); \
+                MOV64(a40, t); \
+                MOV64(t, a30); /* cycle: a30 <- a41 <- a34 <- old a30 */ \
+                MOV64(a30, a41); \
+                MOV64(a41, a34); \
+                MOV64(a34, t); \
+        } while (0)
+
+#define KF_ELT(r, s, k)   do { /* one round: THETA and RHO on ordering P##r, KHI on ordering P##s, IOTA with k */ \
+                THETA LPAR P ## r RPAR; /* LPAR/RPAR are not defined in this file; presumably ( and ), so P##r expands to 25 arguments first */ \
+                RHO LPAR P ## r RPAR; \
+                KHI LPAR P ## s RPAR; \
+                IOTA(k); \
+        } while (0)
+
+
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -5,7 +5,6 @@
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 

-
 #if defined (LYRA2REV3_8WAY)

 typedef struct {
@@ -14,7 +13,7 @@ typedef struct {
   bmw256_8way_context       bmw;
 } lyra2v3_8way_ctx_holder;

-static lyra2v3_8way_ctx_holder l2v3_8way_ctx;
+static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;   // thread-local: scanhash_lyra2rev3_8way stores its blake256 state here

 bool init_lyra2rev3_8way_ctx()
 {
@@ -38,7 +37,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
   lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );

-   blake256_8way( &ctx.blake, input, 80 );
+   blake256_8way( &ctx.blake, input + (64*8), 16 );   // last 16 bytes per lane; the first 64 were already hashed in scanhash
   blake256_8way_close( &ctx.blake, vhash );

   dintrlv_8x32( hash0, hash1, hash2, hash3,
@@ -91,7 +90,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
 {
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<3]);
+   uint32_t *hash7 = &hash[7<<3];
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
@@ -99,12 +98,15 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   const int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;

-   if ( opt_benchmark )
-      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   if ( opt_benchmark )  ( (uint32_t*)ptarget )[7] = 0x0000ff;

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
+   // Hash the first 64 bytes of every lane once, before the nonce loop; lyra2rev3_8way_hash copies this state and adds the last 16.
+   blake256_8way_init( &l2v3_8way_ctx.blake );
+   blake256_8way( &l2v3_8way_ctx.blake, vdata, 64 );
+
   do
   {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
@@ -119,8 +121,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
         extr_lane_8x32( lane_hash, hash, lane, 256 );
         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
         {
-              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+             pdata[19] = n + lane;
+             submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
      n += 8;
@@ -133,14 +135,14 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,

 #if defined (LYRA2REV3_4WAY)  

-
 typedef struct {
   blake256_4way_context     blake;
   cubehashParam             cube;
   bmw256_4way_context       bmw;
 } lyra2v3_4way_ctx_holder;

-static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
+// Thread-local: scanhash_lyra2rev3_4way stores its blake256 state here before the nonce loop.
+static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;

 bool init_lyra2rev3_4way_ctx()
 {
@@ -160,7 +162,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
   lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
   memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );

-   blake256_4way( &ctx.blake, input, 80 );
+   // Only the last 16 bytes per lane are hashed here; the first 64 were already hashed in scanhash.
+   blake256_4way( &ctx.blake, input + (64*4), 16 );
   blake256_4way_close( &ctx.blake, vhash );
   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

@@ -206,6 +209,10 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

   mm128_bswap32_intrlv80_4x32( vdata, pdata );
+   // Hash the first 64 bytes of every lane once, before the nonce loop; lyra2rev3_4way_hash copies this state and adds the last 16.
+   blake256_4way_init( &l2v3_4way_ctx.blake );
+   blake256_4way( &l2v3_4way_ctx.blake, vdata, 64 );
+
   do
   {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
--- a/algo/sha/sha256_hash_11way.c
+++ b/algo/sha/sha256_hash_11way.c
@@ -1,538 +0,0 @@
-#if 0
-
-#include <stddef.h>
-#include <string.h>
-
-#include "sha2-hash-4way.h"
-
-#if defined(__AVX2__)
-
-// naming convention for variables and macros
-// VARx: AVX2 8 way 32 bit
-// VARy: MMX 2 way 32 bit
-// VARz: scalar integer 32 bit
-
-
-static const uint32_t H256[8] =
-{
-        0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-        0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-};
-
-static const uint32_t K256[64] = 
-{
-        0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
-        0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
-        0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
-        0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
-        0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
-        0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
-        0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
-        0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
-        0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
-        0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
-        0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
-        0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
-        0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
-        0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
-        0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
-        0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
-};
-
-#define CHx(X, Y, Z) \
-   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 
-
-#define CHy(X, Y, Z) \
-   _mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z )
-
-#define CHz(X, Y, Z) ((( (Y) ^ (Z) ) & (X) ) ^ (Z) )
-
-
-#define MAJx(X, Y, Z) \
-   _mm256_or_si256( _mm256_and_si256( X, Y ), \
-                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
-
-#define MAJy(X, Y, Z) \
-   _mm_or_si64( _mm_and_si64( X, Y ), \
-                    _mm_and_si64( _mm_or_si64( X, Y ), Z ) )
-
-#define MAJz(X, Y, Z)  ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) )
-
-#define BSG2_0x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-       mm256_ror_32(x,2), mm256_ror_32(x,13) ), _mm256_srli_epi32(x,22) )
-
-#define BSG2_0y(x) \
-   _mm_xor_si64( _mm_xor_si64( \
-       mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) )
-
-#define BSG2_0z(x)  ( u32_ror_32(x,2) ^ u32_ror_32(x,13)  ^ ((x)>>22) )
-
-#define BSG2_1x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-       mm256_ror_32(x,6), mm256_ror_32(x,11) ), _mm256_srli_epi32(x,25) )
-
-#define BSG2_1y(x) \
-   _mm_xor_si64( _mm_xor_si64( \
-       mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) )
-
-#define BSG2_1z(x)   ( u32_ror_32(x,6) ^ u32_ror_32(x,11) ^ ((x)>>25) )
-
-#define SSG2_0x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-       mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) ) 
-
-#define SSG2_0y(x) \
-   _mm_xor_si64( _mm_xor_si64( \
-       mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )
-
-#define SSG2_0z(x)  (( u32_ror_32(x,7) ^ u32_ror_32(x,18) ) ^ ((x)>>3) )
-
-#define SSG2_1x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-       mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) )
-
-#define SSG2_1y(x) \
-   _mm_xor_si64( _mm_xor_si64( \
-       mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )
-
-#define SSG2_1z(x)   ( u32_ror_32(x,17) ^ u32_ror_32(x,19)  ^ ((x)>>10) )
-
-#define SHA2x_MEXP( a, b, c, d ) \
-     _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
-                 SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] )
-
-#define SHA2y_MEXP( a, b, c, d ) \
-     _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
-                 SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] )
-
-#define SHA2z_MEXP( a, b, c, d ) \
-               ( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] )
-
-
-#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
-	                  Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
-		          Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
-do { \
-  __m256i T1x, T2x; \
-  __m64 T1y, T2y; \
-  uint32_t T1z, T2z; \
-  T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
-        _mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
-                          _mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \
-  T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
-        _mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
-                          _mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \
-  T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \
-  T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
-  T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
-  T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
-  Dx  = _mm256_add_epi32( Dx,  T1x ); \
-  Dy  = _mm_add_pi32( Dy, T1y ); \
-  Dz  = Dz + T1z; \
-  Hx  = _mm256_add_epi32( T1x, T2x ); \
-  Hy  = _mm_add_pi32( T1y, T2y ); \
-  Hz  = T1z + T2z; \
-} while (0)
-	
-void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
-                         uint32_t *inz, uint32_t rz[8] )
-{
-   __m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
-   __m256i Wx[16];
-   __m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy;
-   __m64 Wy[16];
-   uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz;
-   uint32_t Wz[16];
-
-   Wx[ 0] = mm256_bswap_32( inx[ 0] );
-   Wy[ 0] =  mm64_bswap_32( iny[ 0] );
-   Wz[ 0] =       bswap_32( inz[ 0] );
-
-   Wx[ 1] = mm256_bswap_32( inx[ 1] );
-   Wy[ 1] =  mm64_bswap_32( iny[ 1] );
-   Wz[ 1] =       bswap_32( inz[ 1] );
-
-   Wx[ 2] = mm256_bswap_32( inx[ 2] );
-   Wy[ 2] =  mm64_bswap_32( iny[ 2] );
-   Wz[ 2] =       bswap_32( inz[ 2] );
-
-   Wx[ 3] = mm256_bswap_32( inx[ 3] );
-   Wy[ 3] =  mm64_bswap_32( iny[ 3] );
-   Wz[ 3] =       bswap_32( inz[ 3] );
-
-   Wx[ 4] = mm256_bswap_32( inx[ 4] );
-   Wy[ 4] =  mm64_bswap_32( iny[ 4] );
-   Wz[ 4] =       bswap_32( inz[ 4] );
-
-   Wx[ 5] = mm256_bswap_32( inx[ 5] );
-   Wy[ 5] =  mm64_bswap_32( iny[ 5] );
-   Wz[ 5] =       bswap_32( inz[ 5] );
-
-   Wx[ 6] = mm256_bswap_32( inx[ 6] );
-   Wy[ 6] =  mm64_bswap_32( iny[ 6] );
-   Wz[ 6] =       bswap_32( inz[ 6] );
-
-   Wx[ 7] = mm256_bswap_32( inx[ 7] );
-   Wy[ 7] =  mm64_bswap_32( iny[ 7] );
-   Wz[ 7] =       bswap_32( inz[ 7] );
-
-   Wx[ 8] = mm256_bswap_32( inx[ 8] );
-   Wy[ 8] =  mm64_bswap_32( iny[ 8] );
-   Wz[ 8] =       bswap_32( inz[ 8] );
-
-   Wx[ 9] = mm256_bswap_32( inx[ 9] );
-   Wy[ 9] =  mm64_bswap_32( iny[ 9] );
-   Wz[ 9] =       bswap_32( inz[ 9] );
-
-   Wx[10] = mm256_bswap_32( inx[10] );
-   Wy[10] =  mm64_bswap_32( iny[10] );
-   Wz[10] =       bswap_32( inz[10] );
-
-   Wx[11] = mm256_bswap_32( inx[11] );
-   Wy[11] =  mm64_bswap_32( iny[11] );
-   Wz[11] =       bswap_32( inz[11] );
-
-   Wx[12] = mm256_bswap_32( inx[12] );
-   Wy[12] =  mm64_bswap_32( iny[12] );
-   Wz[12] =       bswap_32( inz[12] );
-
-   Wx[13] = mm256_bswap_32( inx[13] );
-   Wy[13] =  mm64_bswap_32( iny[13] );
-   Wz[13] =       bswap_32( inz[13] );
-
-   Wx[14] = mm256_bswap_32( inx[14] );
-   Wy[14] =  mm64_bswap_32( iny[14] );
-   Wz[14] =       bswap_32( inz[14] );
-
-   Wx[15] = mm256_bswap_32( inx[15] );
-   Wy[15] =  mm64_bswap_32( iny[15] );
-   Wz[15] =       bswap_32( inz[15] );
-
-   Ax = rx[0];     Ay = ry[0];     Az = rz[0];
-   Bx = rx[1];     By = ry[1];     Bz = rz[1];
-   Cx = rx[2];     Cy = ry[2];     Cz = rz[2];
-   Dx = rx[3];     Dy = ry[3];     Dz = rz[3];
-   Ex = rx[4];     Ey = ry[4];     Ez = rz[4];
-   Fx = rx[5];     Fy = ry[5];     Fz = rz[5];
-   Gx = rx[6];     Gy = ry[6];     Gz = rz[6];
-   Hx = rx[7];     Hy = ry[7];     Hz = rz[7];
-
-   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
-                     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
-                     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  0, 0 );
-   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
-		     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
-		     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  1, 0 );
-   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
-		     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
-		     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz,  2, 0 );
-   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
-		     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
-		     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez,  3, 0 );
-   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
-		     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
-		     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz,  4, 0 );
-   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
-		     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
-		     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz,  5, 0 );
-   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
-		     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
-		     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz,  6, 0 );
-   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
-		     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
-		     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az,  7, 0 );
-   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
-		     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
-		     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  8, 0 );
-   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
-		     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
-		     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  9, 0 );
-   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
-		     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
-		     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, 0 );
-   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
-		     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
-		     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, 0 );
-   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
-		     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
-		     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, 0 );
-   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
-		     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
-		     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, 0 );
-   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
-		     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
-		     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, 0 );
-   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
-		     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
-		     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, 0 );
-
-   for ( int j = 16; j < 64; j += 16 )
-   {
-      Wx[ 0] = SHA2x_MEXP( 14,  9,  1,  0 );
-      Wy[ 0] = SHA2y_MEXP( 14,  9,  1,  0 );
-      Wz[ 0] = SHA2z_MEXP( 14,  9,  1,  0 );
-
-      Wx[ 1] = SHA2x_MEXP( 15, 10,  2,  1 );
-      Wy[ 1] = SHA2y_MEXP( 15, 10,  2,  1 );
-      Wz[ 1] = SHA2z_MEXP( 15, 10,  2,  1 );
-
-      Wx[ 2] = SHA2x_MEXP(  0, 11,  3,  2 );
-      Wy[ 2] = SHA2y_MEXP(  0, 11,  3,  2 );
-      Wz[ 2] = SHA2z_MEXP(  0, 11,  3,  2 );
-
-      Wx[ 3] = SHA2x_MEXP(  1, 12,  4,  3 );
-      Wy[ 3] = SHA2y_MEXP(  1, 12,  4,  3 );
-      Wz[ 3] = SHA2z_MEXP(  1, 12,  4,  3 );
-
-      Wx[ 4] = SHA2x_MEXP(  2, 13,  5,  4 );
-      Wy[ 4] = SHA2y_MEXP(  2, 13,  5,  4 );
-      Wz[ 4] = SHA2z_MEXP(  2, 13,  5,  4 );
-
-      Wx[ 5] = SHA2x_MEXP(  3, 14,  6,  5 );
-      Wy[ 5] = SHA2y_MEXP(  3, 14,  6,  5 );
-      Wz[ 5] = SHA2z_MEXP(  3, 14,  6,  5 );
-
-      Wx[ 6] = SHA2x_MEXP(  4, 15,  7,  6 );
-      Wy[ 6] = SHA2y_MEXP(  4, 15,  7,  6 );
-      Wz[ 6] = SHA2z_MEXP(  4, 15,  7,  6 );
-
-      Wx[ 7] = SHA2x_MEXP(  5,  0,  8,  7);
-      Wy[ 7] = SHA2y_MEXP(  5,  0,  8,  7);
-      Wz[ 7] = SHA2z_MEXP(  5,  0,  8,  7);
-
-      Wx[ 8] = SHA2x_MEXP(  6,  1,  9,  8);
-      Wy[ 8] = SHA2y_MEXP(  6,  1,  9,  8);
-      Wz[ 8] = SHA2z_MEXP(  6,  1,  9,  8);
-
-      Wx[ 9] = SHA2x_MEXP(  7,  2, 10,  9 );
-      Wy[ 9] = SHA2y_MEXP(  7,  2, 10,  9);
-      Wz[ 9] = SHA2z_MEXP(  7,  2, 10,  9);
-
-      Wx[10] = SHA2x_MEXP(  8,  3, 11, 10 );
-      Wy[10] = SHA2y_MEXP(  8,  3, 11, 10);
-      Wz[10] = SHA2z_MEXP(  8,  3, 11, 10);
-
-      Wx[11] = SHA2x_MEXP(  9,  4, 12, 11);
-      Wy[11] = SHA2y_MEXP(  9,  4, 12, 11);
-      Wz[11] = SHA2z_MEXP(  9,  4, 12, 11 );
-
-      Wx[12] = SHA2x_MEXP( 10,  5, 13, 12 );
-      Wy[12] = SHA2y_MEXP( 10,  5, 13, 12 );
-      Wz[12] = SHA2z_MEXP( 10,  5, 13, 12 );
-
-      Wx[13] = SHA2x_MEXP( 11,  6, 14, 13 );
-      Wy[13] = SHA2y_MEXP( 11,  6, 14, 13 );
-      Wz[13] = SHA2z_MEXP( 11,  6, 14, 13 );
-
-      Wx[14] = SHA2x_MEXP( 12,  7, 15, 14 );
-      Wy[14] = SHA2y_MEXP( 12,  7, 15, 14 );
-      Wz[14] = SHA2z_MEXP( 12,  7, 15, 14 );
-
-      Wx[15] = SHA2x_MEXP( 13,  8,  0, 15 );
-      Wy[15] = SHA2y_MEXP( 13,  8,  0, 15 );
-      Wz[15] = SHA2z_MEXP( 13,  8,  0, 15 );
-
-
-      SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
-                        Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
-			Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,	 0, j );
-      SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
-		        Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
-		       	Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  1, j );
-      SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
-		        Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
-		       	Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz,  2, j );
-      SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
-		        Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
-		       	Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez,  3, j );
-      SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
-		        Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
-		       	Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz,  4, j );
-      SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
-		        Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
-		       	Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz,  5, j );
-      SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
-		        Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
-		       	Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz,  6, j );
-      SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
-		        By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
-		       	Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az,  7, j );
-      SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
-                        Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
-                        Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  8, j );
-      SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, 
-                        Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, 
-                        Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  9, j );
-      SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, 
-                        Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, 
-                        Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j );
-      SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, 
-                        Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, 
-                        Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j );
-      SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, 
-                        Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, 
-                        Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j );
-      SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, 
-                        Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, 
-                        Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j );
-      SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, 
-                        Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, 
-                        Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j );
-      SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, 
-                        By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, 
-                        Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j );
-   }
-
-   rx[0] = _mm256_add_epi32( rx[0], Ax );
-   ry[0] =     _mm_add_pi32( ry[0], Ay );
-   rz[0] =                   rz[0]+ Az;
-   rx[1] = _mm256_add_epi32( rx[1], Bx );
-   ry[1] =     _mm_add_pi32( ry[1], By );
-   rz[1] =                   rz[1]+ Bz;
-   rx[2] = _mm256_add_epi32( rx[2], Cx );
-   ry[2] =     _mm_add_pi32( ry[2], Cy );
-   rz[3] =                   rz[3]+ Dz;
-   rx[4] = _mm256_add_epi32( rx[4], Ex );
-   ry[4] =     _mm_add_pi32( ry[4], Ey );
-   rz[4] =                   rz[4]+ Ez;
-   rx[5] = _mm256_add_epi32( rx[5], Fx );
-   ry[5] =     _mm_add_pi32( ry[5], Fy );
-   rz[5] =                   rz[5]+ Fz;
-   rx[6] = _mm256_add_epi32( rx[6], Gx );
-   ry[6] =     _mm_add_pi32( ry[6], Gy );
-   rz[6] =                   rz[6]+ Gz;
-   rx[7] = _mm256_add_epi32( rx[7], Hx );
-   ry[7] =     _mm_add_pi32( ry[7], Hy );
-   rz[7] =                   rz[7]+ Hz;
-
-}
-
-void sha256_11way_init( sha256_11way_context *ctx )
-{
-   ctx->count_high = ctx->count_low = 0;
-   ctx->valx[0] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[0] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[1] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[1] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[2] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[2] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[3] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[3] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[4] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[4] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[5] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[5] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[6] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[6] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[7] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[7] =     _mm_set1_pi32( H256[0] );
-   memcpy( ctx->valz, H256, 32 );
-}
-
-
-void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
-	                  const void *datay, const void *dataz, size_t len )
-{
-   __m256i  *vdatax = (__m256i*) datax;
-    __m64   *vdatay = (__m64*)   datay;
-   uint32_t *idataz = (uint32_t*)dataz;
-   size_t ptr;
-   const int buf_size = 64;
-
-   ptr = (unsigned)ctx->count_low & (buf_size - 1U);
-   while ( len > 0 )
-   {
-      size_t clen;
-      uint32_t clow, clow2;
-
-      clen = buf_size - ptr;
-      if ( clen > len )
-         clen = len;
-      memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 );
-      memcpy_m64( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
-      memcpy    ( ctx->bufz +  ptr,     idataz +  ptr,     clen    );
-      ptr += clen;
-      len -= clen;
-      if ( ptr == buf_size )
-      {
-         sha256_11way_round( ctx->bufx, ctx->valx,
-			     ctx->bufy, ctx->valy,
-			     ctx->bufz, ctx->valz );
-         ptr = 0;
-      }
-      clow = ctx->count_low;
-      clow2 = clow + clen;
-      ctx->count_low = clow2;
-      if ( clow2 < clow )
-         ctx->count_high++;
-   }
-}
-
-
-void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
-	                                            void *dstz)
-{
-    unsigned ptr, u;
-    uint32_t low, high;
-    const int buf_size = 64;
-    const int pad = buf_size - 8;
-
-    ptr = (unsigned)ctx->count_low & (buf_size - 1U);
-    ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
-    ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 );
-    ctx->bufz[ ptr>>2 ] = 0x80;
-    ptr += 4;
-
-    if ( ptr > pad )
-    {
-         memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
-         memset_zero_m64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
-         memset(      ctx->bufz + (ptr>>2), 0,  (buf_size - ptr) >> 2 );
-         sha256_11way_round( ctx->bufx, ctx->valx,
-			     ctx->bufy, ctx->valy,
-			     ctx->bufz, ctx->valz );
-         memset_zero_256( ctx->bufx, pad >> 2 );
-         memset_zero_m64(  ctx->bufy, pad >> 2 );
-         memset(      ctx->bufz, 0,  pad >> 2 );
-    }
-    else
-    {
-        memset_zero_256( ctx->bufx + (ptr>>2),    (pad - ptr) >> 2 );
-        memset_zero_m64(  ctx->bufy + (ptr>>2),    (pad - ptr) >> 2 );
-        memset(          ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 );
-    }
-
-    low = ctx->count_low;
-    high = (ctx->count_high << 3) | (low >> 29);
-    low = low << 3;
-
-    ctx->bufx[ pad >> 2 ] =
-                 mm256_bswap_32( _mm256_set1_epi32( high ) );
-    ctx->bufy[ pad >> 2 ] =
-                 mm64_bswap_32( _mm_set1_pi32( high ) );
-    ctx->bufz[ pad >> 2 ] =
-                 bswap_32( high );
-
-
-    ctx->bufx[ ( pad+4 ) >> 2 ] =
-                 mm256_bswap_32( _mm256_set1_epi32( low ) );
-    ctx->bufy[ ( pad+4 ) >> 2 ] =
-                 mm64_bswap_32( _mm_set1_pi32( low ) );
-    ctx->bufz[ ( pad+4 ) >> 2 ] =
-                 bswap_32( low );
-
-    sha256_11way_round( ctx->bufx, ctx->valx,
-		       ctx->bufy, ctx->valy,
-		       ctx->bufz, ctx->valz  );
-
-    for ( u = 0; u < 8; u ++ )
-    {
-       casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] );
-       casti_m64  ( dsty, u ) =  mm64_bswap_32( ctx->valy[u] );
-       ((uint32_t*)dstz)[u] = bswap_32( ctx->valz[u] );
-   }
-}
-
-#endif
-#endif   // 0
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -5,137 +5,6 @@
 #include <stdio.h>
 #include "sha-hash-4way.h"

-#if defined(SHA256T_11WAY)
-
-static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64)));
-
-void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
-	                 const void *inpy, const void*inpz )
-{
-   uint32_t hashx[8*8] __attribute__ ((aligned (64)));
-   uint32_t hashy[8*2] __attribute__ ((aligned (64)));
-   uint32_t hashz[8]   __attribute__ ((aligned (64)));
-   sha256_11way_context ctx;
-   const void *inpx64 = inpx+(64<<3);
-   const void *inpy64 = inpy+(64<<1);
-   const void *inpz64 = inpz+ 64;
-
-   memcpy( &ctx, &sha256_ctx11, sizeof ctx );
-   sha256_11way_update( &ctx, inpx64, inpy64, inpz64,  16 );
-   sha256_11way_close( &ctx, hashx, hashy, hashz );
-
-   sha256_11way_init( &ctx );
-   sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
-   sha256_11way_close( &ctx, hashx, hashy, hashz );
-
-   sha256_11way_init( &ctx );
-   sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
-   sha256_11way_close( &ctx, outx, outy, outz );
-}
-
-int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
-	                    uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t datax[20*8]  __attribute__ ((aligned (64)));
-   uint32_t datay[20*2]  __attribute__ ((aligned (32)));
-   uint32_t dataz[20]    __attribute__ ((aligned (32)));
-   uint32_t hashx[8*8]   __attribute__ ((aligned (32)));
-   uint32_t hashy[8*2]   __attribute__ ((aligned (32)));
-   uint32_t hashz[8]     __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash7;
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   uint32_t n = first_nonce;
-   __m256i  *noncex = (__m256i*) datax + 19;
-   __m64    *noncey = (__m64*)   datay + 19;
-   uint32_t *noncez = (uint32_t*)dataz + 19;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-   int i;
-   const uint64_t htmax[] = {           0,
-                                      0xF,
-                                     0xFF,
-                                    0xFFF,
-                                   0xFFFF,
-                               0x10000000 };
-   const uint32_t masks[] = {  0xFFFFFFFF,
-                               0xFFFFFFF0,
-                               0xFFFFFF00,
-                               0xFFFFF000,
-                               0xFFFF0000,
-                                        0 };
-
-   // Use dataz (scalar) to stage bswapped data for the vectors.
-   casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
-   casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-   casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-
-   intrlv_8x32( datax, dataz, dataz, dataz, dataz,
-                                 dataz, dataz, dataz, dataz, 640 );
-   mm64_interleave_2x32( datay, dataz, dataz, 640 );
-
-   sha256_11way_init( &sha256_ctx11 );
-   sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 );
-
-   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
-   {
-      uint32_t mask = masks[m];
-      do
-      {
-        *noncex = mm256_bswap_32(
-         _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
-        *noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
-        *noncez = bswap_32( n+10 );
-
-        pdata[19] = n;
-
-        sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );
-
-        if ( opt_benchmark ) { n += 11; continue; }
-
-        hash7 = &(hashx[7<<3]); 
-        for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
-        { 
-            // deinterleave hash for lane
-            extr_lane_8x32( lane_hash, hashx, i, 256 );
-            if ( fulltest( lane_hash, ptarget ) )
-            {
-	            pdata[19] = n + i;
-               submit_lane_solution( work, lane_hash, mythr, i );
-            }
-        }
-
-        hash7 = &(hashy[7<<1]);
-        for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
- 
-        {
-            mm64_extr_lane_2x32( lane_hash, hashy, i, 256 );
-           if ( fulltest( lane_hash, ptarget ) )
-           {
-               pdata[19] = n + 8 + i;
-               submit_lane_solution( work, lane_hash, mythr, i+8 );
-           }
-	     }
-
-        if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
-        {
-            pdata[19] = n+10;
-            submit_lane_solution( work, hashz, mythr, 10 );
-        }
-        n += 11;
-
-      } while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
-      break;
-   }
-    
-   *hashes_done = n - first_nonce + 1;
-   return 0;
-}
-
-#endif
-
 #if defined(SHA256T_8WAY)

 static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
--- a/algo/simd/nist.c
+++ b/algo/simd/nist.c
@@ -83,13 +83,14 @@ HashReturn init_sd(hashState_sd *state, int hashbitlen) {
  char *init;

 #ifndef NO_PRECOMPUTED_IV
-  if (hashbitlen == 224)
-    r=InitIV(state, hashbitlen, IV_224);
-  else if (hashbitlen == 256)
-    r=InitIV(state, hashbitlen, IV_256);
-  else if (hashbitlen == 384)
-    r=InitIV(state, hashbitlen, IV_384);
-  else if (hashbitlen == 512)
+//  if (hashbitlen == 224)
+//    r=InitIV(state, hashbitlen, IV_224);
+//  else if (hashbitlen == 256)
+//    r=InitIV(state, hashbitlen, IV_256);
+//  else if (hashbitlen == 384)
+//    r=InitIV(state, hashbitlen, IV_384);
+//  else
+  if (hashbitlen == 512)
    r=InitIV(state, hashbitlen, IV_512);
  else
 #endif
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -2,13 +2,136 @@
 #include <string.h>
 #include <stdint.h>
 #include "skein-hash-4way.h"
+
+// 8 way is faster than SHA on Icelake
+// SHA is faster than 4 way on Ryzen
+//
 #if defined(__SHA__)
  #include <openssl/sha.h>
-#else
-  #include "algo/sha/sha-hash-4way.h"
 #endif
+#include "algo/sha/sha-hash-4way.h"

-#if defined (SKEIN_4WAY)
+#if defined (SKEIN_8WAY)
+
+void skeinhash_8way( void *state, const void *input )   // 8 lanes: skein512 over 80 bytes, then sha256 over the 64-byte result, written to state
+{
+     uint64_t vhash64[16*8] __attribute__ ((aligned (128)));   // skein512 output, 8x64 layout
+     skein512_8way_context ctx_skein;
+
+// Disabled scalar __SHA__ path, kept for reference only.
+//#if defined(__SHA__)
+//     uint32_t hash0[16] __attribute__ ((aligned (64)));
+//     uint32_t hash1[16] __attribute__ ((aligned (64)));
+//     uint32_t hash2[16] __attribute__ ((aligned (64)));
+//     uint32_t hash3[16] __attribute__ ((aligned (64)));
+//     uint32_t hash4[16] __attribute__ ((aligned (64)));
+//     uint32_t hash5[16] __attribute__ ((aligned (64)));
+//     uint32_t hash6[16] __attribute__ ((aligned (64)));
+//     uint32_t hash7[16] __attribute__ ((aligned (64)));
+//     SHA256_CTX           ctx_sha256;
+//#else
+     uint32_t vhash32[32*8] __attribute__ ((aligned (128)));   // same data regrouped for the 8x32 sha256
+     sha256_8way_context ctx_sha256;
+//#endif
+
+     skein512_8way_init( &ctx_skein );
+     skein512_8way_update( &ctx_skein, input, 80 );   // 80 bytes per lane
+     skein512_8way_close( &ctx_skein, vhash64 );
+/*
+#if defined(__SHA__)      
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash64, 512 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
+     SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
+     SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
+     SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
+     SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash4, 64 );
+     SHA256_Final( (unsigned char*)hash4, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash5, 64 );
+     SHA256_Final( (unsigned char*)hash5, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash6, 64 );
+     SHA256_Final( (unsigned char*)hash6, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash7, 64 );
+     SHA256_Final( (unsigned char*)hash7, &ctx_sha256 );
+     
+     intrlv_8x32( state, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7, 256 );
+#else
+*/
+
+     rintrlv_8x64_8x32( vhash32, vhash64, 512 );   // presumably 8x64 -> 8x32 re-interleave, replacing the two-step form below
+//     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+//                   vhash64, 512 );
+//     intrlv_8x32( vhash32, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+//                   hash7, 512 );
+
+     sha256_8way_init( &ctx_sha256 );
+     sha256_8way( &ctx_sha256, vhash32, 64 );   // 64 bytes per lane = the full skein512 digest
+     sha256_8way_close( &ctx_sha256, state );   // result stays lane-interleaved; caller extracts lanes with extr_lane_8x32
+//#endif
+}
+
+int scanhash_skein_8way( struct work *work, uint32_t max_nonce,   // scan nonces 8 at a time starting at work->data[19]
+                         uint64_t *hashes_done, struct thr_info *mythr )   // always returns 0; hits are reported via submit_lane_solution
+{
+    uint32_t vdata[20*8] __attribute__ ((aligned (128)));   // 20 header words x 8 lanes
+    uint32_t hash[16*8] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));   // one lane's 256-bit hash, de-interleaved
+    uint32_t *hash7 = &(hash[7<<3]);   // hash7[lane] is pre-filtered against Htarg below; presumably word 7 of each lane
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; 64-bit word 9 of each lane, i.e. header words 18 and 19
+    int thr_id = mythr->id;   // index into work_restart[]
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );   // presumably byte-swaps the 80-byte header and replicates it into 8 lanes
+   do
+   {
+       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(   // nonces n..n+7 sit in the odd 32-bit slots, zeros in the even slots;
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,   // the blend presumably keeps the even slots (word 18)
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );   // from *noncev -- TODO confirm
+
+       skeinhash_8way( hash, vdata );
+
+       for ( int lane = 0; lane < 8; lane++ )
+       if (  hash7[ lane ] <= Htarg )   // cheap pre-filter before the full target test
+       {
+          extr_lane_8x32( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) )
+          {
+             pdata[19] = n + lane;   // record the winning nonce before submitting
+             submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+       }
+       n += 8;
+    } while ( (n < max_nonce-8) && !work_restart[thr_id].restart );   // stop near max_nonce or when a restart is flagged
+
+    *hashes_done = n - first_nonce;   // number of nonces actually hashed
+    return 0;
+}
+
+#elif defined (SKEIN_4WAY)

 void skeinhash_4way( void *state, const void *input )
 {
@@ -26,7 +149,7 @@ void skeinhash_4way( void *state, const void *input )
 #endif

     skein512_4way_init( &ctx_skein );
-     skein512_4way( &ctx_skein, input, 80 );
+     skein512_4way_update( &ctx_skein, input, 80 );
     skein512_4way_close( &ctx_skein, vhash64 );

 #if defined(__SHA__)      
@@ -71,7 +194,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int thr_id = mythr->id;  // thr_id arg is deprecated
+    int thr_id = mythr->id; 

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do
@@ -92,9 +215,9 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
          }
       }
       n += 4;
-    } while ( (n < max_nonce) && !work_restart[thr_id].restart );
+    } while ( (n < max_nonce-4) && !work_restart[thr_id].restart );

-    *hashes_done = n - first_nonce + 1;
+    *hashes_done = n - first_nonce;
    return 0;
 }

--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -4,8 +4,11 @@

 bool register_skein_algo( algo_gate_t* gate )
 {
-    gate->optimizations = AVX2_OPT | SHA_OPT;
-#if defined (SKEIN_4WAY)
+    gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
+#if defined (SKEIN_8WAY)
+    gate->scanhash  = (void*)&scanhash_skein_8way;
+    gate->hash      = (void*)&skeinhash_8way;
+#elif defined (SKEIN_4WAY)
    gate->scanhash  = (void*)&scanhash_skein_4way;
    gate->hash      = (void*)&skeinhash_4way;
 #else
@@ -15,3 +18,20 @@ bool register_skein_algo( algo_gate_t* gate )
    return true;
 };

+bool register_skein2_algo( algo_gate_t* gate )   // install the skein2 scanhash/hash entry points on gate; always returns true
+{
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
+#if defined (SKEIN_8WAY)
+  gate->scanhash  = (void*)&scanhash_skein2_8way;   // 8-way build, chosen when SKEIN_8WAY is defined
+  gate->hash      = (void*)&skein2hash_8way;
+#elif defined (SKEIN_4WAY)
+  gate->scanhash  = (void*)&scanhash_skein2_4way;   // 4-way build, chosen when SKEIN_4WAY is defined
+  gate->hash      = (void*)&skein2hash_4way;
+#else
+  gate->scanhash  = (void*)&scanhash_skein2;        // fallback when neither N-way macro is defined
+  gate->hash      = (void*)&skein2hash;
+#endif
+  return true;
+}
+
+
--- a/algo/skein/skein-gate.h
+++ b/algo/skein/skein-gate.h
@@ -1,23 +1,44 @@
 #ifndef __SKEIN_GATE_H__
-#define __SKEIN_GATE_H__
+#define __SKEIN_GATE_H__ 1
 #include <stdint.h>
 #include "algo-gate-api.h"

-#if defined(__AVX2__)
-  #define SKEIN_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define SKEIN_8WAY 1
+#elif defined(__AVX2__)
+  #define SKEIN_4WAY 1
 #endif

-#if defined(SKEIN_4WAY)
+#if defined(SKEIN_8WAY)
+
+void skeinhash_8way( void *output, const void *input );
+int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+void skein2hash_8way( void *output, const void *input );
+int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t* hashes_done, struct thr_info *mythr );
+
+#elif defined(SKEIN_4WAY)

 void skeinhash_4way( void *output, const void *input );
-
 int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-#endif
+
+void skein2hash_4way( void *output, const void *input );
+int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t* hashes_done, struct thr_info *mythr );
+
+#else

 void skeinhash( void *output, const void *input );
-
 int scanhash_skein( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );

+void skein2hash( void *output, const void *input );
+int scanhash_skein2( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
 #endif
--- a/algo/skein/skein-hash-4way.c
+++ b/algo/skein/skein-hash-4way.c
@@ -36,7 +36,6 @@
 #include <string.h>
 #include "skein-hash-4way.h"

-
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -45,6 +44,22 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif

+/*
+static const sph_u64 IV256[] = {
+   SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
+   SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
+   SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
+   SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
+};
+
+static const sph_u64 IV512[] = {
+   SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
+   SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
+   SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
+   SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
+};
+*/
+   
 /*
 * M9_ ## s ## _ ## i  evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
 */
@@ -270,8 +285,151 @@ extern "C"{
 #define SKBI(k, s, i)   XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
 #define SKBT(t, s, v)   XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))

+#define READ_STATE_BIG(sc)   do { /* load h0..h7 and bcount from *(sc) into the same-named locals */ \
+      h0 = (sc)->h0; \
+      h1 = (sc)->h1; \
+      h2 = (sc)->h2; \
+      h3 = (sc)->h3; \
+      h4 = (sc)->h4; \
+      h5 = (sc)->h5; \
+      h6 = (sc)->h6; \
+      h7 = (sc)->h7; \
+      bcount = (sc)->bcount; \
+   } while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* store the locals h0..h7 and bcount back into *(sc) */ \
+      (sc)->h0 = h0; \
+      (sc)->h1 = h1; \
+      (sc)->h2 = h2; \
+      (sc)->h3 = h3; \
+      (sc)->h4 = h4; \
+      (sc)->h5 = h5; \
+      (sc)->h6 = h6; \
+      (sc)->h7 = h7; \
+      (sc)->bcount = bcount; \
+   } while (0)
+   
 // AVX2 all scalar vars are now vectors representing 4 nonces in parallel

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
+do { \
+  k8 = _mm512_xor_si512( _mm512_xor_si512( \
+                            _mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \
+                                              _mm512_xor_si512( k2, k3 ) ), \
+                            _mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \
+                                              _mm512_xor_si512( k6, k7 ) ) ), \
+                         m512_const1_64( 0x1BD11BDAA9FC1A22) ); \
+  t2 = t0 ^ t1; \
+} while (0)
+   
+#define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
+do { \
+  w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \
+  w1 = _mm512_add_epi64( w1, SKBI(k,s,1) ); \
+  w2 = _mm512_add_epi64( w2, SKBI(k,s,2) ); \
+  w3 = _mm512_add_epi64( w3, SKBI(k,s,3) ); \
+  w4 = _mm512_add_epi64( w4, SKBI(k,s,4) ); \
+  w5 = _mm512_add_epi64( w5, _mm512_add_epi64( SKBI(k,s,5), \
+                                         m512_const1_64( SKBT(t,s,0) ) ) ); \
+  w6 = _mm512_add_epi64( w6, _mm512_add_epi64( SKBI(k,s,6), \
+                                         m512_const1_64( SKBT(t,s,1) ) ) ); \
+  w7 = _mm512_add_epi64( w7, _mm512_add_epi64( SKBI(k,s,7), \
+                                         m512_const1_64( s ) ) ); \
+} while (0)
+
+
+#define TFBIG_MIX_8WAY(x0, x1, rc) \
+do { \
+     x0 = _mm512_add_epi64( x0, x1 ); \
+     x1 = _mm512_xor_si512( mm512_rol_64( x1, rc ), x0 ); \
+} while (0)
+
+#define TFBIG_MIX8_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3)  do { \
+      TFBIG_MIX_8WAY(w0, w1, rc0); \
+      TFBIG_MIX_8WAY(w2, w3, rc1); \
+      TFBIG_MIX_8WAY(w4, w5, rc2); \
+      TFBIG_MIX_8WAY(w6, w7, rc3); \
+   } while (0)
+
+#define TFBIG_8WAY_4e(s)   do { \
+      TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
+      TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
+      TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
+      TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
+      TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
+   } while (0)
+
+#define TFBIG_8WAY_4o(s)   do { \
+      TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
+      TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
+      TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
+      TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
+      TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
+   } while (0)
+
+#define UBI_BIG_8WAY(etype, extra) \
+do { \
+  sph_u64 t0, t1, t2; \
+  __m512i h8; \
+  __m512i m0 =  buf[0]; \
+  __m512i m1 =  buf[1]; \
+  __m512i m2 =  buf[2]; \
+  __m512i m3 =  buf[3]; \
+  __m512i m4 =  buf[4]; \
+  __m512i m5 =  buf[5]; \
+  __m512i m6 =  buf[6]; \
+  __m512i m7 =  buf[7]; \
+\
+  __m512i p0 = m0; \
+  __m512i p1 = m1; \
+  __m512i p2 = m2; \
+  __m512i p3 = m3; \
+  __m512i p4 = m4; \
+  __m512i p5 = m5; \
+  __m512i p6 = m6; \
+  __m512i p7 = m7; \
+  t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
+  t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
+  TFBIG_KINIT_8WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
+  TFBIG_8WAY_4e(0); \
+  TFBIG_8WAY_4o(1); \
+  TFBIG_8WAY_4e(2); \
+  TFBIG_8WAY_4o(3); \
+  TFBIG_8WAY_4e(4); \
+  TFBIG_8WAY_4o(5); \
+  TFBIG_8WAY_4e(6); \
+  TFBIG_8WAY_4o(7); \
+  TFBIG_8WAY_4e(8); \
+  TFBIG_8WAY_4o(9); \
+  TFBIG_8WAY_4e(10); \
+  TFBIG_8WAY_4o(11); \
+  TFBIG_8WAY_4e(12); \
+  TFBIG_8WAY_4o(13); \
+  TFBIG_8WAY_4e(14); \
+  TFBIG_8WAY_4o(15); \
+  TFBIG_8WAY_4e(16); \
+  TFBIG_8WAY_4o(17); \
+  TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
+  h0 = _mm512_xor_si512( m0, p0 );\
+  h1 = _mm512_xor_si512( m1, p1 );\
+  h2 = _mm512_xor_si512( m2, p2 );\
+  h3 = _mm512_xor_si512( m3, p3 );\
+  h4 = _mm512_xor_si512( m4, p4 );\
+  h5 = _mm512_xor_si512( m5, p5 );\
+  h6 = _mm512_xor_si512( m6, p6 );\
+  h7 = _mm512_xor_si512( m7, p7 );\
+} while (0)
+
+#define DECL_STATE_BIG_8WAY \
+  __m512i h0, h1, h2, h3, h4, h5, h6, h7; \
+  sph_u64 bcount;
+
+
+#endif // AVX512
+
 #define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
 do { \
  k8 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -298,39 +456,34 @@ do { \
                                         m256_const1_64( s ) ) ); \
 } while (0)

-
 #define TFBIG_MIX_4WAY(x0, x1, rc) \
 do { \
     x0 = _mm256_add_epi64( x0, x1 ); \
     x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 ); \
 } while (0)
- 

-// typeless
-#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3)  do { \
-		TFBIG_MIX_4WAY(w0, w1, rc0); \
-		TFBIG_MIX_4WAY(w2, w3, rc1); \
-		TFBIG_MIX_4WAY(w4, w5, rc2); \
-		TFBIG_MIX_4WAY(w6, w7, rc3); \
-	} while (0)
+#define TFBIG_MIX8_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3)  do { \
+      TFBIG_MIX_4WAY(w0, w1, rc0); \
+      TFBIG_MIX_4WAY(w2, w3, rc1); \
+      TFBIG_MIX_4WAY(w4, w5, rc2); \
+      TFBIG_MIX_4WAY(w6, w7, rc3); \
+   } while (0)

+#define TFBIG_4WAY_4e(s)   do { \
+      TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
+      TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
+      TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
+      TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
+      TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
+   } while (0)

-#define TFBIG_4e(s)   do { \
-		TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
-		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
-		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
-		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
-		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
-	} while (0)
-
-#define TFBIG_4o(s)   do { \
-		TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
-		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
-		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
-		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
-		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
-	} while (0)
-
+#define TFBIG_4WAY_4o(s)   do { \
+      TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
+      TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
+      TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
+      TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
+      TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
+   } while (0)

 // scale buf offset by 4
 #define UBI_BIG_4WAY(etype, extra) \
@@ -357,24 +510,24 @@ do { \
  t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
  t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
  TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
-  TFBIG_4e(0); \
-  TFBIG_4o(1); \
-  TFBIG_4e(2); \
-  TFBIG_4o(3); \
-  TFBIG_4e(4); \
-  TFBIG_4o(5); \
-  TFBIG_4e(6); \
-  TFBIG_4o(7); \
-  TFBIG_4e(8); \
-  TFBIG_4o(9); \
-  TFBIG_4e(10); \
-  TFBIG_4o(11); \
-  TFBIG_4e(12); \
-  TFBIG_4o(13); \
-  TFBIG_4e(14); \
-  TFBIG_4o(15); \
-  TFBIG_4e(16); \
-  TFBIG_4o(17); \
+  TFBIG_4WAY_4e(0); \
+  TFBIG_4WAY_4o(1); \
+  TFBIG_4WAY_4e(2); \
+  TFBIG_4WAY_4o(3); \
+  TFBIG_4WAY_4e(4); \
+  TFBIG_4WAY_4o(5); \
+  TFBIG_4WAY_4e(6); \
+  TFBIG_4WAY_4o(7); \
+  TFBIG_4WAY_4e(8); \
+  TFBIG_4WAY_4o(9); \
+  TFBIG_4WAY_4e(10); \
+  TFBIG_4WAY_4o(11); \
+  TFBIG_4WAY_4e(12); \
+  TFBIG_4WAY_4o(13); \
+  TFBIG_4WAY_4e(14); \
+  TFBIG_4WAY_4o(15); \
+  TFBIG_4WAY_4e(16); \
+  TFBIG_4WAY_4o(17); \
  TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
  h0 = _mm256_xor_si256( m0, p0 );\
  h1 = _mm256_xor_si256( m1, p1 );\
@@ -391,45 +544,142 @@ do { \
  __m256i h0, h1, h2, h3, h4, h5, h6, h7; \
  sph_u64 bcount;

-#define READ_STATE_BIG(sc)   do { \
-		h0 = (sc)->h0; \
-		h1 = (sc)->h1; \
-		h2 = (sc)->h2; \
-		h3 = (sc)->h3; \
-		h4 = (sc)->h4; \
-		h5 = (sc)->h5; \
-		h6 = (sc)->h6; \
-		h7 = (sc)->h7; \
-		bcount = sc->bcount; \
-	} while (0)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-#define WRITE_STATE_BIG(sc)   do { \
-		(sc)->h0 = h0; \
-		(sc)->h1 = h1; \
-		(sc)->h2 = h2; \
-		(sc)->h3 = h3; \
-		(sc)->h4 = h4; \
-		(sc)->h5 = h5; \
-		(sc)->h6 = h6; \
-		(sc)->h7 = h7; \
-		sc->bcount = bcount; \
-	} while (0)
+void skein256_8way_init( skein256_8way_context *sc )   // reset sc for a new Skein-256 hash
+{
+        sc->h0 = m512_const1_64( 0xCCD044A12FDB3E13 );   // h0..h7: the eight IV256 words, in order
+        sc->h1 = m512_const1_64( 0xE83590301A79A9EB );   // m512_const1_64 presumably replicates the value into every 64-bit lane
+        sc->h2 = m512_const1_64( 0x55AEA0614F816E6F );
+        sc->h3 = m512_const1_64( 0x2A2767A4AE9B94DB );
+        sc->h4 = m512_const1_64( 0xEC06025E74DD7683 );
+        sc->h5 = m512_const1_64( 0xE7A436CDC4746251 );
+        sc->h6 = m512_const1_64( 0xC36FBAF9393AD185 );
+        sc->h7 = m512_const1_64( 0x3EEDBA1833EDFC13 );
+        sc->bcount = 0;   // no blocks processed yet
+        sc->ptr = 0;      // buffer is empty
+}

-/*
-static const sph_u64 IV256[] = {
-   SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
-   SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
-   SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
-   SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
-};
+void skein512_8way_init( skein512_8way_context *sc )   // reset sc for a new Skein-512 hash
+{
+        sc->h0 = m512_const1_64( 0x4903ADFF749C51CE );   // h0..h7: the eight IV512 words, in order
+        sc->h1 = m512_const1_64( 0x0D95DE399746DF03 );   // m512_const1_64 presumably replicates the value into every 64-bit lane
+        sc->h2 = m512_const1_64( 0x8FD1934127C79BCE );
+        sc->h3 = m512_const1_64( 0x9A255629FF352CB1 );
+        sc->h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
+        sc->h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
+        sc->h6 = m512_const1_64( 0x991112C71A75B523 );
+        sc->h7 = m512_const1_64( 0xAE18A40B660FCC33 );
+        sc->bcount = 0;   // no blocks processed yet
+        sc->ptr = 0;      // buffer is empty
+}
+
+static void
+skein_big_core_8way( skein512_8way_context *sc, const void *data,
+                     size_t len )
+{  // Append len bytes (per lane) of 8x64-interleaved data to sc, compressing each full block once more data follows it.
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf;
+   size_t ptr;
+   unsigned first;
+   DECL_STATE_BIG_8WAY
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+   const int buf_size = 64;   // block size in bytes per lane; buf is indexed in 8-byte words (ptr>>3), one __m512i each
+
+   if ( len <= buf_size - ptr )   // everything fits: just buffer it, a full block stays pending
+   {
+       memcpy_512( buf + (ptr>>3), vdata, len>>3 );   // count is presumably in __m512i units
+       sc->ptr = ptr + len;
+       return;
+   }
+
+   READ_STATE_BIG( sc );
+   first = ( bcount == 0 ) << 7;   // 128 is added to the UBI type only for the first block processed
+   do {
+       size_t clen;
+
+       if ( ptr == buf_size )   // buffer full and more input remains: compress it
+       {
+            bcount ++;
+            UBI_BIG_8WAY( 96 + first, 0 );
+            first = 0;
+            ptr = 0;
+       }
+       clen = buf_size - ptr;   // copy as much as fits in the remaining buffer space
+       if ( clen > len )
+            clen = len;
+       memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
+       ptr += clen;
+       vdata += (clen>>3);
+       len -= clen;
+   } while ( len > 0 );
+   WRITE_STATE_BIG( sc );
+   sc->ptr = ptr;
+}
+
+static void
+skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n,   // ub and n are not used in this body
+                      void *dst, size_t out_len )   // out_len: digest bytes per lane written to dst
+{
+   __m512i *buf;
+   size_t ptr;
+   unsigned et;
+   DECL_STATE_BIG_8WAY
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+        const int buf_size = 64;   // block size in bytes per lane
+
+   READ_STATE_BIG(sc);
+
+   memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );   // zero-pad the pending data to a full block (also covers an empty message)
+   et = 352 + ((bcount == 0) << 7);   // 128 is added when no block has been processed before this one
+   UBI_BIG_8WAY( et, ptr );   // process the last message block, passing ptr as the extra count
+
+   memset_zero_512( buf, buf_size >> 3 );   // output stage: an all-zero block (encoding of "0" over 8 bytes, zero padded)
+   bcount = 0;
+   UBI_BIG_8WAY( 510, 8 );
+
+   buf[0] = h0;   // stage the final chaining value in buf so a prefix of it can be copied out
+   buf[1] = h1;
+   buf[2] = h2;
+   buf[3] = h3;
+   buf[4] = h4;
+   buf[5] = h5;
+   buf[6] = h6;
+   buf[7] = h7;
+
+   memcpy_512( dst, buf, out_len >> 3 );   // out_len>>3 words of 8 bytes per lane; count presumably in __m512i units
+}
+
+void
+skein256_8way_update(void *cc, const void *data, size_t len)
+{  // thin wrapper: forwards cc, data and len unchanged to skein_big_core_8way
+   skein_big_core_8way(cc, data, len);
+}
+
+void
+skein256_8way_close(void *cc, void *dst)
+{  // finalize and write a 32-byte-per-lane digest to dst
+        skein_big_close_8way(cc, 0, 0, dst, 32);
+}
+
+void
+skein512_8way_update(void *cc, const void *data, size_t len)
+{  // thin wrapper: forwards cc, data and len unchanged to skein_big_core_8way
+   skein_big_core_8way(cc, data, len);
+}
+
+void
+skein512_8way_close(void *cc, void *dst)
+{  // finalize and write a 64-byte-per-lane digest to dst
+        skein_big_close_8way(cc, 0, 0, dst, 64);
+}
+
+#endif // AVX512

-static const sph_u64 IV512[] = {
-   SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
-   SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
-   SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
-   SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
-};
-*/

 void skein256_4way_init( skein256_4way_context *sc )
 {
@@ -517,66 +767,30 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
 	ptr = sc->ptr;
        const int buf_size = 64;

-	/*
-	 * At that point, if ptr == 0, then the message was empty;
-	 * otherwise, there is between 1 and 64 bytes (inclusive) which
-	 * are yet to be processed. Either way, we complete the buffer
-	 * to a full block with zeros (the Skein specification mandates
-	 * that an empty message is padded so that there is at least
-	 * one block to process).
-	 *
-	 * Once this block has been processed, we do it again, with
-	 * a block full of zeros, for the output (that block contains
-	 * the encoding of "0", over 8 bytes, then padded with zeros).
-	 */
-
 	READ_STATE_BIG(sc);

-        memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+   memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
 	et = 352 + ((bcount == 0) << 7);
-        UBI_BIG_4WAY( et, ptr );
+   UBI_BIG_4WAY( et, ptr );

-        memset_zero_256( buf, buf_size >> 3 );
-        bcount = 0;
-        UBI_BIG_4WAY( 510, 8 );
+   memset_zero_256( buf, buf_size >> 3 );
+   bcount = 0;
+   UBI_BIG_4WAY( 510, 8 );

-        buf[0] = h0;
-        buf[1] = h1;
-        buf[2] = h2;
-        buf[3] = h3;
-        buf[4] = h4;
-        buf[5] = h5;
-        buf[6] = h6;
-        buf[7] = h7;
+   buf[0] = h0;
+   buf[1] = h1;
+   buf[2] = h2;
+   buf[3] = h3;
+   buf[4] = h4;
+   buf[5] = h5;
+   buf[6] = h6;
+   buf[7] = h7;

-        memcpy_256( dst, buf, out_len >> 3 );
+   memcpy_256( dst, buf, out_len >> 3 );
 }

-/*
-static const sph_u64 IV256[] = {
-	SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
-	SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
-	SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
-	SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
-};
-
-static const sph_u64 IV512[] = {
-	SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
-	SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
-	SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
-	SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
-};
-*/
-/*
 void
-skein256_4way_init(void *cc)
-{
-	skein_big_init_4way(cc, IV256);
-}
-*/
-
-void
-skein256_4way(void *cc, const void *data, size_t len)
+skein256_4way_update(void *cc, const void *data, size_t len)
 {
 	skein_big_core_4way(cc, data, len);
 }
@@ -587,16 +801,8 @@ skein256_4way_close(void *cc, void *dst)
        skein_big_close_4way(cc, 0, 0, dst, 32);
 }

-/*
 void
-skein512_4way_init(void *cc)
-{
-	skein_big_init_4way(cc, IV512);
-}
-*/
-
-void
-skein512_4way(void *cc, const void *data, size_t len)
+skein512_4way_update(void *cc, const void *data, size_t len)
 {
 	skein_big_core_4way(cc, data, len);
 }
--- a/algo/skein/skein-hash-4way.h
+++ b/algo/skein/skein-hash-4way.h
@@ -55,29 +55,50 @@ extern "C"{
 #define SPH_SIZE_skein256   256
 #define SPH_SIZE_skein512   512

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 typedef struct
 {
-   __m256i buf[8] __attribute__ ((aligned (64)));
+   __m512i buf[8];
+   __m512i h0, h1, h2, h3, h4, h5, h6, h7;
+   size_t ptr;
+   sph_u64 bcount;
+} sph_skein_8way_big_context __attribute__ ((aligned (128)));
+
+typedef sph_skein_8way_big_context skein512_8way_context;
+typedef sph_skein_8way_big_context skein256_8way_context;
+
+void skein512_8way_init( skein512_8way_context *sc );
+void skein512_8way_update( void *cc, const void *data, size_t len );
+void skein512_8way_close( void *cc, void *dst );
+
+void skein256_8way_init( skein256_8way_context *sc );
+void skein256_8way_update( void *cc, const void *data, size_t len );
+void skein256_8way_close( void *cc, void *dst );
+
+#endif // AVX512
+   
+typedef struct
+{
+   __m256i buf[8];
   __m256i h0, h1, h2, h3, h4, h5, h6, h7;
   size_t ptr;
 	sph_u64 bcount;
-} sph_skein_4way_big_context;
+} sph_skein_4way_big_context __attribute__ ((aligned (128)));

 typedef sph_skein_4way_big_context skein512_4way_context;
 typedef sph_skein_4way_big_context skein256_4way_context;

 void skein512_4way_init( skein512_4way_context *sc );
-void skein512_4way( void *cc, const void *data, size_t len );
+void skein512_4way_update( void *cc, const void *data, size_t len );
 void skein512_4way_close( void *cc, void *dst );
-//void sph_skein512_addbits_and_close(
-//        void *cc, unsigned ub, unsigned n, void *dst);
+#define skein512_4way skein512_4way_update

 void skein256_4way_init( skein256_4way_context *sc );
-void skein256_4way( void *cc, const void *data, size_t len );
+void skein256_4way_update( void *cc, const void *data, size_t len );
 void skein256_4way_close( void *cc, void *dst );
-//void sph_skein256_addbits_and_close(
-//	void *cc, unsigned ub, unsigned n, void *dst);
-
+#define skein256_4way skein256_4way_update

 #ifdef __cplusplus
 }
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -1,9 +1,66 @@
-#include "skein2-gate.h"
+#include "skein-gate.h"
 #include <string.h>
 #include <stdint.h>
 #include "skein-hash-4way.h"

-#if defined(SKEIN2_4WAY)
+#if defined(SKEIN_8WAY)
+
+void skein2hash_8way( void *output, const void *input )   // two chained skein512 8-way passes
+{
+   skein512_8way_context ctx;
+   uint64_t hash[16*8] __attribute__ ((aligned (128)));   // holds the first-pass digest
+
+   skein512_8way_init( &ctx );
+   skein512_8way_update( &ctx, input, 80 );   // first pass: len 80 of input
+   skein512_8way_close( &ctx, hash );
+
+   skein512_8way_init( &ctx );   // context is re-initialised for the second pass
+   skein512_8way_update( &ctx, hash, 64 );   // second pass: len 64 of the first digest
+   skein512_8way_close( &ctx, output );
+}
+
+int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )   // always returns 0
+{
+    uint32_t hash[16*8] __attribute__ ((aligned (128)));
+    uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+    uint32_t *hash7 = &(hash[49]);   // read as hash7[lane<<1] below; presumably word 7 of lane 0 in the 8x64 layout
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t first_nonce = pdata[19];   // scan starts at the nonce already stored in the work data
+    uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+    int thr_id = mythr->id; 
+
+    mm512_bswap32_intrlv80_8x64( vdata, pdata );
+    do
+    {
+       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );   // n..n+7 occupy the odd 32-bit elements
+
+       skein2hash_8way( hash, vdata );
+
+       for ( int lane = 0; lane < 8; lane++ )
+       if ( hash7[ lane<<1 ] <= Htarg )   // one-word pre-filter before the full target test
+       {
+          extr_lane_8x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          {
+             pdata[19] = n + lane;
+             submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+       }
+       n += 8;   // eight nonces are hashed per iteration
+    } while ( (n < max_nonce-8) && !work_restart[thr_id].restart );
+
+    *hashes_done = n - first_nonce + 1;
+    return 0;
+}
+
+#elif defined(SKEIN_4WAY)

 void skein2hash_4way( void *output, const void *input )
 {
--- a/algo/skein/skein2-gate.c
+++ b/algo/skein/skein2-gate.c
@@ -1,17 +0,0 @@
-#include "skein2-gate.h"
-#include <stdint.h>
-#include "sph_skein.h"
-
-bool register_skein2_algo( algo_gate_t* gate )
-{
-  gate->optimizations = AVX2_OPT;
-#if defined (SKEIN2_4WAY)
-  gate->scanhash  = (void*)&scanhash_skein2_4way;
-  gate->hash      = (void*)&skein2hash_4way;
-#else
-  gate->scanhash  = (void*)&scanhash_skein2;
-  gate->hash      = (void*)&skein2hash;
-#endif
-  return true;
-};
-
--- a/algo/skein/skein2-gate.h
+++ b/algo/skein/skein2-gate.h
@@ -1,20 +0,0 @@
-#ifndef __SKEIN2GATE_H__
-#define __SKEIN2_GATE_H__
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-#if defined(__AVX2__)
-  #define SKEIN2_4WAY
-#endif
-
-#if defined(SKEIN2_4WAY)
-void skein2hash_4way( void *output, const void *input );
-int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
-                          uint64_t* hashes_done, struct thr_info *mythr );
-#endif
-
-void skein2hash( void *output, const void *input );
-int scanhash_skein2( struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done, struct thr_info *mythr );
-#endif
-
--- a/algo/skein/skein2.c
+++ b/algo/skein/skein2.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "skein-gate.h"
 #include <string.h>
 #include <stdint.h>

--- a/algo/x11/timetravel-4way.c
+++ b/algo/x11/timetravel-4way.c
@@ -45,12 +45,12 @@ void init_tt8_4way_ctx()

 void timetravel_4way_hash(void *output, const void *input)
 {
-   uint64_t hash0[8] __attribute__ ((aligned (64)));
-   uint64_t hash1[8] __attribute__ ((aligned (64)));
-   uint64_t hash2[8] __attribute__ ((aligned (64)));
-   uint64_t hash3[8] __attribute__ ((aligned (64)));
-   uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
-   uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash0[10] __attribute__ ((aligned (64)));
+   uint64_t hash1[10] __attribute__ ((aligned (64)));
+   uint64_t hash2[10] __attribute__ ((aligned (64)));
+   uint64_t hash3[10] __attribute__ ((aligned (64)));
+   uint64_t vhashX[10*4] __attribute__ ((aligned (64)));
+   uint64_t vhashY[10*4] __attribute__ ((aligned (64)));
   uint64_t *vhashA, *vhashB;
   tt8_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   uint32_t dataLen = 64;
--- a/algo/x11/timetravel10-4way.c
+++ b/algo/x11/timetravel10-4way.c
@@ -51,12 +51,12 @@ void init_tt10_4way_ctx()

 void timetravel10_4way_hash(void *output, const void *input)
 {
-   uint64_t hash0[8] __attribute__ ((aligned (64)));
-   uint64_t hash1[8] __attribute__ ((aligned (64)));
-   uint64_t hash2[8] __attribute__ ((aligned (64)));
-   uint64_t hash3[8] __attribute__ ((aligned (64)));
-   uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
-   uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash0[10] __attribute__ ((aligned (64)));
+   uint64_t hash1[10] __attribute__ ((aligned (64)));
+   uint64_t hash2[10] __attribute__ ((aligned (64)));
+   uint64_t hash3[10] __attribute__ ((aligned (64)));
+   uint64_t vhashX[10*4] __attribute__ ((aligned (64)));
+   uint64_t vhashY[10*4] __attribute__ ((aligned (64)));
   uint64_t *vhashA, *vhashB;
   tt10_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   uint32_t dataLen = 64;
--- a/algo/x12/x12-4way.c
+++ b/algo/x12/x12-4way.c
@@ -108,7 +108,7 @@ void x12_4way_hash( void *state, const void *input )
     intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     intrlv_2x128( hash2, hash3, vhash, 512 );
+     dintrlv_2x128( hash2, hash3, vhash, 512 );

     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
--- a/algo/yespower/yespower-blake2b.c
+++ b/algo/yespower/yespower-blake2b.c
@@ -49,6 +49,7 @@
 * no slowdown from the prefixes is generally observed on AMD CPUs supporting
 * XOP, some slowdown is sometimes observed on Intel CPUs with AVX.
 */
+/*
 #ifdef __XOP__
 #warning "Note: XOP is enabled.  That's great."
 #elif defined(__AVX__)
@@ -60,6 +61,7 @@
 #else
 #warning "Note: building generic code for non-x86.  That's OK."
 #endif
+*/

 /*
 * The SSE4 code version has fewer instructions than the generic SSE2 version,
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -16,7 +16,8 @@ mv cpuminer cpuminer-avx512

 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
+# GCC 9 doesn't include AES with core-avx2
+CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-avx2.exe
@@ -25,7 +26,7 @@ mv cpuminer cpuminer-avx2

 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl
+CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-aes-avx.exe
--- a/build-allarch.sh.bak
+++ b/build-allarch.sh.bak
@@ -0,0 +1,86 @@
+#!/bin/bash
+#
+# This script is not intended for users, it is only used for compile testing
+# during development. However, the information contained may provide compilation
+# tips to users.
+
+make distclean || echo clean
+rm -f config.status
+./autogen.sh || echo done
+CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure --with-curl
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-avx512.exe
+strip -s cpuminer
+mv cpuminer cpuminer-avx512
+
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-avx2.exe
+strip -s cpuminer
+mv cpuminer cpuminer-avx2
+
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-aes-avx.exe
+strip -s cpuminer
+mv cpuminer cpuminer-aes-avx
+
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-aes-sse42.exe
+strip -s cpuminer
+mv cpuminer cpuminer-aes-sse42
+
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-sse42.exe
+strip -s cpuminer
+mv cpuminer cpuminer-sse42
+
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-ssse3.exe
+strip -s cpuminer
+mv cpuminer cpuminer-ssse3
+
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-sse2.exe
+strip -s cpuminer
+mv cpuminer cpuminer-sse2
+
+make clean || echo done
+rm -f config.status
+CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-zen.exe
+strip -s cpuminer
+mv cpuminer cpuminer-zen
+
+make clean || echo done
+rm -f config.status
+CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
+make -j 16
+strip -s cpuminer.exe
+strip -s cpuminer
+
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.11.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.0.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.9.11'
-PACKAGE_STRING='cpuminer-opt 3.9.11'
+PACKAGE_VERSION='3.10.0'
+PACKAGE_STRING='cpuminer-opt 3.10.0'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.9.11 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.10.0 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.9.11:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.10.0:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.9.11
+cpuminer-opt configure 3.10.0
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.9.11, which was
+It was created by cpuminer-opt $as_me 3.10.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.9.11'
+ VERSION='3.10.0'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.9.11, which was
+This file was extended by cpuminer-opt $as_me 3.10.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.9.11
+cpuminer-opt config.status 3.10.0
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.9.11])
+AC_INIT([cpuminer-opt], [3.10.0])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -53,6 +53,8 @@
 #if HAVE_SYS_PARAM_H
 #include <sys/param.h>
 #endif
+
+// GCC 9 warning sysctl.h is deprecated
 #include <sys/sysctl.h>
 #endif
 #endif
@@ -3339,12 +3341,14 @@ bool check_cpu_capability ()
     bool cpu_has_avx2   = has_avx2();
     bool cpu_has_sha    = has_sha();
     bool cpu_has_avx512 = has_avx512();
+     bool cpu_has_vaes   = has_vaes();
     bool sw_has_aes    = false;
     bool sw_has_sse42  = false;
     bool sw_has_avx    = false;
     bool sw_has_avx2   = false;
     bool sw_has_avx512 = false;
     bool sw_has_sha    = false;
+     bool sw_has_vaes   = false;
     set_t algo_features = algo_gate.optimizations;
     bool algo_has_sse2   = set_incl( SSE2_OPT,    algo_features );
     bool algo_has_aes    = set_incl( AES_OPT,     algo_features );
@@ -3352,12 +3356,14 @@ bool check_cpu_capability ()
     bool algo_has_avx2   = set_incl( AVX2_OPT,    algo_features );
     bool algo_has_avx512 = set_incl( AVX512_OPT,  algo_features );
     bool algo_has_sha    = set_incl( SHA_OPT,     algo_features );
+     bool algo_has_vaes   = set_incl( VAES_OPT,    algo_features );
     bool use_aes;
     bool use_sse2;
     bool use_sse42;
     bool use_avx2;
     bool use_avx512;
     bool use_sha;
+     bool use_vaes;
     bool use_none;

     #ifdef __AES__
@@ -3372,12 +3378,16 @@ bool check_cpu_capability ()
     #ifdef __AVX2__
         sw_has_avx2 = true;
     #endif
-     #if (defined(__AVX512F__) && defined(__AVX51DQF__) && defined(__AVX51BW__) && defined(__AVX512VL__))
+     #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
         sw_has_avx512 = true;
     #endif
     #ifdef __SHA__
         sw_has_sha = true;
     #endif
+     #ifdef __VAES__
+         sw_has_vaes = true;
+     #endif
+         

 //     #if !((__AES__) || (__SSE2__))
 //         printf("Neither __AES__ nor __SSE2__ defined.\n");
@@ -3404,6 +3414,7 @@ bool check_cpu_capability ()
     if ( cpu_has_avx2   )    printf( " AVX2"   );
     if ( cpu_has_avx512 )    printf( " AVX512" );
     if ( cpu_has_sha    )    printf( " SHA"    );
+     if ( cpu_has_vaes   )    printf( " VAES"   );

     printf(".\nSW features: SSE2");
     if ( sw_has_aes    )     printf( " AES"    );
@@ -3412,18 +3423,20 @@ bool check_cpu_capability ()
     if ( sw_has_avx2   )     printf( " AVX2"   );
     if ( sw_has_avx512 )     printf( " AVX512" );
     if ( sw_has_sha    )     printf( " SHA"    );
+     if ( sw_has_vaes   )     printf( " VAES"   );
    

     printf(".\nAlgo features:");
     if ( algo_features == EMPTY_SET ) printf( " None" );
     else
     {
-        if ( algo_has_sse2   ) printf( " SSE2"    );
-        if ( algo_has_aes    ) printf( " AES"     );
-        if ( algo_has_sse42  ) printf( " SSE4.2"  );
+        if ( algo_has_sse2   ) printf( " SSE2"   );
+        if ( algo_has_aes    ) printf( " AES"    );
+        if ( algo_has_sse42  ) printf( " SSE4.2" );
        if ( algo_has_avx2   ) printf( " AVX2"   );
        if ( algo_has_avx512 ) printf( " AVX512" );
        if ( algo_has_sha    ) printf( " SHA"    );
+        if ( algo_has_vaes   ) printf( " VAES"   );
     }
     printf(".\n");

@@ -3461,8 +3474,9 @@ bool check_cpu_capability ()
     use_avx2   = cpu_has_avx2   && sw_has_avx2   && algo_has_avx2;
     use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
     use_sha    = cpu_has_sha    && sw_has_sha    && algo_has_sha;
+     use_vaes   = cpu_has_vaes   && sw_has_vaes   && algo_has_vaes;
     use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
-                   use_sha );
+                   use_sha || use_vaes );
      
     // Display best options
     printf( "Start mining with" );
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -575,12 +575,26 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
  __m128i s3 = casti_m128i( src,3 );
  __m128i s4 = casti_m128i( src,4 );

+#if defined(__SSSE3__)
+
+  __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
+
+  s0 = _mm_shuffle_epi8( s0, bswap_shuf );
+  s1 = _mm_shuffle_epi8( s1, bswap_shuf );
+  s2 = _mm_shuffle_epi8( s2, bswap_shuf );
+  s3 = _mm_shuffle_epi8( s3, bswap_shuf );
+  s4 = _mm_shuffle_epi8( s4, bswap_shuf );
+
+#else
+
  s0 = mm128_bswap_32( s0 );
  s1 = mm128_bswap_32( s1 );
  s2 = mm128_bswap_32( s2 );
  s3 = mm128_bswap_32( s3 );
  s4 = mm128_bswap_32( s4 );

+#endif
+
  casti_m128i( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
  casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
  casti_m128i( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
@@ -742,17 +756,18 @@ static inline void extr_lane_8x32( void *d, const void *s,

 static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
 {
-   __m128i s0 = casti_m128i( src,0 );
-   __m128i s1 = casti_m128i( src,1 );
-   __m128i s2 = casti_m128i( src,2 );
-   __m128i s3 = casti_m128i( src,3 );
-   __m128i s4 = casti_m128i( src,4 );
+  __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
+  __m128i s0 = casti_m128i( src,0 );
+  __m128i s1 = casti_m128i( src,1 );
+  __m128i s2 = casti_m128i( src,2 );
+  __m128i s3 = casti_m128i( src,3 );
+  __m128i s4 = casti_m128i( src,4 );

-   s0 = mm128_bswap_32( s0 );
-   s1 = mm128_bswap_32( s1 );
-   s2 = mm128_bswap_32( s2 );
-   s3 = mm128_bswap_32( s3 );
-   s4 = mm128_bswap_32( s4 );
+  s0 = _mm_shuffle_epi8( s0, bswap_shuf );
+  s1 = _mm_shuffle_epi8( s1, bswap_shuf );
+  s2 = _mm_shuffle_epi8( s2, bswap_shuf );
+  s3 = _mm_shuffle_epi8( s3, bswap_shuf );
+  s4 = _mm_shuffle_epi8( s4, bswap_shuf );

   casti_m128i( d, 0 ) = 
   casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0 , 0x00 );
@@ -960,17 +975,18 @@ static inline void extr_lane_16x32( void *d, const void *s,

 static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
 {
-   __m128i s0 = casti_m128i( src,0 );
-   __m128i s1 = casti_m128i( src,1 );
-   __m128i s2 = casti_m128i( src,2 );
-   __m128i s3 = casti_m128i( src,3 );
-   __m128i s4 = casti_m128i( src,4 );
+  __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
+  __m128i s0 = casti_m128i( src,0 );
+  __m128i s1 = casti_m128i( src,1 );
+  __m128i s2 = casti_m128i( src,2 );
+  __m128i s3 = casti_m128i( src,3 );
+  __m128i s4 = casti_m128i( src,4 );

-   s0 = mm128_bswap_32( s0 );
-   s1 = mm128_bswap_32( s1 );
-   s2 = mm128_bswap_32( s2 );
-   s3 = mm128_bswap_32( s3 );
-   s4 = mm128_bswap_32( s4 );
+  s0 = _mm_shuffle_epi8( s0, bswap_shuf );
+  s1 = _mm_shuffle_epi8( s1, bswap_shuf );
+  s2 = _mm_shuffle_epi8( s2, bswap_shuf );
+  s3 = _mm_shuffle_epi8( s3, bswap_shuf );
+  s4 = _mm_shuffle_epi8( s4, bswap_shuf );

   casti_m128i( d, 0 ) = 
   casti_m128i( d, 1 ) = 
@@ -1374,17 +1390,18 @@ static inline void extr_lane_4x64( void *d, const void *s,

 static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 {
-  __m128i s0 = casti_m128i( src, 0 );
-  __m128i s1 = casti_m128i( src, 1 );
-  __m128i s2 = casti_m128i( src, 2 );
-  __m128i s3 = casti_m128i( src, 3 );
-  __m128i s4 = casti_m128i( src, 4 );
+  __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
+  __m128i s0 = casti_m128i( src,0 );
+  __m128i s1 = casti_m128i( src,1 );
+  __m128i s2 = casti_m128i( src,2 );
+  __m128i s3 = casti_m128i( src,3 );
+  __m128i s4 = casti_m128i( src,4 );

-  s0 = mm128_bswap_32( s0 );
-  s1 = mm128_bswap_32( s1 );
-  s2 = mm128_bswap_32( s2 );
-  s3 = mm128_bswap_32( s3 );
-  s4 = mm128_bswap_32( s4 );
+  s0 = _mm_shuffle_epi8( s0, bswap_shuf );
+  s1 = _mm_shuffle_epi8( s1, bswap_shuf );
+  s2 = _mm_shuffle_epi8( s2, bswap_shuf );
+  s3 = _mm_shuffle_epi8( s3, bswap_shuf );
+  s4 = _mm_shuffle_epi8( s4, bswap_shuf );

  casti_m128i( d,  0 ) = 
  casti_m128i( d,  1 ) = _mm_shuffle_epi32( s0, 0x44 );
@@ -1556,7 +1573,7 @@ static inline void dintrlv_8x64( void *dst0, void *dst1, void *dst2,
   __m128i *d3 = (__m128i*)dst3;
   __m128i *d4 = (__m128i*)dst4;
   __m128i *d5 = (__m128i*)dst5;
-   __m128i *d6 = (__m128i*)dst5;
+   __m128i *d6 = (__m128i*)dst6;
   __m128i *d7 = (__m128i*)dst7;
   const __m128i* s = (const __m128i*)src;

@@ -1690,17 +1707,18 @@ static inline void extr_lane_8x64( void *d, const void *s,

 static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
 {
-  __m128i s0 = casti_m128i( src, 0 );
-  __m128i s1 = casti_m128i( src, 1 );
-  __m128i s2 = casti_m128i( src, 2 );
-  __m128i s3 = casti_m128i( src, 3 );
-  __m128i s4 = casti_m128i( src, 4 );
+  __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
+  __m128i s0 = casti_m128i( src,0 );
+  __m128i s1 = casti_m128i( src,1 );
+  __m128i s2 = casti_m128i( src,2 );
+  __m128i s3 = casti_m128i( src,3 );
+  __m128i s4 = casti_m128i( src,4 );

-  s0 = mm128_bswap_32( s0 );
-  s1 = mm128_bswap_32( s1 );
-  s2 = mm128_bswap_32( s2 );
-  s3 = mm128_bswap_32( s3 );
-  s4 = mm128_bswap_32( s4 );
+  s0 = _mm_shuffle_epi8( s0, bswap_shuf );
+  s1 = _mm_shuffle_epi8( s1, bswap_shuf );
+  s2 = _mm_shuffle_epi8( s2, bswap_shuf );
+  s3 = _mm_shuffle_epi8( s3, bswap_shuf );
+  s4 = _mm_shuffle_epi8( s4, bswap_shuf );

  casti_m128i( d,  0 ) =
  casti_m128i( d,  1 ) =
@@ -1746,7 +1764,6 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
  casti_m128i( d, 37 ) =
  casti_m128i( d, 38 ) =
  casti_m128i( d, 39 ) = _mm_shuffle_epi32( s4, 0xee );
-  
 }

 #endif  // AVX512
@@ -1967,6 +1984,68 @@ static inline void rintrlv_4x64_4x32( void *dst, const void *src,

 #undef RLEAVE_4x64_4x32

+#define RLEAVE_8x64_8x32( i ) do \
+{ /* Re-interleave one 16-word group starting at uint32 offset i of src/dst. */ \
+   uint32_t *d = (uint32_t*)dst + (i); \
+   const uint32_t *s = (const uint32_t*)src + (i); \
+   d[ 0] = s[ 0];  d[ 1] = s[ 2];  d[ 2] = s[ 4];  d[ 3] = s[ 6]; /* even words -> d[0..7] */ \
+   d[ 4] = s[ 8];  d[ 5] = s[10];  d[ 6] = s[12];  d[ 7] = s[14]; \
+   d[ 8] = s[ 1];  d[ 9] = s[ 3];  d[10] = s[ 5];  d[11] = s[ 7]; /* odd words -> d[8..15] */ \
+   d[12] = s[ 9];  d[13] = s[11];  d[14] = s[13];  d[15] = s[15]; /* was d[16]: wrote past the group, left d[15] unset */ \
+} while(0)
+
+
+// 8x64 -> 8x32
+
+static inline void rintrlv_8x64_8x32( void *dst, const void *src,
+                                      const int  bit_len )   // bit_len selects how many 16-word groups are converted
+{
+   RLEAVE_8x64_8x32(   0 );   RLEAVE_8x64_8x32(  16 );
+   RLEAVE_8x64_8x32(  32 );   RLEAVE_8x64_8x32(  48 );
+   RLEAVE_8x64_8x32(  64 );   RLEAVE_8x64_8x32(  80 );
+   RLEAVE_8x64_8x32(  96 );   RLEAVE_8x64_8x32( 112 );
+
+   RLEAVE_8x64_8x32( 128 );   RLEAVE_8x64_8x32( 144 );
+   RLEAVE_8x64_8x32( 160 );   RLEAVE_8x64_8x32( 176 );
+   RLEAVE_8x64_8x32( 192 );   RLEAVE_8x64_8x32( 208 );
+   RLEAVE_8x64_8x32( 224 );   RLEAVE_8x64_8x32( 240 );
+   
+   if ( bit_len <= 256 ) return;   // 16 groups (256 uint32 words) done; NOTE(review): confirm callers size buffers for this
+
+   RLEAVE_8x64_8x32( 256 );   RLEAVE_8x64_8x32( 272 );
+   RLEAVE_8x64_8x32( 288 );   RLEAVE_8x64_8x32( 304 );
+   RLEAVE_8x64_8x32( 320 );   RLEAVE_8x64_8x32( 336 );
+   RLEAVE_8x64_8x32( 352 );   RLEAVE_8x64_8x32( 368 );
+
+   RLEAVE_8x64_8x32( 384 );   RLEAVE_8x64_8x32( 400 );
+   RLEAVE_8x64_8x32( 416 );   RLEAVE_8x64_8x32( 432 );
+   RLEAVE_8x64_8x32( 448 );   RLEAVE_8x64_8x32( 464 );
+   RLEAVE_8x64_8x32( 480 );   RLEAVE_8x64_8x32( 496 );
+
+   if ( bit_len <= 512 ) return;   // 32 groups (512 uint32 words) done
+
+   RLEAVE_8x64_8x32( 512 );   RLEAVE_8x64_8x32( 528 );
+   RLEAVE_8x64_8x32( 544 );   RLEAVE_8x64_8x32( 560 );
+   RLEAVE_8x64_8x32( 576 );   RLEAVE_8x64_8x32( 592 );
+   RLEAVE_8x64_8x32( 608 );   RLEAVE_8x64_8x32( 624 );
+
+   RLEAVE_8x64_8x32( 640 );   RLEAVE_8x64_8x32( 656 );
+   RLEAVE_8x64_8x32( 672 );   RLEAVE_8x64_8x32( 688 );
+   RLEAVE_8x64_8x32( 704 );   RLEAVE_8x64_8x32( 720 );
+   RLEAVE_8x64_8x32( 736 );   RLEAVE_8x64_8x32( 752 );
+
+   RLEAVE_8x64_8x32( 768 );   RLEAVE_8x64_8x32( 784 );
+   RLEAVE_8x64_8x32( 800 );   RLEAVE_8x64_8x32( 816 );
+   RLEAVE_8x64_8x32( 832 );   RLEAVE_8x64_8x32( 848 );
+   RLEAVE_8x64_8x32( 864 );   RLEAVE_8x64_8x32( 880 );
+
+   RLEAVE_8x64_8x32( 896 );   RLEAVE_8x64_8x32( 912 );
+   RLEAVE_8x64_8x32( 928 );   RLEAVE_8x64_8x32( 944 );
+   RLEAVE_8x64_8x32( 960 );   RLEAVE_8x64_8x32( 976 );
+   RLEAVE_8x64_8x32( 992 );   RLEAVE_8x64_8x32(1008 );   // last group ends at word 1023
+}
+
+#undef RLEAVE_8x64_8x32

 // 4x32 -> 4x64

@@ -2067,7 +2146,7 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
   d[13] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] );
   d[14] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] );
   d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] );
-   if ( bit_len <= 256 ) return;
+   if ( bit_len <= 512 ) return;
   d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] );
   d[17] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] );
   d[18] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] );
@@ -2189,15 +2268,15 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
 #if defined(__SSE4_1__)
 // No SSE2 implementation.

-#define mm128_intrlv_blend_64( hi, lo )   _mm_blend_epi16( hi, lo, 0x0f )
-#define mm128_intrlv_blend_32( hi, lo )   _mm_blend_epi16( hi, lo, 0x33 )
+//#define mm128_intrlv_blend_64( hi, lo )   _mm_blend_epi16( hi, lo, 0x0f )
+//#define mm128_intrlv_blend_32( hi, lo )   _mm_blend_epi16( hi, lo, 0x33 )

 #endif   // SSE4_1

 #if defined(__AVX2__)

-#define mm256_intrlv_blend_128( hi, lo )  _mm256_blend_epi32( hi, lo, 0x0f )
-#define mm256_intrlv_blend_64( hi, lo )   _mm256_blend_epi32( hi, lo, 0x33 )
+//#define mm256_intrlv_blend_128( hi, lo )  _mm256_blend_epi32( hi, lo, 0x0f )
+//#define mm256_intrlv_blend_64( hi, lo )   _mm256_blend_epi32( hi, lo, 0x33 )
 #define mm256_intrlv_blend_32( hi, lo )   _mm256_blend_epi32( hi, lo, 0x55 )

 // Select lanes of 32 byte hash from 2 sources according to control mask.
@@ -2216,4 +2295,18 @@ do { \

 #endif  // AVX2

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+/*
+#define mm512_intrlv_blend_128( hi, lo ) \
+   _mm512_mask_blend_epi32( 0x0f0f, hi, lo )
+
+#define mm512_intrlv_blend_64( hi, lo ) \
+   _mm512_mask_blend_epi32( 0x3333, hi, lo )
+*/
+
+#define mm512_intrlv_blend_32( hi, lo ) \
+   _mm512_mask_blend_epi32( 0x5555, hi, lo )
+
+#endif // AVX512
 #endif // INTERLEAVE_H__
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -242,7 +242,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )


-/*
+
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #define mm128_ror_64    _mm_ror_epi64
@@ -251,14 +251,14 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_rol_32    _mm_rol_epi32

 #else
-*/
+

 #define mm128_ror_64   mm128_ror_var_64
 #define mm128_rol_64   mm128_rol_var_64
 #define mm128_ror_32   mm128_ror_var_32
 #define mm128_rol_32   mm128_rol_var_32

-//#endif   // AVX512 else
+#endif   // AVX512 else

 #define mm128_ror_16( v, c ) \
   _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -233,7 +233,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                    _mm256_srli_epi32( v, 32-(c) ) )

-/*
+
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 // AVX512, control must be 8 bit immediate.
@@ -244,7 +244,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 #define mm256_rol_32    _mm256_rol_epi32

 #else
-*/
+

 // No AVX512, use fallback.

@@ -253,7 +253,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 #define mm256_ror_32    mm256_ror_var_32
 #define mm256_rol_32    mm256_rol_var_32

-// #endif     // AVX512 else
+#endif     // AVX512 else

 #define  mm256_ror_16( v, c ) \
   _mm256_or_si256( _mm256_srli_epi16( v, c ), \
@@ -311,7 +311,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 // AVX512 has finer granularity full vector permutes.
 // AVX512 has full vector alignr which might be faster, especially for 32 bit

-/*
+
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #define mm256_swap_128( v )   _mm256_alignr_epi64( v, v, 2 )
@@ -323,7 +323,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 #define mm256_rol_3x32( v )   _mm256_alignr_epi32( v, v, 5 )

 #else   // AVX2
-*/

 // Swap 128 bit elements in 256 bit vector.
 #define mm256_swap_128( v )     _mm256_permute4x64_epi64( v, 0x4e )
@@ -354,7 +353,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
                     m256_const_64( 0x0000000400000003, 0x0000000200000001, \
                                    0x0000000000000007, 0x0000000600000005 )

-//#endif    // AVX512 else AVX2
+#endif    // AVX512 else AVX2


 // AVX512 can do 16 & 8 bit elements.
@@ -423,21 +422,25 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 #define mm256_ror1x32_128( v )  _mm256_shuffle_epi32( v, 0x39 )
 #define mm256_rol1x32_128( v )  _mm256_shuffle_epi32( v, 0x93 )

-// Rotate each 128 bit lane by one 16 bit element.
 #define mm256_ror1x16_128( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x01000f0e0d0c0b0a, \
-                                                    0x0908070605040302 ) )
-#define mm256_rol1x16_128( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080706, \
-                                                    0x0504030201000f0e ) )
+   _mm256_shuffle_epi8( v, \
+         m256_const_64( 0x11101f1e1d1c1b1a, 0x1918171615141312, \
+                        0x01000f0e0d0c0b0a, 0x0908070605040302 ) )
+
+#define mm256_rol1x16_128( v ) \
+   _mm256_shuffle_epi8( v, \
+         m256_const_64( 0x1d1c1b1a19181716, 0x1514131211101f1e, \
+                        0x0d0c0b0a09080706, 0x0504030201000f0e ) )

-// Rotate each 128 bit lane by one byte
 #define mm256_ror1x8_128( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x000f0e0d0c0b0a09, \
-                                                    0x0807060504030201 ) )
+   _mm256_shuffle_epi8( v, \
+         m256_const_64( 0x101f1e1d1c1b1a19, 0x1817161514131211, \
+                        0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
+
 #define mm256_rol1x8_128( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080f0e, \
-                                                    0x0504030201000706 ) )
+   _mm256_shuffle_epi8( v, \
+         m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
+                        0x0d0c0b0a09080f0e, 0x0504030201000706 ) )

 // Rotate each 128 bit lane by c bytes.
 #define mm256_bror_128( v, c ) \
@@ -451,50 +454,65 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 #define mm256_swap32_64( v )    _mm256_shuffle_epi32( v, 0xb1 )

 #define mm256_ror1x16_64( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x09080f0e0d0c0b0a, \
-                                                    0x0100070605040302 ) )
+   _mm256_shuffle_epi8( v, \
+        m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \
+                       0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
+
 #define mm256_rol1x16_64( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080f0e, \
-                                                    0x0504030201000706 ) )
+   _mm256_shuffle_epi8( v, \
+        m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
+                       0x0d0c0b0a09080f0e, 0x0504030201000706 ) )

 #define mm256_ror1x8_64( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x080f0e0d0c0b0a09, \
-                                                    0x0007060504030201 ) )
+   _mm256_shuffle_epi8( v, \
+        m256_const_64( 0x181f1e1d1c1b1a19, 0x1017161514131211, \
+                       0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
+
 #define mm256_rol1x8_64( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x0e0d0c0b0a09080f, \
-                                                    0x0605040302010007 ) )
+   _mm256_shuffle_epi8( v, \
+        m256_const_64( 0x1e1d1c1b1a19181f, 0x1615141312111017, \
+                       0x0e0d0c0b0a09080f, 0x0605040302010007 ) )

 #define mm256_ror3x8_64( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x0a09080f0e0d0c0b, \
-                                                    0x0201000706050403 ) )
+   _mm256_shuffle_epi8( v, \
+        m256_const_64( 0x1a19181f1e1d1c1b, 0x1211101716151413, \
+                       0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
+
 #define mm256_rol3x8_64( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x0c0b0a09080f0e0d, \
-                                                    0x0403020100070605 ) )
+   _mm256_shuffle_epi8( v, \
+        m256_const_64( 0x1c1b1a19181f1e1d, 0x1413121110171615, \
+                       0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
+

 // Swap 16 bit elements in each 32 bit lane
 #define mm256_swap16_32( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x0b0a09080f0e0d0c, \
-                                                    0x0302010007060504 ) )
+   _mm256_shuffle_epi8( v, \
+         m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \
+                        0x0b0a09080f0e0d0c, 0x0302010007060504 ) )

 //
 // Swap bytes in vector elements, endian bswap.
 #define mm256_bswap_64( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x08090a0b0c0d0e0f, \
-                                                    0x0001020304050607 ) )
+   _mm256_shuffle_epi8( v, \
+         m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                        0x08090a0b0c0d0e0f, 0x0001020304050607 ) )

 #define mm256_bswap_32( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x0c0d0e0f08090a0b, \
-                                                    0x0405060700010203 ) )
+   _mm256_shuffle_epi8( v, \
+         m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
+                        0x0c0d0e0f08090a0b, 0x0405060700010203 ) )

 #define mm256_bswap_16( v ) \
-            _mm256_shuffle_epi8( v, m256_const2_64( 0x0e0f0c0d0a0b0809, \
-                                                    0x0607040502030001 ) )
+   _mm256_shuffle_epi8( v, \
+         m256_const_64( 0x1e1f1c1d1a1b1819, 0x1617141512131011, \
+                        0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )

 // Source and destination are pointers, may point to same memory.
 // 8 byte qword * 8 qwords * 4 lanes = 256 bytes
 #define mm256_block_bswap_64( d, s ) do \
 { \
-  __m256i ctl = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+  __m256i ctl = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                               0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
  casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
  casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
  casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -508,7 +526,8 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 // 4 byte dword * 8 dwords * 8 lanes = 256 bytes
 #define mm256_block_bswap_32( d, s ) do \
 { \
-  __m256i ctl = m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+  __m256i ctl = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
+                               0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
  casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
  casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
  casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -90,7 +90,7 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,

 // Equivalent of set4, broadcast 256 bits in groups of four 64 bit constants
 // to all 256 bit lanes: {i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0}.
-static inline __m512i mm512_const4_64( const uint64_t i3, const uint64_t i2,
+static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
                                       const uint64_t i1, const uint64_t i0 )
 {
   __m256i lo = mm256_mov64_256( i0 );
@@ -105,7 +105,7 @@ static inline __m512i mm512_const4_64( const uint64_t i3, const uint64_t i2,

 // Broadcast 128 bits in pairs of 64 bit constants {i1. i0} to all
 // 128 bit lanes.
-#define mm512_const2_64( i1, i0 ) \
+#define m512_const2_64( i1, i0 ) \
   _mm512_permutex_epi64( _mm512_castsi128_si512( \
                          m128_const_64( i1, i0 ) ), 0x44 )

@@ -132,7 +132,7 @@ static inline __m512i mm512_const4_64( const uint64_t i3, const uint64_t i2,
 #define m512_one_16     _mm512_broadcastw_epi16( mm128_mov64_128( 1 ) )
 #define m512_one_8      _mm512_broadcastb_epi8 ( mm128_mov64_128( 1 ) )

-#define m512_neg1 mm512_const1_64( 0xffffffffffffffff )
+#define m512_neg1 m512_const1_64( 0xffffffffffffffff )

 /* 
 // EVEX vcmpeqq returns a bit mask instead of a vector
@@ -173,6 +173,19 @@ static inline __m512i mm512_neg1_fn()
 // returns p+o as pointer to vector
 #define casto_m512i(p,o) (((__m512i*)(p))+(o))

+//
+// Memory functions
+// n = number of 512 bit (64 byte) vectors
+
+static inline void memset_zero_512( __m512i *dst, const int n )
+{   for ( int i = 0; i < n; i++ ) dst[i] = m512_zero; }
+
+static inline void memset_512( __m512i *dst, const __m512i a, const int n )
+{   for ( int i = 0; i < n; i++ ) dst[i] = a; }
+
+static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
+{   for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
+

 // Sum 4 values, fewer dependencies than sequential addition.

@@ -189,7 +202,7 @@ static inline __m512i mm512_neg1_fn()
   _mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) )

 #define mm512_xor4( a, b, c, d ) \
-   _mm512_xor_si512( _mm512_xor_si256( a, b ), _mm512_xor_si256( c, d ) )
+   _mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) )



@@ -212,6 +225,11 @@ static inline __m512i mm512_neg1_fn()
 // _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
 //

+#define mm512_ror_64 _mm512_ror_epi64
+#define mm512_rol_64 _mm512_rol_epi64
+#define mm512_ror_32 _mm512_ror_epi32
+#define mm512_rol_32 _mm512_rol_epi32
+
 #define mm512_ror_var_64( v, c ) \
   _mm512_or_si512( _mm512_srli_epi64( v, c ), \
                    _mm512_slli_epi64( v, 64-(c) ) )
@@ -249,22 +267,34 @@ static inline __m512i mm512_neg1_fn()
 // Swap bytes in vector elements, vectorized endian conversion.

 #define mm512_bswap_64( v ) \
-   _mm512_shuffle_epi8( v, m512_const2_64( \
-                                 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
+   _mm512_shuffle_epi8( v, \
+               m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
+                              0x28292a2b2c2d2e2f, 0x2021222324252627, \
+                              0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                              0x08090a0b0c0d0e0f, 0x0001020304050607 ))

 #define mm512_bswap_32( v ) \
-   _mm512_shuffle_epi8( v, m512_const2_64( \
-                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
+   _mm512_shuffle_epi8( v, \
+               m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
+                              0x2c2d2e2f28292a2b, 0x2425262720212223, \
+                              0x1c1d1e1f18191a1b, 0x1415161710111213, \
+                              0x0c0d0e0f08090a0b, 0x0405060700010203 ) )

 #define mm512_bswap_16( v ) \
-   _mm512_shuffle_epi8( v, m512_const2_64( \
-                                 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )
+   _mm512_shuffle_epi8( v, \
+               m512_const_64( 0x3e3f3c3d3a3b3839, 0x3637343532333031, \
+                              0x2e2f2c2d2a2b2829, 0x2627242522232021, \
+                              0x1e1f1c1d1a1b1819, 0x1617141512131011, \
+                              0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )

 // Source and destination are pointers, may point to same memory.
 // 8 lanes of 64 bytes each
 #define mm512_block_bswap_64( d, s ) do \
 { \
-  __m512i ctl = m512_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+  __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
+                               0x28292a2b2c2d2e2f, 0x2021222324252627, \
+                               0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                               0x08090a0b0c0d0e0f, 0x0001020304050607  ); \
  casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
  casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
  casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -278,7 +308,10 @@ static inline __m512i mm512_neg1_fn()
 // 16 lanes of 32 bytes each
 #define mm512_block_bswap_32( d, s ) do \
 { \
-  __m512i ctl = m512_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+  __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
+                               0x2c2d2e2f28292a2b, 0x2425262720212223, \
+                               0x1c1d1e1f18191a1b, 0x1415161710111213, \
+                               0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
   casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
   casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -381,6 +414,8 @@ static inline __m512i mm512_neg1_fn()
 #define mm512_ror1x64_256( v )   _mm512_permutex_epi64( v, 0x39 )
 #define mm512_rol1x64_256( v )   _mm512_permutex_epi64( v, 0x93 )

+
+/*  Disabled: the following macros need to be fixed before use.
 // Rotate 256 bit lanes by one 32 bit element
 #define mm512_ror1x32_256( v ) \
   _mm512_permutexvar_epi32( m512_const4_64( \
@@ -411,7 +446,7 @@ static inline __m512i mm512_neg1_fn()
    _mm512_shuffle_epi8( v, m512_const4_64( \
                     0x1e1d1c1b1a191817, 0x161514131211100f, \
                     0x0e0d0c0b0a090807, 0x060504030201001f ), v )
-
+*/
 //
 // Rotate elements within 128 bit lanes of 512 bit vector.

@@ -422,6 +457,7 @@ static inline __m512i mm512_neg1_fn()
 #define mm512_ror1x32_128( v )   _mm512_shuffle_epi32( v, 0x39 )
 #define mm512_rol1x32_128( v )   _mm512_shuffle_epi32( v, 0x93 )

+/*
 #define mm512_ror1x16_128( v ) \
    _mm512_permutexvar_epi16( m512_const2_64( \
                     0x0000000700060005, 0x0004000300020001 ), v ) 
@@ -437,6 +473,7 @@ static inline __m512i mm512_neg1_fn()
 #define mm512_rol1x8_128( v ) \
    _mm512_shuffle_epi8( v, m512_const2_64( \
                     0x0e0d0c0b0a090807, 0x060504030201000f ) )
+*/

 // Rotate 128 bit lanes by c bytes.  
 #define mm512_bror_128( v, c ) \
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -18,14 +18,47 @@

 #ifndef WIN32

+// 1035g1: /sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input
+// 1035g1: /sys/class/hwmon/hwmon1/temp1_input wrong temp
+// ryzen has no /sys/devices/platform/coretemp.0
+// ryzen: /sys/class/hwmon/hwmon0
+// 2400: /sys/class/hwmon/hwmon0/temp1_input incorrect temp
+// 2400 has no /sys/class/hwmon/hwmon2/temp1_input
+// 2400 /sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input ok
+// 6700 /sys/devices/platform/coretemp.0/hwmon/hwmon2/temp1_input
+// 6700 /sys/class/hwmon/hwmon2/temp1_input
+// /sys/devices/platform/coretemp.0/hwmon/hwmon0/temp2_input never exists
+// /sys/class/hwmon/hwmon0/temp2_input doesn't exist or shows wrong temp (sys16)
+// /sys/class/hwmon/hwmon0/device/temp1_input doesn't exist
+
+
+// The first 3 paths will find i5-2400, i7-6700k, r7-1700 and i5-1035g1.
+// The others are left in for legacy systems; some should probably be removed.
+#define HWMON_PATH1 \
+   "/sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input"
+
+#define HWMON_PATH2 \
+   "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"
+
+#define HWMON_PATH3 \
+   "/sys/class/hwmon/hwmon0/temp1_input"
+
 #define HWMON_PATH \
 "/sys/class/hwmon/hwmon2/temp1_input"
+
+/*
 #define HWMON_ALT \
 "/sys/class/hwmon/hwmon0/temp1_input"
+
 #define HWMON_ALT1 \
 "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"
+*/
+
+// This shows wrong temp on i5-1035g1
 #define HWMON_ALT2 \
 "/sys/class/hwmon/hwmon1/temp1_input"
+
+// None of these work on any of the cpus above.
 #define HWMON_ALT3 \
 "/sys/devices/platform/coretemp.0/hwmon/hwmon0/temp2_input"
 #define HWMON_ALT4 \
@@ -33,16 +66,28 @@
 #define HWMON_ALT5 \
 "/sys/class/hwmon/hwmon0/device/temp1_input"

+
 static inline float linux_cputemp(int core)
 {
 	float tc = 0.0;
-	FILE *fd = fopen(HWMON_PATH, "r");
+	FILE *fd;
 	uint32_t val = 0;

-	if (!fd)
-		fd = fopen(HWMON_ALT, "r");
+   fd = fopen(HWMON_PATH1, "r");
+
+   if (!fd)
+      fd = fopen(HWMON_PATH2, "r");
+
+   if (!fd)
+      fd = fopen(HWMON_PATH3, "r");
+
+   if (!fd)
+      fd = fopen(HWMON_PATH, "r");

 	if (!fd)
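+	// HWMON_ALT and HWMON_ALT1 are disabled: they name the same files as
+	// HWMON_PATH3 and HWMON_PATH2, which were already tried above.
+	// Continue with the remaining legacy paths.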
 		fd = fopen(HWMON_ALT2, "r");

 	if (!fd)
@@ -52,14 +97,14 @@ static inline float linux_cputemp(int core)
 		fd = fopen(HWMON_ALT4, "r");

 	if (!fd)
-                fd = fopen(HWMON_ALT5, "r");
+      fd = fopen(HWMON_ALT5, "r");

 	if (!fd)
 		return tc;

-	if (fscanf(fd, "%d", &val))
+	if ( fscanf( fd, "%u", &val ) == 1 )  // EOF is non-zero: require 1 item
 		tc = val / 1000.0;
-	fclose(fd);
+	fclose( fd );
 	return tc;
 }

@@ -296,7 +341,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
 // EXTENDED_FEATURES ECX
 #define AVX512VBMI_Flag  (1<<1) 
 #define AVX512VBMI2_Flag (1<<6)
-#define AVX512VAES_Flag  (1<<9)
+#define VAES_Flag        (1<<9)


 // Use this to detect presence of feature
@@ -418,14 +463,14 @@ static inline bool has_avx512()
 #endif
 }

-static inline bool has_avx512vaes()
+static inline bool has_vaes()
 {
 #ifdef __arm__
    return false;
 #else
    int cpu_info[4] = { 0 };
    cpuid( EXTENDED_FEATURES, cpu_info );
-    return cpu_info[ ECX_Reg ] & AVX512VAES_Flag;
+    return cpu_info[ ECX_Reg ] & VAES_Flag;
 #endif
 }

--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -41,27 +41,22 @@ make -j 16
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-zen.exe

-#make clean || echo clean
-#CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $CONFIGURE_ARGS
-#make
-#strip -s cpuminer.exe
-#mv cpuminer.exe release/cpuminer-avx-sha.exe
+# mingw won't compile avx512 without -fno-asynchronous-unwind-tables
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-asynchronous-unwind-tables" ./configure $CONFIGURE_ARGS
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe release/cpuminer-avx512.exe

 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $CONFIGURE_ARGS
+# GCC 9 doesn't include AES in core-avx2
+CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure $CONFIGURE_ARGS
 make -j 16
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2.exe

-#make clean || echo clean
-#rm -f config.status
-#CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
-#make -j 
-#strip -s cpuminer.exe
-#mv cpuminer.exe release/cpuminer-aes-sha.exe
-
-
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $CONFIGURE_ARGS 
--- a/winbuild-cross.sh.bak
+++ b/winbuild-cross.sh.bak
@@ -0,0 +1,103 @@
+#!/bin/bash
+#
+# Script for building Windows binaries release package using mingw.
+# Requires a custom mingw environment, not intended for users.
+#
+# Compiles Windows EXE files for selected CPU architectures, copies them
+# as well as some DLLs that aren't available in most Windows environments
+# into a release folder ready to be zipped and uploaded.
+
+# define some local variables
+
+export LOCAL_LIB="$HOME/usr/lib"
+
+export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
+
+export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
+
+# make link to local gmp header file.
+ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
+
+# edit configure to fix pthread lib name for Windows.
+#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
+
+# make release directory and copy selected DLLs.
+mkdir release
+cp README.txt release/
+cp README.md release/
+cp RELEASE_NOTES release/
+cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
+cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
+cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
+cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
+cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
+cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
+
+make distclean || echo clean
+rm -f config.status
+./autogen.sh || echo done
+CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe release/cpuminer-zen.exe
+
+#make clean || echo clean
+#CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $CONFIGURE_ARGS
+#make
+#strip -s cpuminer.exe
+#mv cpuminer.exe release/cpuminer-avx-sha.exe
+
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $CONFIGURE_ARGS
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe release/cpuminer-avx2.exe
+
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
+#make -j 
+#strip -s cpuminer.exe
+#mv cpuminer.exe release/cpuminer-aes-sha.exe
+
+
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $CONFIGURE_ARGS 
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe release/cpuminer-avx.exe
+
+# -march=westmere is supported in gcc5
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS
+#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe release/cpuminer-aes-sse42.exe
+
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=corei7 -Wall" ./configure $CONFIGURE_ARGS
+#make 
+#strip -s cpuminer.exe
+#mv cpuminer.exe release/cpuminer-sse42.exe
+
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=core2 -Wall" ./configure $CONFIGURE_ARGS
+#make 
+#strip -s cpuminer.exe
+#mv cpuminer.exe release/cpuminer-ssse3.exe
+#make clean || echo clean
+
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS
+make -j 16
+strip -s cpuminer.exe
+mv cpuminer.exe release/cpuminer-sse2.exe
+make clean || echo clean
+