v3.10.2

v3.10.1
v3.10.0
2025-09-17 23:44:27 +00:00 · 2019-12-09 15:59:02 -05:00 · 2019-12-05 19:09:23 -05:00 · 2019-12-03 12:26:11 -05:00
106 changed files with 10098 additions and 2811 deletions
--- a/14
+++ b/14
@@ -24,18 +24,10 @@ be installed manually. There may be others, read the error messages they
 will give a clue as to the missing package.

 The following command should install everything you need on Debian based
-distributions such as Ubuntu:
+distributions such as Ubuntu. Fedora and other distributions may use different
+names for the equivalent packages.

-sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev
-
-build-essential  (Development Tools package group on Fedora)
-automake
-libjansson-dev
-libgmp-dev
-libcurl4-openssl-dev
-libssl-dev
-lib-thread
-zlib1g-dev
+sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev

 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
 openssl 1.1.0e or higher. Add one of the following, depending on the
--- a/5
+++ b/5
@@ -22,14 +22,13 @@ Step by step...

 Refer to Linux compile instructions and install required packages.

-Additionally, install mingw-64.
+Additionally, install mingw-w64.

 sudo apt-get install mingw-w64


 2. Create a local library directory for packages to be compiled in the next
-   step. Recommended location is $HOME/usr/lib/
-
+   step. Suggested location is $HOME/usr/lib/

 3. Download and build other packages for mingw that don't have a mingw64
   version available in the repositories.
--- a/Makefile.am
+++ b/Makefile.am
@@ -174,7 +174,6 @@ cpuminer_SOURCES = \
  algo/sha/sph_sha2big.c \
  algo/sha/sha256-hash-4way.c \
  algo/sha/sha512-hash-4way.c \
-  algo/sha/sha256_hash_11way.c \
  algo/sha/sha2.c \
  algo/sha/sha256t-gate.c \
  algo/sha/sha256t-4way.c \
@@ -198,7 +197,6 @@ cpuminer_SOURCES = \
  algo/skein/skein-gate.c \
  algo/skein/skein2.c \
  algo/skein/skein2-4way.c \
-  algo/skein/skein2-gate.c \
  algo/sm3/sm3.c \
  algo/sm3/sm3-hash-4way.c \
  algo/swifftx/swifftx.c \
--- a/README.md
+++ b/README.md
@@ -144,6 +144,9 @@ Supported Algorithms
 Errata
 ------

+Old algorithms that are no longer used frequently will not have the latest
+optimizations.
+
 Cryptonight and variants are no longer supported, use another miner.

 Neoscrypt crashes on Windows, use legacy version.
--- a/README.txt
+++ b/README.txt
@@ -15,20 +15,29 @@ the features listed at cpuminer startup to ensure you are mining at
 optimum speed using the best available features.

 Architecture names and compile options used are only provided for Intel
-Core series. Even the newest Pentium and Celeron CPUs are often missing
-features.
+Core series. Budget CPUs like Pentium and Celeron are often missing the
+latest features.

 AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
 supported by cpuminer-opt due to an incompatible implementation of SSE2 on
 these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.

+More information about Intel and AMD CPU architectures and their features
+can be found on Wikipedia.
+
+https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
+
+https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
+
+
 Exe name                Compile flags            Arch name

 cpuminer-sse2.exe      "-msse2"                  Core2, Nehalem   
 cpuminer-aes-sse42.exe "-march=westmere"         Westmere
-cpuminer-avx.exe       "-march=corei7-avx"       Sandy-Ivybridge
-cpuminer-avx2.exe      "-march=core-avx2"        Haswell, Sky-Kaby-Coffeelake
+cpuminer-avx.exe       "-march=corei7-avx"       Sandybridge
+cpuminer-avx2.exe      "-march=core-avx2 -maes"  Haswell, Skylake, Coffeelake
+cpuminer-avx512.exe    "-march=skylake-avx512"   Skylake-X, Cascadelake-X
 cpuminer-zen           "-march=znver1"           AMD Ryzen, Threadripper

 If you like this software feel free to donate:
--- a/39
+++ b/39
@@ -25,12 +25,47 @@ Requirements
 Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
 supported.

-64 bit Linux or Windows operating system. Apple and Android are not supported.
-FreeBSD YMMV.
+64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
+are not supported. FreeBSD YMMV.

 Change Log
 ----------

+v3.10.2
+
+AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
+Fixed c11 AVX2 invalid shares.
+
+v3.10.1
+
+AVX512 for blake2b, nist5, quark, tribus.
+
+More broken lane fixes, fixed buffer overflow in skein AVX512, fixed
+quark invalid shares AVX2.
+
+Only the highest ranking feature in a class is listed at startup, lower ranking
+features are available but no longer listed.
+
+v3.10.0
+
+AVX512 is now supported on selected algos; a Windows binary is now available.
+AVX512 optimizations are available for argon2d, blake2s, keccak, keccakc,
+skein & skein2.
+
+Fixed CPU temperature for some CPU models (Linux only).
+
+Fixed a bug that caused some lanes not to submit shares.
+
+Fixed some previously undetected buffer overflows.
+
+Lyra2rev2 3% faster SSE2 and AVX2.
+
+Added "-fno-asynchronous-unwind-tables" to AVX512 build script for Windows
+to fix known mingw issue.
+
+Changed AVX2 build script to explicitly add AES to address a change in
+behaviour in GCC 9.
+
 v3.9.11

 Added x22i & x25x algos.
--- a/algo/argon2/argon2d/argon2d/opt.c
+++ b/algo/argon2/argon2d/argon2d/opt.c
@@ -21,7 +21,7 @@

 #include "argon2.h"
 #include "core.h"
-
+#include "simd-utils.h"
 #include "../blake2/blake2.h"
 #include "../blake2/blamka-round-opt.h"

@@ -37,24 +37,28 @@

 #if defined(__AVX512F__)

-static void fill_block(__m512i *state, const block *ref_block,
-                       block *next_block, int with_xor) {
+static void fill_block( __m512i *state, const block *ref_block,
+                       block *next_block, int with_xor )
+{
    __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
    unsigned int i;

-    if (with_xor) {
-        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
-            state[i] = _mm512_xor_si512(
-                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
-            block_XY[i] = _mm512_xor_si512(
-                state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
-        }
-    } else {
-        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
-            block_XY[i] = state[i] = _mm512_xor_si512(
-                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+    if ( with_xor )
+    {
+        for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
+        {
+            state[i] = _mm512_xor_si512( state[i],
+                      _mm512_load_si512( (const __m512i*)ref_block->v + i ) );
+            block_XY[i] = _mm512_xor_si512( state[i],
+                      _mm512_load_si512( (const __m512i*)next_block->v + i ) );
        }
    }
+    else
+    {
+        for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
+            block_XY[i] = state[i] = _mm512_xor_si512( state[i],
+                      _mm512_load_si512( (const __m512i*)ref_block->v + i ) );
+    }

    BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
                    state[ 4], state[ 5], state[ 6], state[ 7] );
@@ -66,23 +70,10 @@ static void fill_block(__m512i *state, const block *ref_block,
    BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
                    state[ 9], state[11], state[13], state[15] );

-/*
-    for (i = 0; i < 2; ++i) {
-        BLAKE2_ROUND_1(
-            state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
-            state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
-    }
-
-    for (i = 0; i < 2; ++i) {
-        BLAKE2_ROUND_2(
-            state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
-            state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
-    }
-*/
-
-    for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
-        state[i] = _mm512_xor_si512(state[i], block_XY[i]);
-        _mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
+    for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
+    {
+        state[i] = _mm512_xor_si512( state[i], block_XY[i] );
+        _mm512_store_si512( (__m512i*)next_block->v + i, state[i] );
    }
 }

@@ -125,18 +116,6 @@ static void fill_block(__m256i *state, const block *ref_block,
    BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
                    state[19], state[23], state[27], state[31] );

-/*
-    for (i = 0; i < 4; ++i) {
-        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
-                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
-    }
-
-    for (i = 0; i < 4; ++i) {
-        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
-                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
-    }
-*/
-
    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
        _mm256_store_si256((__m256i *)next_block->v + i, state[i]);
@@ -153,14 +132,14 @@ static void fill_block(__m128i *state, const block *ref_block,
    if (with_xor) {
        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
            state[i] = _mm_xor_si128(
-                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+                state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
            block_XY[i] = _mm_xor_si128(
-                state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
+                state[i], _mm_load_si128((const __m128i *)next_block->v + i));
        }
    } else {
        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
            block_XY[i] = state[i] = _mm_xor_si128(
-                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+                state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
        }
    }

@@ -198,22 +177,9 @@ static void fill_block(__m128i *state, const block *ref_block,
    BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],  
                  state[39], state[47], state[55], state[63] );

-/*
-    for (i = 0; i < 8; ++i) {
-        BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
-            state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
-            state[8 * i + 6], state[8 * i + 7]);
-    }
-
-    for (i = 0; i < 8; ++i) {
-        BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
-            state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
-            state[8 * 6 + i], state[8 * 7 + i]);
-    }
-*/
    for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
        state[i] = _mm_xor_si128(state[i], block_XY[i]);
-        _mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
+        _mm_store_si128((__m128i *)next_block->v + i, state[i]);
    }
 }

--- a/algo/argon2/argon2d/blake2/blamka-round-opt.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h
@@ -427,14 +427,14 @@ static __m512i muladd(__m512i x, __m512i y)
 #define SWAP_QUARTERS(A0, A1) \
    do { \
        SWAP_HALVES(A0, A1); \
-        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
-        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
+        A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
    } while((void)0, 0)

 #define UNSWAP_QUARTERS(A0, A1) \
    do { \
-        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
-        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
+        A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
        SWAP_HALVES(A0, A1); \
    } while((void)0, 0)

--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -59,7 +59,6 @@ extern "C"{
 typedef struct {
   unsigned char buf[64<<2];
   uint32_t H[8<<2];
-   uint32_t S[4<<2];
 //   __m128i buf[16] __attribute__ ((aligned (64)));
 //   __m128i H[8];
 //   __m128i S[4];    
@@ -93,7 +92,6 @@ void blake256r8_4way_close(void *cc, void *dst);
 typedef struct {
   __m256i buf[16] __attribute__ ((aligned (64)));
   __m256i H[8];
-   __m256i S[4];
   size_t ptr;
   sph_u32 T0, T1;
   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
@@ -120,20 +118,73 @@ void blake256r8_8way_close(void *cc, void *dst);
 // Blake-512 4 way

 typedef struct {
-   __m256i buf[16] __attribute__ ((aligned (64)));
+   __m256i buf[16];
   __m256i H[8];
   __m256i S[4];   
   size_t ptr;
   sph_u64 T0, T1;
-} blake_4way_big_context;
+} blake_4way_big_context __attribute__ ((aligned (128)));

 typedef blake_4way_big_context blake512_4way_context;

-void blake512_4way_init(void *cc);
-void blake512_4way(void *cc, const void *data, size_t len);
-void blake512_4way_close(void *cc, void *dst);
-void blake512_4way_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
+void blake512_4way_init( blake_4way_big_context *sc );
+void blake512_4way_update( void *cc, const void *data, size_t len );
+#define blake512_4way blake512_4way_update
+void blake512_4way_close( void *cc, void *dst );
+void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                      void *dst );
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+//Blake-256 16 way
+
+typedef struct {
+   __m512i buf[16];
+   __m512i H[8];
+   size_t ptr;
+   uint32_t T0, T1;
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
+} blake_16way_small_context __attribute__ ((aligned (128)));
+
+// Default 14 rounds
+typedef blake_16way_small_context blake256_16way_context;
+void blake256_16way_init(void *cc);
+void blake256_16way_update(void *cc, const void *data, size_t len);
+void blake256_16way_close(void *cc, void *dst);
+
+// 14 rounds, blake, decred
+typedef blake_16way_small_context blake256r14_16way_context;
+void blake256r14_16way_init(void *cc);
+void blake256r14_16way_update(void *cc, const void *data, size_t len);
+void blake256r14_16way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_16way_small_context blake256r8_16way_context;
+void blake256r8_16way_init(void *cc);
+void blake256r8_16way_update(void *cc, const void *data, size_t len);
+void blake256r8_16way_close(void *cc, void *dst);
+
+
+// Blake-512 8 way
+
+typedef struct {
+   __m512i buf[16];
+   __m512i H[8];
+   __m512i S[4];
+   size_t ptr;
+   sph_u64 T0, T1;
+} blake_8way_big_context __attribute__ ((aligned (128)));
+
+typedef blake_8way_big_context blake512_8way_context;
+
+void blake512_8way_init( blake_8way_big_context *sc );
+void blake512_8way_update( void *cc, const void *data, size_t len );
+void blake512_8way_close( void *cc, void *dst );
+void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                      void *dst );
+
+#endif  // AVX512
+

 #endif  // AVX2

--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -304,16 +304,17 @@ static const sph_u32 CS[16] = {

 #endif

+// Blake-256 4 way

 #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
-   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                                   _mm_set1_epi32( c1 ), m0 ), b ), a ); \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), \
+                      _mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \
   d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
-   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                                   _mm_set1_epi32( c0 ), m1 ), b ), a ); \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), \
+                      _mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \
   d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
@@ -321,7 +322,8 @@ do { \

 #if SPH_COMPACT_BLAKE_32

-// Blake-256 4 way
+// Not used
+#if 0

 #define ROUND_S_4WAY(r)   do { \
 	GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
@@ -342,6 +344,8 @@ do { \
 		CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
 } while (0)

+#endif
+
 #else

 #define ROUND_S_4WAY(r)   do { \
@@ -359,7 +363,6 @@ do { \

 #define DECL_STATE32_4WAY \
 	__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
-	__m128i S0, S1, S2, S3; \
        uint32_t T0, T1;

 #define READ_STATE32_4WAY(state)   do { \
@@ -371,10 +374,6 @@ do { \
 		H5 = casti_m128i( state->H, 5 ); \
 		H6 = casti_m128i( state->H, 6 ); \
 		H7 = casti_m128i( state->H, 7 ); \
-		S0 = casti_m128i( state->S, 0 ); \
-		S1 = casti_m128i( state->S, 1 ); \
-		S2 = casti_m128i( state->S, 2 ); \
-		S3 = casti_m128i( state->S, 3 ); \
 		T0 = (state)->T0; \
 		T1 = (state)->T1; \
 	} while (0)
@@ -388,17 +387,13 @@ do { \
 		casti_m128i( state->H, 5 ) = H5; \
 		casti_m128i( state->H, 6 ) = H6; \
 		casti_m128i( state->H, 7 ) = H7; \
-		casti_m128i( state->S, 0 ) = S0; \
-		casti_m128i( state->S, 1 ) = S1; \
-		casti_m128i( state->S, 2 ) = S2; \
-		casti_m128i( state->S, 3 ) = S3; \
 		(state)->T0 = T0; \
 		(state)->T1 = T1; \
 	} while (0)

 #if SPH_COMPACT_BLAKE_32
 // not used
-
+#if 0
 #define COMPRESS32_4WAY( rounds )   do { \
 	__m128i M[16]; \
 	__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -441,6 +436,7 @@ do { \
        H7 = _mm_xor_si128( _mm_xor_si128( \
                                   _mm_xor_si128( S3, V7 ), VF ), H7 ); \
 	} while (0)
+#endif

 #else

@@ -508,10 +504,10 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = _mm_xor_si128( S0, m128_const1_64( 0x243F6A88243F6A88 ) ); \
-   V9 = _mm_xor_si128( S1, m128_const1_64( 0x85A308D385A308D3 ) ); \
-   VA = _mm_xor_si128( S2, m128_const1_64( 0x13198A2E13198A2E ) ); \
-   VB = _mm_xor_si128( S3, m128_const1_64( 0x0370734403707344 ) ); \
+   V8 = m128_const1_64( 0x243F6A88243F6A88 ); \
+   V9 = m128_const1_64( 0x85A308D385A308D3 ); \
+   VA = m128_const1_64( 0x13198A2E13198A2E ); \
+   VB = m128_const1_64( 0x0370734403707344 ); \
   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
                           m128_const1_64( 0xA4093822A4093822 ) ); \
   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
@@ -538,14 +534,14 @@ do { \
      ROUND_S_4WAY(2); \
      ROUND_S_4WAY(3); \
   } \
-   H0 = mm128_xor4( V8, V0, S0, H0 ); \
-   H1 = mm128_xor4( V9, V1, S1, H1 ); \
-   H2 = mm128_xor4( VA, V2, S2, H2 ); \
-   H3 = mm128_xor4( VB, V3, S3, H3 ); \
-   H4 = mm128_xor4( VC, V4, S0, H4 ); \
-   H5 = mm128_xor4( VD, V5, S1, H5 ); \
-   H6 = mm128_xor4( VE, V6, S2, H6 ); \
-   H7 = mm128_xor4( VF, V7, S3, H7 ); \
+   H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \
+   H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \
+   H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \
+   H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \
+   H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \
+   H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
+   H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
+   H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
 } while (0)

 #endif
@@ -556,13 +552,13 @@ do { \

 #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-                 _mm256_set1_epi32( c1 ), m0 ), b ), a ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                         _mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-                 _mm256_set1_epi32( c0 ), m1 ), b ), a ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                         _mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
@@ -581,7 +577,6 @@ do { \

 #define DECL_STATE32_8WAY \
   __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
-   __m256i S0, S1, S2, S3; \
   sph_u32 T0, T1;

 #define READ_STATE32_8WAY(state) \
@@ -594,10 +589,6 @@ do { \
   H5 = (state)->H[5]; \
   H6 = (state)->H[6]; \
   H7 = (state)->H[7]; \
-   S0 = (state)->S[0]; \
-   S1 = (state)->S[1]; \
-   S2 = (state)->S[2]; \
-   S3 = (state)->S[3]; \
   T0 = (state)->T0; \
   T1 = (state)->T1; \
 } while (0)
@@ -612,10 +603,6 @@ do { \
   (state)->H[5] = H5; \
   (state)->H[6] = H6; \
   (state)->H[7] = H7; \
-   (state)->S[0] = S0; \
-   (state)->S[1] = S1; \
-   (state)->S[2] = S2; \
-   (state)->S[3] = S3; \
   (state)->T0 = T0; \
   (state)->T1 = T1; \
 } while (0)
@@ -635,10 +622,10 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = _mm256_xor_si256( S0, m256_const1_64( 0x243F6A88243F6A88 ) ); \
-   V9 = _mm256_xor_si256( S1, m256_const1_64( 0x85A308D385A308D3 ) ); \
-   VA = _mm256_xor_si256( S2, m256_const1_64( 0x13198A2E13198A2E ) ); \
-   VB = _mm256_xor_si256( S3, m256_const1_64( 0x0370734403707344 ) ); \
+   V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
+   V9 = m256_const1_64( 0x85A308D385A308D3 ); \
+   VA = m256_const1_64( 0x13198A2E13198A2E ); \
+   VB = m256_const1_64( 0x0370734403707344 ); \
   VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
                              m256_const1_64( 0xA4093822A4093822 ) ); \
   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
@@ -682,17 +669,155 @@ do { \
      ROUND_S_8WAY(2); \
      ROUND_S_8WAY(3); \
   } \
-   H0 = mm256_xor4( V8, V0, S0, H0 ); \
-   H1 = mm256_xor4( V9, V1, S1, H1 ); \
-   H2 = mm256_xor4( VA, V2, S2, H2 ); \
-   H3 = mm256_xor4( VB, V3, S3, H3 ); \
-   H4 = mm256_xor4( VC, V4, S0, H4 ); \
-   H5 = mm256_xor4( VD, V5, S1, H5 ); \
-   H6 = mm256_xor4( VE, V6, S2, H6 ); \
-   H7 = mm256_xor4( VF, V7, S3, H7 ); \
+   H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
+   H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
+   H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
+   H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
+   H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
+   H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
+   H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
+   H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
 } while (0)


+#endif
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// Blake-256 16 way AVX512
+
+#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
+do { \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
+                         _mm512_xor_si512( _mm512_set1_epi32( c1 ), m0 ) ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
+                         _mm512_xor_si512( _mm512_set1_epi32( c0 ), m1 ) ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
+} while (0)
+
+#define ROUND_S_16WAY(r)   do { \
+        GS_16WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
+        GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
+        GS_16WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
+        GS_16WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
+        GS_16WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
+        GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
+        GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
+        GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
+} while (0)
+
+#define DECL_STATE32_16WAY \
+   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
+   sph_u32 T0, T1;
+
+#define READ_STATE32_16WAY(state) \
+do { \
+   H0 = (state)->H[0]; \
+   H1 = (state)->H[1]; \
+   H2 = (state)->H[2]; \
+   H3 = (state)->H[3]; \
+   H4 = (state)->H[4]; \
+   H5 = (state)->H[5]; \
+   H6 = (state)->H[6]; \
+   H7 = (state)->H[7]; \
+   T0 = (state)->T0; \
+   T1 = (state)->T1; \
+} while (0)
+
+#define WRITE_STATE32_16WAY(state) \
+do { \
+   (state)->H[0] = H0; \
+   (state)->H[1] = H1; \
+   (state)->H[2] = H2; \
+   (state)->H[3] = H3; \
+   (state)->H[4] = H4; \
+   (state)->H[5] = H5; \
+   (state)->H[6] = H6; \
+   (state)->H[7] = H7; \
+   (state)->T0 = T0; \
+   (state)->T1 = T1; \
+} while (0)
+
+#define COMPRESS32_16WAY( rounds ) \
+do { \
+   __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
+   __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
+   __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
+   __m512i V8, V9, VA, VB, VC, VD, VE, VF; \
+   __m512i shuf_bswap32; \
+   V0 = H0; \
+   V1 = H1; \
+   V2 = H2; \
+   V3 = H3; \
+   V4 = H4; \
+   V5 = H5; \
+   V6 = H6; \
+   V7 = H7; \
+   V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
+   V9 = m512_const1_64( 0x85A308D385A308D3 ); \
+   VA = m512_const1_64( 0x13198A2E13198A2E ); \
+   VB = m512_const1_64( 0x0370734403707344 ); \
+   VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
+                              m512_const1_64( 0xA4093822A4093822 ) ); \
+   VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
+                              m512_const1_64( 0x299F31D0299F31D0 ) ); \
+   VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
+                              m512_const1_64( 0x082EFA98082EFA98 ) ); \
+   VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
+                              m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
+   shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
+                                 0x2c2d2e2f28292a2b, 0x2425262720212223, \
+                                 0x1c1d1e1f18191a1b, 0x1415161710111213, \
+                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+   M0 = _mm512_shuffle_epi8( * buf    , shuf_bswap32 ); \
+   M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
+   M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
+   M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
+   M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
+   M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
+   M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
+   M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
+   M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
+   M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
+   MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
+   MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
+   MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
+   MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
+   ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
+   MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
+   ROUND_S_16WAY(0); \
+   ROUND_S_16WAY(1); \
+   ROUND_S_16WAY(2); \
+   ROUND_S_16WAY(3); \
+   ROUND_S_16WAY(4); \
+   ROUND_S_16WAY(5); \
+   ROUND_S_16WAY(6); \
+   ROUND_S_16WAY(7); \
+   if (rounds == 14) \
+   { \
+      ROUND_S_16WAY(8); \
+      ROUND_S_16WAY(9); \
+      ROUND_S_16WAY(0); \
+      ROUND_S_16WAY(1); \
+      ROUND_S_16WAY(2); \
+      ROUND_S_16WAY(3); \
+   } \
+   H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
+   H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
+   H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
+   H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
+   H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
+   H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
+   H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
+   H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
+} while (0)
+
 #endif

 // Blake-256 4 way
@@ -703,7 +828,6 @@ static void
 blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
                   const uint32_t *salt, int rounds )
 {
-   __m128i zero = m128_zero;
   casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
   casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
   casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
@@ -712,11 +836,6 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
   casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
   casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
   casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );
-
-   casti_m128i( ctx->S, 0 ) = zero;
-   casti_m128i( ctx->S, 1 ) = zero;
-   casti_m128i( ctx->S, 2 ) = zero;
-   casti_m128i( ctx->S, 3 ) = zero;
   ctx->T0 = ctx->T1 = 0;
   ctx->ptr = 0;
   ctx->rounds = rounds;
@@ -824,7 +943,6 @@ static void
 blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
                   const sph_u32 *salt, int rounds )
 {
-   __m256i zero = m256_zero;
   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
@@ -833,10 +951,6 @@ blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
-   casti_m256i( sc->S, 0 ) = zero;
-   casti_m256i( sc->S, 1 ) = zero;
-   casti_m256i( sc->S, 2 ) = zero;
-   casti_m256i( sc->S, 3 ) = zero;
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
   sc->rounds = rounds;
@@ -940,6 +1054,179 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

 #endif

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+//Blake-256 16 way AVX512
+
+static void   // Blake-256 16-way: load the initial H, clear the counters and the buffer offset
+blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
+                   const sph_u32 *salt, int rounds )   // iv and salt are not read in this body
+{
+   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E6676A09E667 );   // both 32-bit halves of each constant are equal
+   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE85BB67AE85 );
+   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF3723C6EF372 );
+   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53AA54FF53A );
+   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527F510E527F );
+   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C9B05688C );
+   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9AB1F83D9AB );
+   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD195BE0CD19 );
+   sc->T0 = sc->T1 = 0;   // counter pair, low word T0 and high word T1
+   sc->ptr = 0;           // nothing buffered yet
+   sc->rounds = rounds;   // later handed to COMPRESS32_16WAY by blake32_16way
+}
+
+static void   // buffer per-lane input; compress each time 64 bytes per lane are available
+blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
+{
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf;
+   size_t ptr;
+   const int buf_size = 64;   // block size in bytes per lane; ptr>>2 is the __m512i index into buf
+   DECL_STATE32_16WAY
+   buf = sc->buf;
+   ptr = sc->ptr;
+   if ( len < buf_size - ptr )   // input does not complete a block: buffer it and return
+   {
+        memcpy_512( buf + (ptr>>2), vdata, len>>2 );   // one __m512i per 4 bytes of per-lane data
+        ptr += len;
+        sc->ptr = ptr;
+        return;
+   }
+   READ_STATE32_16WAY(sc);   // NOTE(review): presumably loads H and T0/T1 into the locals declared above - confirm
+   while ( len > 0 )
+   {
+      size_t clen;
+      // clen = bytes to take this pass: the room left in the block, capped at len
+      clen = buf_size - ptr;
+      if (clen > len)
+           clen = len;
+      memcpy_512( buf + (ptr>>2), vdata, clen>>2 );
+      ptr += clen;
+      vdata += (clen>>2);
+      len -= clen;
+      if ( ptr == buf_size )   // block complete
+      {
+          if ( ( T0 = T0 + 512 ) < 512 )   // count 512 bits; a wrapped sum carries into T1
+                T1 = T1 + 1;
+          COMPRESS32_16WAY( sc->rounds );
+          ptr = 0;
+      }
+   }
+   WRITE_STATE32_16WAY(sc);
+   sc->ptr = ptr;   // bytes left buffered for the next call
+}
+
+static void   // pad the buffered tail, run the last compression(s), write byte-swapped H to dst
+blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
+                    void *dst, size_t out_size_w32 )   // ub and n are not read in this body
+{
+   __m512i buf[16];   // local padding block; only slots from ptr>>2 upward are fed in first
+   size_t ptr;
+   unsigned bit_len;
+   sph_u32 th, tl;
+
+   ptr = sc->ptr;
+   bit_len = ((unsigned)ptr << 3);   // buffered bytes -> bits
+   buf[ptr>>2] = m512_const1_64( 0x0000008000000080ULL );   // 0x80 in the low byte of every 32-bit element
+   tl = sc->T0 + bit_len;   // tl/th: total length written into the last two words of the block
+   th = sc->T1;
+   // Pre-bias T0/T1: blake32_16way adds 512 to T0 when it compresses the padding block.
+   if ( ptr == 0 )
+   {
+        sc->T0 = 0xFFFFFE00UL;   // 0xFFFFFE00 + 512 wraps to 0 and carries into T1
+        sc->T1 = 0xFFFFFFFFUL;
+   }
+   else if ( sc->T0 == 0 )
+   {
+        sc->T0 = 0xFFFFFE00UL + bit_len;   // borrow from T1 so the later +512 lands on bit_len
+        sc->T1 = sc->T1 - 1;
+   }
+   else
+        sc->T0 -= 512 - bit_len;   // after the later +512 this equals tl
+
+   if ( ptr <= 52 )   // marker, flag word and length all fit in the current block
+   {
+       memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );   // zero the slots after the marker up to slot 13
+       if ( out_size_w32 == 8 )
+           buf[52>>2] = _mm512_or_si512( buf[52>>2],
+                                m512_const1_64( 0x0100000001000000ULL ) );   // OR keeps the 0x80 marker when ptr == 52
+       buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
+       buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
+       blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
+   }
+   else   // no room for the length words: finish this block, then send one more
+   {
+        memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+        blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
+        sc->T0 = 0xFFFFFE00UL;   // same bias as the ptr == 0 case for the extra block
+        sc->T1 = 0xFFFFFFFFUL;
+        memset_zero_512( buf, 56>>2 );
+       if ( out_size_w32 == 8 )
+           buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
+        buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
+        buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
+        blake32_16way( sc, buf, 64 );
+   }
+   mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );   // NOTE(review): presumably 8 vectors of H, byte-swapped per 32-bit element - confirm
+}
+
+void   // public entry: 14-round Blake-256, 16 lanes
+blake256_16way_init(void *cc)
+{
+   blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );   // the iv and salt arguments are ignored by the callee
+}
+
+void   // absorb len bytes per lane
+blake256_16way_update(void *cc, const void *data, size_t len)
+{
+        blake32_16way(cc, data, len);
+}
+// NOTE(review): the r14/r8 variants name this step *_close; confirm "_close_update" matches the header.
+void   // finalize and write the digests to dst
+blake256_16way_close_update(void *cc, void *dst)
+{
+        blake32_16way_close(cc, 0, 0, dst, 8);   // out_size_w32 = 8
+}
+
+void blake256r14_16way_init(void *cc)   // explicit 14-round variant; same arguments as blake256_16way_init
+{
+   blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
+}
+
+void   // absorb len bytes per lane
+blake256r14_16way_update(void *cc, const void *data, size_t len)
+{
+   blake32_16way(cc, data, len);
+}
+
+void   // finalize and write the digests to dst
+blake256r14_16way_close(void *cc, void *dst)
+{
+   blake32_16way_close(cc, 0, 0, dst, 8);   // out_size_w32 = 8
+}
+
+void blake256r8_16way_init(void *cc)   // reduced-round variant: 8 rounds instead of 14
+{
+   blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
+}
+
+void   // absorb len bytes per lane
+blake256r8_16way_update(void *cc, const void *data, size_t len)
+{
+   blake32_16way(cc, data, len);
+}
+
+void   // finalize and write the digests to dst
+blake256r8_16way_close(void *cc, void *dst)
+{
+   blake32_16way_close(cc, 0, 0, dst, 8);   // out_size_w32 = 8
+}
+
+#endif // AVX512
+
+
+
 // Blake-256 4 way

 // default 14 rounds, backward copatibility
--- a/algo/blake/blake2b-4way.c
+++ b/algo/blake/blake2b-4way.c
@@ -4,13 +4,59 @@
 */

 #include "blake2b-gate.h"
-
-#if defined(BLAKE2B_4WAY)
-
 #include <string.h>
 #include <stdint.h>
 #include "blake2b-hash-4way.h"

+#if defined(BLAKE2B_8WAY)
+
+int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,   // scan nonces 8 at a time with the 8-way Blake2b
+                           uint64_t *hashes_done, struct thr_info *mythr )   // returns 0; hits go to submit_lane_solution
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));    // 4 x __m512i written by blake2b_8way_final
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));   // 80-byte header, 8 lanes interleaved by 64 bits
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[49]);   // 3*16+1: high half of 64-bit word 3, lane 0; lanes are 2 words apart
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+
+   uint32_t n = first_nonce;
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+   do {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(   // nonces n..n+7 go into the odd 32-bit slots of vector 9
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+
+      blake2b_8way_init( &ctx );
+      blake2b_8way_update( &ctx, vdata, 80 );
+      blake2b_8way_final( &ctx, hash );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( hash7[ lane<<1 ] <= Htarg )   // cheap pre-filter; <= as in the blake2s scanners so equality still reaches fulltest
+      {
+          extr_lane_8x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+      }
+      n += 8;
+   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#elif defined(BLAKE2B_4WAY)
+
 // Function not used, code inlined.
 void blake2b_4way_hash(void *output, const void *input)
 {
--- a/algo/blake/blake2b-gate.c
+++ b/algo/blake/blake2b-gate.c
@@ -1,15 +1,19 @@
 #include "blake2b-gate.h"

+
 bool register_blake2b_algo( algo_gate_t* gate )
 {
-#if defined(BLAKE2B_4WAY)
+#if defined(BLAKE2B_8WAY)
+  gate->scanhash  = (void*)&scanhash_blake2b_8way;
+//  gate->hash      = (void*)&blake2b_8way_hash;
+#elif defined(BLAKE2B_4WAY)
  gate->scanhash  = (void*)&scanhash_blake2b_4way;
  gate->hash      = (void*)&blake2b_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_blake2b;
  gate->hash      = (void*)&blake2b_hash;
 #endif
-  gate->optimizations =  AVX2_OPT;
+  gate->optimizations =  AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/blake/blake2b-gate.h
+++ b/algo/blake/blake2b-gate.h
@@ -4,13 +4,21 @@
 #include <stdint.h>
 #include "algo-gate-api.h"

-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define BLAKE2B_8WAY
+#elif defined(__AVX2__)
  #define BLAKE2B_4WAY
 #endif

 bool register_blake2b_algo( algo_gate_t* gate );

-#if defined(BLAKE2B_4WAY)
+#if defined(BLAKE2B_8WAY)
+
+//void blake2b_8way_hash( void *state, const void *input );
+int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(BLAKE2B_4WAY)

 void blake2b_4way_hash( void *state, const void *input );
 int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
--- a/algo/blake/blake2b-hash-4way.c
+++ b/algo/blake/blake2b-hash-4way.c
@@ -33,6 +33,178 @@

 #include "blake2b-hash-4way.h"

+static const uint8_t sigma[12][16] =   // message word order per round; file scope so both compress loops index it
+{
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+      { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+      { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+      { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+      { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+      { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+      { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+      { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+      { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },   // same as row 0
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }    // same as row 1
+};
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define B2B8W_G(a, b, c, d, x, y) /* G mixing step on v[a], v[b], v[c], v[d] with message vectors x and y */ \
+{ \
+   v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), x ); \
+   v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 32 ); \
+   v[c] = _mm512_add_epi64( v[c], v[d] ); \
+   v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 24 ); \
+   v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), y ); \
+   v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 16 ); \
+   v[c] = _mm512_add_epi64( v[c], v[d] ); \
+   v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); /* expects an array named v in the expanding scope */ \
+}
+
+static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )   // compress ctx->b into ctx->h; last != 0 marks the final block
+{  
+   __m512i v[16], m[16];
+   // v[0..7]: current chaining value
+   v[ 0] = ctx->h[0];
+   v[ 1] = ctx->h[1];
+   v[ 2] = ctx->h[2];
+   v[ 3] = ctx->h[3];
+   v[ 4] = ctx->h[4];
+   v[ 5] = ctx->h[5];
+   v[ 6] = ctx->h[6];
+   v[ 7] = ctx->h[7];
+   v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );   // v[8..15]: the same constants blake2b_8way_init loads into h[]
+   v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
+   v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
+   v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
+   v[12] = m512_const1_64( 0x510E527FADE682D1 );
+   v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
+   v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
+   v[15] = m512_const1_64( 0x5BE0CD19137E2179 );
+   // fold the byte counter into v[12] (low word) and v[13] (high word)
+   v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
+   v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
+
+   if ( last )
+      v[14] = mm512_not( v[14] );   // final block: v[14] is inverted
+   // local copy of the 16 message vectors
+   m[ 0] = ctx->b[ 0];
+   m[ 1] = ctx->b[ 1];
+   m[ 2] = ctx->b[ 2];
+   m[ 3] = ctx->b[ 3];
+   m[ 4] = ctx->b[ 4];
+   m[ 5] = ctx->b[ 5];
+   m[ 6] = ctx->b[ 6];
+   m[ 7] = ctx->b[ 7];
+   m[ 8] = ctx->b[ 8];
+   m[ 9] = ctx->b[ 9];
+   m[10] = ctx->b[10];
+   m[11] = ctx->b[11];
+   m[12] = ctx->b[12];
+   m[13] = ctx->b[13];
+   m[14] = ctx->b[14];
+   m[15] = ctx->b[15];
+   // 12 rounds; each round runs G over 4 columns, then 4 diagonals, of v as a 4x4 matrix
+   for ( int i = 0; i < 12; i++ )
+   {
+      B2B8W_G( 0, 4,  8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
+      B2B8W_G( 1, 5,  9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
+      B2B8W_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
+      B2B8W_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
+      B2B8W_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
+      B2B8W_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
+      B2B8W_G( 2, 7,  8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
+      B2B8W_G( 3, 4,  9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
+   }
+   // feed-forward: h[i] ^= v[i] ^ v[i+8]
+   ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
+   ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
+   ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
+   ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
+   ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
+   ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
+   ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
+   ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
+}
+
+int blake2b_8way_init( blake2b_8way_ctx *ctx )   // reset ctx for an unkeyed hash with outlen 32; always returns 0
+{
+   size_t i;
+
+   ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );
+   ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
+   ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
+   ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
+   ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
+   ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
+   ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
+   ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
+   // low byte 0x20 equals outlen (32); NOTE(review): the 0x0101 bytes are presumably fanout/depth = 1 - confirm
+   ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );
+
+   ctx->t[0] = 0;
+   ctx->t[1] = 0;
+   ctx->c = 0;
+   ctx->outlen = 32;
+   // clear the whole input buffer
+   for ( i = 0; i < 16; i++ )
+     ctx->b[i] = m512_zero;
+
+   return 0;
+}
+
+
+void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,   // absorb inlen bytes per lane (8 interleaved lanes)
+                          size_t inlen )   // inlen is consumed in whole 8-byte words; a remainder is ignored
+{
+   const __m512i *in = (const __m512i*)input;
+   // The buffer slot is derived from ctx->c on every pass so it restarts at 0 after a
+   // compress; a cached index kept counting and wrote past b[15] once input exceeded 128 bytes.
+   const size_t nvec = inlen >> 3;   // one __m512i per 8 bytes of per-lane input
+
+   for ( size_t i = 0; i < nvec; i++ )
+   {
+      if ( ctx->c == 128 )   // buffer full and more input follows: flush as a non-final block
+      {
+         ctx->t[0] += ctx->c;
+         if ( ctx->t[0] < ctx->c )   // low counter word wrapped: carry
+            ctx->t[1]++;
+         blake2b_8way_compress( ctx, 0 );
+         ctx->c = 0;
+      }
+      ctx->b[ ctx->c >> 3 ] = in[i];
+      ctx->c += 8;
+   }
+}
+
+void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )   // pad, compress as final block, store 4 vectors to out
+{
+   size_t c;
+   c = ctx->c >> 3;   // first unused vector slot in b[]
+
+   ctx->t[0] += ctx->c;   // count the buffered bytes
+   if ( ctx->t[0] < ctx->c )
+      ctx->t[1]++;
+   // zero-fill the rest of the block
+   while ( ctx->c < 128 )
+   {
+      ctx->b[c++] = m512_zero;
+      ctx->c += 8;
+   }
+
+   blake2b_8way_compress( ctx, 1 );           // final block flag = 1
+   // only h[0..3] are written: 4 x 64 bits per lane, still interleaved
+   casti_m512i( out, 0 ) = ctx->h[0];
+   casti_m512i( out, 1 ) = ctx->h[1];
+   casti_m512i( out, 2 ) = ctx->h[2];
+   casti_m512i( out, 3 ) = ctx->h[3];
+}
+
+#endif
+
 #if defined(__AVX2__)

 // G Mixing function.
@@ -61,21 +233,6 @@ static const uint64_t blake2b_iv[8] = {

 static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
 {
-	const uint8_t sigma[12][16] = {
-		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-		{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-		{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-		{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-		{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-		{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-		{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-		{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-		{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-		{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-		{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
-	};
-	int i;
 	__m256i v[16], m[16];

   v[ 0] = ctx->h[0];
@@ -118,7 +275,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
   m[14] = ctx->b[14];
   m[15] = ctx->b[15];
   
-	for ( i = 0; i < 12; i++ )
+	for ( int i = 0; i < 12; i++ )
   { 
 		B2B_G( 0, 4,  8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
 		B2B_G( 1, 5,  9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
--- a/algo/blake/blake2b-hash-4way.h
+++ b/algo/blake/blake2b-hash-4way.h
@@ -2,8 +2,6 @@
 #ifndef __BLAKE2B_HASH_4WAY_H__
 #define __BLAKE2B_HASH_4WAY_H__

-#if defined(__AVX2__)
-
 #include "simd-utils.h"
 #include <stddef.h>
 #include <stdint.h>
@@ -16,14 +14,34 @@
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+ALIGN(128) typedef struct {
+   __m512i b[16]; // input buffer: one block, 16 vectors of 64-bit words
+   __m512i h[8];  // chained state
+   uint64_t t[2];  // byte counter, t[0] low word, t[1] carry
+   size_t c;       // bytes per lane currently in b[], advanced by 8 per vector, at most 128
+   size_t outlen;  // digest size in bytes, set to 32 by blake2b_8way_init
+} blake2b_8way_ctx;
+
+int blake2b_8way_init( blake2b_8way_ctx *ctx );
+void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+                          size_t inlen );
+void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
+
+#endif
+
+#if defined(__AVX2__)
+
 // state context
-ALIGN(64) typedef struct {
+ALIGN(128) typedef struct {
 	__m256i b[16]; // input buffer
 	__m256i h[8];  // chained state
 	uint64_t t[2];  // total number of bytes
 	size_t c;       // pointer for b[]
 	size_t outlen;  // digest size
-} blake2b_4way_ctx __attribute__((aligned(64)));
+} blake2b_4way_ctx;

 int blake2b_4way_init( blake2b_4way_ctx *ctx );
 void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -3,22 +3,72 @@
 #include <string.h>
 #include <stdint.h>

-#if defined(BLAKE2S_8WAY)
+#if defined(BLAKE2S_16WAY)
+
+static __thread blake2s_16way_state blake2s_16w_ctx;   // per-thread state after the first 64 bytes, filled by scanhash_blake2s_16way
+
+void blake2s_16way_hash( void *output, const void *input )   // hash the last 16 bytes per lane on top of the saved state
+{
+   blake2s_16way_state ctx;
+   memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );   // work on a copy so the saved state can be reused
+   blake2s_16way_update( &ctx, input + (64<<4), 16 );   // skip 64 bytes x 16 lanes of interleaved input
+   blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );   // output stays lane-interleaved
+}
+
+int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,   // scan nonces 16 at a time; returns 0
+                            uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[20*16] __attribute__ ((aligned (128)));   // 20 words x 16 lanes, 32-bit interleaved
+   uint32_t hash[8*16] __attribute__ ((aligned (64)));      // 8 words x 16 lanes, as stored by blake2s_16way_final
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<4]);   // word 7 of every lane: hash7[lane]
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
+   uint32_t n = first_nonce;
+   int thr_id = mythr->id;  
+   // interleave the header, then hash its first 64 bytes once into the per-thread state
+   mm512_bswap32_intrlv80_16x32( vdata, pdata );
+   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
+   blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
+
+   do {
+      *noncev = mm512_bswap_32( _mm512_set_epi32(
+	                  n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+	                  n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );   // lane i gets nonce n+i
+      pdata[19] = n;
+
+      blake2s_16way_hash( hash, vdata );
+
+      for ( int lane = 0; lane < 16; lane++ )
+      if ( unlikely( hash7[lane] <= Htarg ) )   // cheap pre-filter on the top word before the full test
+      {
+         extr_lane_16x32( lane_hash, hash, lane, 256 );
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 16;
+   } while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#elif defined(BLAKE2S_8WAY)

 static __thread blake2s_8way_state blake2s_8w_ctx;

 void blake2s_8way_hash( void *output, const void *input )
 {
-   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
   blake2s_8way_state ctx;
   memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
-
   blake2s_8way_update( &ctx, input + (64<<3), 16 );
-   blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
-
-   dintrlv_8x32( output,     output+ 32, output+ 64, output+ 96,
-                 output+128, output+160, output+192, output+224,
-                 vhash, 256 );
+   blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
 }

 int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
@@ -26,13 +76,15 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id; 

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
@@ -45,16 +97,17 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,

      blake2s_8way_hash( hash, vdata );

-
-      for ( int i = 0; i < 8; i++ )
-      if (  (hash+(i<<3))[7] <= Htarg )
-      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( hash7[lane] <= Htarg ) )
      {
-          pdata[19] = n+i;
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+         }
      }
      n += 8;
-
   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
@@ -67,15 +120,10 @@ static __thread blake2s_4way_state blake2s_4w_ctx;

 void blake2s_4way_hash( void *output, const void *input )
 {
-   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
   blake2s_4way_state ctx;
   memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
-
   blake2s_4way_update( &ctx, input + (64<<2), 16 );
-   blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
-
-   dintrlv_4x32( output, output+32, output+64, output+96,
-		            vhash, 256 );
+   blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
 }

 int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
@@ -83,13 +131,15 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id; 

   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
@@ -101,15 +151,16 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,

      blake2s_4way_hash( hash, vdata );

-      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg )
-      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
      {
-          pdata[19] = n+i;
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+              }
      }
      n += 4;
-
   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -2,7 +2,11 @@

 bool register_blake2s_algo( algo_gate_t* gate )
 {
-#if defined(BLAKE2S_8WAY)
+#if defined(BLAKE2S_16WAY)
+  gate->scanhash  = (void*)&scanhash_blake2s_16way;
+  gate->hash      = (void*)&blake2s_16way_hash;
+#elif defined(BLAKE2S_8WAY)
+//#if defined(BLAKE2S_8WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_8way;
  gate->hash      = (void*)&blake2s_8way_hash;
 #elif defined(BLAKE2S_4WAY)
@@ -12,7 +16,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_blake2s;
  gate->hash      = (void*)&blake2s_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -8,13 +8,26 @@
 #if defined(__SSE2__)
  #define BLAKE2S_4WAY
 #endif
+
 #if defined(__AVX2__)
  #define BLAKE2S_8WAY
 #endif

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define BLAKE2S_16WAY
+#endif
+
 bool register_blake2s_algo( algo_gate_t* gate );

-#if defined(BLAKE2S_8WAY)
+#if defined(BLAKE2S_16WAY)
+
+void blake2s_16way_hash( void *state, const void *input );
+int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined (BLAKE2S_8WAY)
+
+//#if defined(BLAKE2S_8WAY)

 void blake2s_8way_hash( void *state, const void *input );
 int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -165,13 +165,13 @@ do { \
 // 
 // Supported:
 //    64 + 16 bytes  (blake2s with midstate optimization)
-//    80 bytes without midstate (blake2s without midstate optimization)
+//    80 bytes       (blake2s without midstate optimization)
 //    Any multiple of 64 bytes in one shot (x25x)
 //
 // Unsupported:
-//    Stream of 64 byte blocks one at a time.   
-//
-// use for part blocks or when streaming more data
+//    Stream of full 64 byte blocks one at a time.   
+
+// use only when streaming more data or final block not full.
 int blake2s_4way_update( blake2s_4way_state *S, const void *in,
                         uint64_t inlen )
 {
@@ -466,6 +466,168 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )

 #endif // __AVX2__

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// Blake2s-256 16 way
+
+int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )   // one 10-round compression of block into S->h; returns 0
+{
+   __m512i m[16];
+   __m512i v[16];
+   // m = the 16 message vectors, v[0..7] = the chaining value
+   memcpy_512( m, block, 16 );
+   memcpy_512( v, S->h, 8 );
+   // v[8..11] = constants; v[12..15] = constants XOR counter t[] and flags f[]
+   v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
+   v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
+   v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
+   v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
+   v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
+                          m512_const1_64( 0x510E527F510E527FULL ) );
+
+   v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
+                          m512_const1_64( 0x9B05688C9B05688CULL ) );
+
+   v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
+                          m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );
+
+   v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
+                          m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );
+
+   // G16W: mixing step on a, b, c, d with m[sigma0] and m[sigma1]; 32-bit adds, rotates 16/12/8/7
+#define G16W( sigma0, sigma1, a, b, c, d) \
+do { \
+   uint8_t s0 = sigma0; \
+   uint8_t s1 = sigma1; \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s1 ] ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ),  8 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ),  7 ); \
+} while(0)
+   // ROUND16W: one round = G16W over 4 columns then 4 diagonals, word order from blake2s_sigma[r]
+#define ROUND16W(r)  \
+do { \
+   uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
+   G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
+   G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
+   G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
+   G16W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
+   G16W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
+   G16W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
+   G16W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
+   G16W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
+} while(0)
+
+   ROUND16W( 0 );
+   ROUND16W( 1 );
+   ROUND16W( 2 );
+   ROUND16W( 3 );
+   ROUND16W( 4 );
+   ROUND16W( 5 );
+   ROUND16W( 6 );
+   ROUND16W( 7 );
+   ROUND16W( 8 );
+   ROUND16W( 9 );
+   // feed-forward: h[i] ^= v[i] ^ v[i+8]
+   for( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
+   // the helper macros are local to this function
+#undef G16W
+#undef ROUND16W
+   return 0;
+}
+
+int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )   // zero S and set h = constants XOR parameter block; returns 0
+{
+   blake2s_nway_param P[1];
+   // unkeyed, sequential-mode parameters (fanout = depth = 1); all other fields zero
+   P->digest_length = outlen;
+   P->key_length    = 0;
+   P->fanout        = 1;
+   P->depth         = 1;
+   P->leaf_length   = 0;
+   *((uint64_t*)(P->node_offset)) = 0;   // 8-byte store via cast; NOTE(review): confirm it stays inside P and is aligned
+   P->node_depth    = 0;
+   P->inner_length  = 0;
+   memset( P->salt,     0, sizeof( P->salt ) );
+   memset( P->personal, 0, sizeof( P->personal ) );
+
+   memset( S, 0, sizeof( blake2s_16way_state ) );   // clears buf, t, f, buflen and last_node as well
+   S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
+   S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
+   S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
+   S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
+   S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
+   S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
+   S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
+   S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );
+
+   uint32_t *p = ( uint32_t * )( P );   // read the parameter block as eight 32-bit words
+
+   /* IV XOR ParamBlock */
+   for ( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) );
+   return 0;
+}
+
+int blake2s_16way_update( blake2s_16way_state *S, const void *in,   // absorb inlen bytes per lane (16 interleaved lanes); returns 0
+                         uint64_t inlen )   // lengths are handled in 4-byte units (>>2 = vector count)
+{
+  __m512i *input = (__m512i*)in;
+  __m512i *buf = (__m512i*)S->buf;
+  const int bsize = BLAKE2S_BLOCKBYTES;
+  // A block that becomes exactly full is compressed at once as a non-final block.
+   while( inlen > 0 )
+   {
+      size_t left = S->buflen;   // bytes per lane already buffered
+      if( inlen >= bsize - left )   // enough input to complete the block
+      {
+         memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 );
+         S->buflen += bsize - left;
+         S->t[0] += BLAKE2S_BLOCKBYTES;
+         S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );   // carry when t[0] wrapped
+         blake2s_16way_compress( S, buf );
+         S->buflen = 0;
+         input += ( bsize - left ) >> 2;   // advance by what was consumed, not a whole block
+         inlen -= bsize - left;            // a whole-block step underflowed inlen when left > 0
+      }
+      else   // partial block: buffer the rest and stop
+      {
+          memcpy_512( buf + ( left>>2 ), input, inlen>>2 );
+          S->buflen += (size_t) inlen;
+          input += ( inlen>>2 );
+          inlen -= inlen;
+      }
+   }
+   return 0;
+}
+
+int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen )   // pad, final compress, store h[0..7]; outlen is not read
+{
+   __m512i *buf = (__m512i*)S->buf;
+
+   S->t[0] += S->buflen;   // count the buffered bytes
+   S->t[1] += ( S->t[0] < S->buflen );
+   if ( S->last_node )
+      S->f[1] = ~0U;
+   S->f[0] = ~0U;   // final-block flag, XORed into v[14] by the compress function
+   // zero-fill the unused tail of the block
+   memset_zero_512( buf + ( S->buflen>>2 ),
+                    ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
+   blake2s_16way_compress( S, buf );
+   // 8 vectors out: the digests stay lane-interleaved
+   for ( int i = 0; i < 8; ++i )
+      casti_m512i( out, i ) = S->h[ i ];
+   return 0;
+}
+
+#endif   // AVX512
+
+
 #if 0
 int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
 {
--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -64,7 +64,7 @@ typedef struct __blake2s_nway_param
 ALIGN( 64 ) typedef struct __blake2s_4way_state
 {
   __m128i h[8];
-   uint8_t  buf[ 2 * BLAKE2S_BLOCKBYTES * 4 ];
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 4 ];
   uint32_t t[2];
   uint32_t f[2];
   size_t   buflen;
@@ -75,13 +75,16 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
 int blake2s_4way_update( blake2s_4way_state *S, const void *in,
                         uint64_t inlen );
 int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
+int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
+                              const void *input, uint64_t inlen );
+

 #if defined(__AVX2__)

 ALIGN( 64 ) typedef struct __blake2s_8way_state
 {
   __m256i h[8];
-   uint8_t  buf[ 2 * BLAKE2S_BLOCKBYTES * 8 ];
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 8 ];
   uint32_t t[2];
   uint32_t f[2];
   size_t   buflen;
@@ -92,9 +95,27 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
 int blake2s_8way_update( blake2s_8way_state *S, const void *in,
                         uint64_t inlen );
 int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
-int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
-                              const void *input, uint64_t inlen );
+//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
+//                              const void *input, uint64_t inlen );

+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+ALIGN( 128 ) typedef struct __blake2s_16way_state   // state for the 16-way (AVX-512) blake2s functions declared below
+{
+   __m512i h[8];                                // eight 512-bit state vectors
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 16 ];     // one block for each of 16 lanes; the .c file accesses it as __m512i
+   uint32_t t[2];                               // counter words; presumably shared by all lanes -- TODO confirm
+   uint32_t f[2];                               // flag words; presumably the blake2s finalization flags -- TODO confirm
+   size_t   buflen;                             // presumably the number of bytes buffered per lane -- TODO confirm
+   uint8_t  last_node;                          // presumably the blake2s last-node indicator -- TODO confirm
+} blake2s_16way_state ;
+
+int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen );
+int blake2s_16way_update( blake2s_16way_state *S, const void *in,
+                         uint64_t inlen );
+int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );

 #endif

--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -42,21 +42,13 @@
 extern "C"{
 #endif

-#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
-#define SPH_SMALL_FOOTPRINT_BLAKE   1
-#endif
-
-#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
-#define SPH_COMPACT_BLAKE_64   1
-#endif
-
 #ifdef _MSC_VER
 #pragma warning (disable: 4146)
 #endif

-
-// Blake-512
-
+// Blake-512 common
+   
+/*
 static const sph_u64 IV512[8] = {
 	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
 	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
@@ -64,10 +56,7 @@ static const sph_u64 IV512[8] = {
 	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
 };

-
-#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
-
-// Blake-256 4 & 8 way, Blake-512 4 way
+static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };

 static const unsigned sigma[16][16] = {
 	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
@@ -88,7 +77,17 @@ static const unsigned sigma[16][16] = {
 	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
 };

-#endif
+static const sph_u64 CB[16] = {
+   SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
+   SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
+   SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
+   SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
+   SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
+   SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
+   SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
+   SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
+
+*/

 #define Z00   0
 #define Z01   1
@@ -264,8 +263,6 @@ static const unsigned sigma[16][16] = {
 #define Mx_(n)      Mx__(n)
 #define Mx__(n)     M ## n

-// Blake-512 4 way
-
 #define CBx(r, i)   CBx_(Z ## r ## i)
 #define CBx_(n)     CBx__(n)
 #define CBx__(n)    CB ## n
@@ -287,21 +284,288 @@ static const unsigned sigma[16][16] = {
 #define CBE   SPH_C64(0x0801F2E2858EFC16)
 #define CBF   SPH_C64(0x636920D871574E69)

-#if SPH_COMPACT_BLAKE_64
-// not used
-static const sph_u64 CB[16] = {
-	SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
-	SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
-	SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
-	SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
-	SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
-	SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
-	SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
-	SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
-};
+#define READ_STATE64(state)   do { /* Load H[0..7], S[0..3], T0 and T1 from *state into the caller's same-named locals. */ \
+      H0 = (state)->H[0]; \
+      H1 = (state)->H[1]; \
+      H2 = (state)->H[2]; \
+      H3 = (state)->H[3]; \
+      H4 = (state)->H[4]; \
+      H5 = (state)->H[5]; \
+      H6 = (state)->H[6]; \
+      H7 = (state)->H[7]; \
+      S0 = (state)->S[0]; \
+      S1 = (state)->S[1]; \
+      S2 = (state)->S[2]; \
+      S3 = (state)->S[3]; \
+      T0 = (state)->T0; \
+      T1 = (state)->T1; \
+   } while (0)

-#endif
+#define WRITE_STATE64(state)   do { /* Store the locals H0..H7, S0..S3, T0 and T1 back into *state (inverse of READ_STATE64). */ \
+      (state)->H[0] = H0; \
+      (state)->H[1] = H1; \
+      (state)->H[2] = H2; \
+      (state)->H[3] = H3; \
+      (state)->H[4] = H4; \
+      (state)->H[5] = H5; \
+      (state)->H[6] = H6; \
+      (state)->H[7] = H7; \
+      (state)->S[0] = S0; \
+      (state)->S[1] = S1; \
+      (state)->S[2] = S2; \
+      (state)->S[3] = S3; \
+      (state)->T0 = T0; \
+      (state)->T1 = T1; \
+   } while (0)

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// Blake-512 8 way AVX512
+
+#define GB_8WAY(m0, m1, c0, c1, a, b, c, d)   do { /* Mixes a,b,c,d in place: two add/xor/rotate half-steps (ror 32,25 then 16,11). */ \
+   a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
+                 _mm512_set1_epi64( c1 ), m0 ), b ), a ); /* first half: a += b + (m0 ^ c1) */ \
+   d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
+   c = _mm512_add_epi64( c, d ); \
+   b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
+   a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
+                 _mm512_set1_epi64( c0 ), m1 ), b ), a ); /* second half: a += b + (m1 ^ c0) */ \
+   d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
+   c = _mm512_add_epi64( c, d ); \
+   b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
+} while (0)
+
+#define ROUND_B_8WAY(r)   do { /* One round: GB_8WAY on the four columns of V0..VF, then on the four diagonals; Mx/CBx pick M and CB by round r. */ \
+   GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
+   GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
+   GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
+   GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
+   GB_8WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
+   GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
+   GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
+   GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
+   } while (0)
+
+#define DECL_STATE64_8WAY /* Declares the locals that READ_STATE64, WRITE_STATE64 and COMPRESS64_8WAY operate on. */ \
+   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
+        __m512i S0, S1, S2, S3; \
+   sph_u64 T0, T1;
+
+#define COMPRESS64_8WAY   do /* Compresses the 16 vectors at buf into H0..H7; expects buf and the DECL_STATE64_8WAY locals in scope. */ \
+{ \
+  __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
+  __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
+  __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
+  __m512i V8, V9, VA, VB, VC, VD, VE, VF; \
+  __m512i shuf_bswap64; \
+  V0 = H0; /* V0..V7 start as the chaining value H */ \
+  V1 = H1; \
+  V2 = H2; \
+  V3 = H3; \
+  V4 = H4; \
+  V5 = H5; \
+  V6 = H6; \
+  V7 = H7; \
+  V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); /* V8..VB = S ^ CB0..CB3 */ \
+  V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) );  \
+  VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) );  \
+  VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) );  \
+  VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), /* VC,VD = T0 ^ CB4,CB5 and VE,VF = T1 ^ CB6,CB7 */ \
+                         m512_const1_64( CB4 ) );  \
+  VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
+                         m512_const1_64( CB5 ) );  \
+  VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
+                         m512_const1_64( CB6 ) );  \
+  VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
+                         m512_const1_64( CB7 ) );  \
+  shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, /* byte-shuffle control: reverses the bytes of each 64-bit element */ \
+                                0x28292a2b2c2d2e2f, 0x2021222324252627, \
+                                0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+  M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); /* M0..MF = the 16 input vectors, byte-swapped per 64-bit element */ \
+  M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
+  M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
+  M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
+  M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
+  M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
+  M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
+  M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
+  M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
+  M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
+  MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
+  MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
+  MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
+  MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
+  ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
+  MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
+  ROUND_B_8WAY(0); /* 16 rounds in total */ \
+  ROUND_B_8WAY(1); \
+  ROUND_B_8WAY(2); \
+  ROUND_B_8WAY(3); \
+  ROUND_B_8WAY(4); \
+  ROUND_B_8WAY(5); \
+  ROUND_B_8WAY(6); \
+  ROUND_B_8WAY(7); \
+  ROUND_B_8WAY(8); \
+  ROUND_B_8WAY(9); \
+  ROUND_B_8WAY(0); /* rounds 10..15 reuse the round 0..5 selections */ \
+  ROUND_B_8WAY(1); \
+  ROUND_B_8WAY(2); \
+  ROUND_B_8WAY(3); \
+  ROUND_B_8WAY(4); \
+  ROUND_B_8WAY(5); \
+  H0 = mm512_xor4( V8, V0, S0, H0 ); /* feed-forward: H[i] is combined with V[i], V[i+8] and S[i%4] */ \
+  H1 = mm512_xor4( V9, V1, S1, H1 ); \
+  H2 = mm512_xor4( VA, V2, S2, H2 ); \
+  H3 = mm512_xor4( VB, V3, S3, H3 ); \
+  H4 = mm512_xor4( VC, V4, S0, H4 ); \
+  H5 = mm512_xor4( VD, V5, S1, H5 ); \
+  H6 = mm512_xor4( VE, V6, S2, H6 ); \
+  H7 = mm512_xor4( VF, V7, S3, H7 ); \
+} while (0)
+
+void blake512_8way_init( blake_8way_big_context *sc )
+{  // Resets sc: H gets eight fixed constants (one 64-bit value replicated per vector), S is zeroed, counters and ptr are cleared.
+   __m512i zero = m512_zero;
+   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
+   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
+   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
+   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
+   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
+   // S is presumably the salt; this init always uses an all-zero value.
+   casti_m512i( sc->S, 0 ) = zero;
+   casti_m512i( sc->S, 1 ) = zero;
+   casti_m512i( sc->S, 2 ) = zero;
+   casti_m512i( sc->S, 3 ) = zero;
+   // No data processed and nothing buffered yet.
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
+}
+
+static void
+blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )
+{  // Buffers len bytes from data into sc->buf and compresses each time the buffer reaches 128 bytes.
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf;
+   size_t ptr;
+   DECL_STATE64_8WAY
+   // len and ptr are shifted right by 3 to index __m512i vectors; presumably they are per-lane byte counts -- TODO confirm.
+   const int buf_size = 128;  // block size in the same units as ptr and len
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+   if ( len < (buf_size - ptr) )   // not enough to fill the block: just buffer and return
+   {
+   memcpy_512( buf + (ptr>>3), vdata, len>>3 );
+   ptr += len;
+   sc->ptr = ptr;
+   return;
+   }
+   // At least one full block will be produced: work on local copies of the state.
+   READ_STATE64(sc);
+   while ( len > 0 )
+   {
+   size_t clen;
+
+   clen = buf_size - ptr;   // room left in the block, capped at the remaining input
+   if ( clen > len )
+      clen = len;
+   memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
+   ptr += clen;
+   vdata = vdata + (clen>>3);
+   len -= clen;
+   if ( ptr == buf_size )
+        {
+      if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 )   // add 1024 (128*8) to T0; a result below 1024 means T0 wrapped
+         T1 = SPH_T64(T1 + 1);                    // so carry into T1
+      COMPRESS64_8WAY;
+      ptr = 0;
+   }
+   }
+   WRITE_STATE64(sc);
+   sc->ptr = ptr;
+}
+
+static void
+blake64_8way_close( blake_8way_big_context *sc, void *dst )
+{  // Builds the final padding in a local buffer, feeds it through blake64_8way(), then writes sc->H to dst via mm512_block_bswap_64.
+   __m512i buf[16];
+   size_t ptr;
+   unsigned bit_len;
+//   No partial-byte (ub, n) handling here: the padding marker is always the fixed value 0x80.
+   sph_u64 th, tl;
+
+   ptr = sc->ptr;
+   bit_len = ((unsigned)ptr << 3);
+//   The first padding word (index ptr>>3) is 0x80 replicated in every 64-bit element.
+//   ptr is only ever used as ptr>>3, so it is assumed to be a multiple of 8 -- TODO confirm.
+//   tl/th are the counter plus the buffered bits; they are written byte-swapped into words 15 and 14 of the final block.
+   buf[ptr>>3] = m512_const1_64( 0x80 );
+   tl = sc->T0 + bit_len;
+   th = sc->T1;
+   if (ptr == 0 )   // blake64_8way() adds 1024 to T0 before compressing, so T0/T1 are pre-biased in each case below
+   {
+   sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);   // nothing buffered: the +1024 wraps T0 and T1 to 0
+   sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
+   }
+   else if ( sc->T0 == 0 )
+   {
+   sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;   // borrow from T1; the +1024 wrap then yields T0 = bit_len and restores T1
+   sc->T1 = SPH_T64(sc->T1 - 1);
+   }
+   else
+   {
+        sc->T0 -= 1024 - bit_len;   // after the +1024 the counter equals the old T0 + bit_len
+   }
+   if ( ptr <= 104 )   // marker, 0x01 word and the two length words all fit in the current block
+   {
+       memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
+       buf[104>>3] = _mm512_or_si512( buf[104>>3],
+                                 m512_const1_64( 0x0100000000000000ULL ) );   // OR keeps the 0x80 marker when ptr == 104
+       buf[112>>3] = m512_const1_64( bswap_64( th ) );
+       buf[120>>3] = m512_const1_64( bswap_64( tl ) );
+
+       blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
+   }
+   else   // not enough room: finish this block with zeros, then emit a second, padding-only block
+  {
+       memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
+
+       blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
+       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);   // bias so the counter is 0 for the padding-only block
+       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
+       memset_zero_512( buf, 112>>3 );
+       buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
+       buf[112>>3] = m512_const1_64( bswap_64( th ) );
+       buf[120>>3] = m512_const1_64( bswap_64( tl ) );
+
+       blake64_8way( sc, buf, 128 );
+   }
+   mm512_block_bswap_64( (__m512i*)dst, sc->H );   // presumably byte-swaps each 64-bit element of H[0..7] into dst -- helper not visible here
+}
+
+void
+blake512_8way_update(void *cc, const void *data, size_t len)
+{  // Public entry point: forwards unchanged to blake64_8way(); cc is used as a blake_8way_big_context.
+   blake64_8way(cc, data, len);
+}
+
+void
+blake512_8way_close(void *cc, void *dst)
+{  // Finalizes with no extra bits (ub = 0, n = 0) and writes the result to dst.
+   blake512_8way_addbits_and_close(cc, 0, 0, dst);
+}
+
+void
+blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{  // NOTE: ub and n are accepted for interface compatibility but ignored; this always finalizes with no extra bits.
+   blake64_8way_close(cc, dst);
+}
+
+#endif  // AVX512

 // Blake-512 4 way

@@ -318,29 +582,6 @@ static const sph_u64 CB[16] = {
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
 } while (0)

-#if SPH_COMPACT_BLAKE_64
-// not used
-#define ROUND_B_4WAY(r)   do { \
-	GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
-		CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
-	GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
-		CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
-	GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
-		CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
-	GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
-		CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
-	GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
-		CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
-	GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
-		CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
-	GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
-		CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
-	GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
-		CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
-} while (0)
-
-#else
-//current_impl
 #define ROUND_B_4WAY(r)   do { \
 	GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
 	GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
@@ -352,120 +593,11 @@ static const sph_u64 CB[16] = {
 	GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
 	} while (0)

-#endif
-
-
-// Blake-512 4 way
-
 #define DECL_STATE64_4WAY \
 	__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
        __m256i S0, S1, S2, S3; \
 	sph_u64 T0, T1;

-#define READ_STATE64_4WAY(state)   do { \
-		H0 = (state)->H[0]; \
-		H1 = (state)->H[1]; \
-		H2 = (state)->H[2]; \
-		H3 = (state)->H[3]; \
-		H4 = (state)->H[4]; \
-		H5 = (state)->H[5]; \
-		H6 = (state)->H[6]; \
-		H7 = (state)->H[7]; \
-		S0 = (state)->S[0]; \
-		S1 = (state)->S[1]; \
-		S2 = (state)->S[2]; \
-		S3 = (state)->S[3]; \
-		T0 = (state)->T0; \
-		T1 = (state)->T1; \
-	} while (0)
-
-#define WRITE_STATE64_4WAY(state)   do { \
-		(state)->H[0] = H0; \
-		(state)->H[1] = H1; \
-		(state)->H[2] = H2; \
-		(state)->H[3] = H3; \
-		(state)->H[4] = H4; \
-		(state)->H[5] = H5; \
-		(state)->H[6] = H6; \
-		(state)->H[7] = H7; \
-		(state)->S[0] = S0; \
-		(state)->S[1] = S1; \
-		(state)->S[2] = S2; \
-		(state)->S[3] = S3; \
-		(state)->T0 = T0; \
-		(state)->T1 = T1; \
-	} while (0)
-
-#if SPH_COMPACT_BLAKE_64
-
-// not used
-#define COMPRESS64_4WAY   do { \
-	__m256i M[16]; \
-	__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
-	__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
-   const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, \
-                                                 0x0001020304050607 ) \
-   unsigned r; \
-	V0 = H0; \
-	V1 = H1; \
-	V2 = H2; \
-	V3 = H3; \
-	V4 = H4; \
-	V5 = H5; \
-	V6 = H6; \
-	V7 = H7; \
-   V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) ); \
-   V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) ); \
-   VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) ); \
-   VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) ); \
-   VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
-                          _mm256_set1_epi64x( CB4 ) ); \
-   VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
-                          _mm256_set1_epi64x( CB5 ) ); \
-   VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
-                          _mm256_set1_epi64x( CB6 ) ); \
-   VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
-                          _mm256_set1_epi64x( CB7, CB7, CB7, CB7 ) ); \
-   M[0x0] = _mm256_shuffle_epi8( *(buf+ 0), shuff_bswap64 ); \
-	M[0x1] = _mm256_shuffle_epi8( *(buf+ 1), shuff_bswap64 ); \
-	M[0x2] = _mm256_shuffle_epi8( *(buf+ 2), shuff_bswap64 ); \
-	M[0x3] = _mm256_shuffle_epi8( *(buf+ 3), shuff_bswap64 ); \
-	M[0x4] = _mm256_shuffle_epi8( *(buf+ 4), shuff_bswap64 ); \
-	M[0x5] = _mm256_shuffle_epi8( *(buf+ 5), shuff_bswap64 ); \
-	M[0x6] = _mm256_shuffle_epi8( *(buf+ 6), shuff_bswap64 ); \
-	M[0x7] = _mm256_shuffle_epi8( *(buf+ 7), shuff_bswap64 ); \
-	M[0x8] = _mm256_shuffle_epi8( *(buf+ 8), shuff_bswap64 ); \
-	M[0x9] = _mm256_shuffle_epi8( *(buf+ 9), shuff_bswap64 ); \
-	M[0xA] = _mm256_shuffle_epi8( *(buf+10), shuff_bswap64 ); \
-	M[0xB] = _mm256_shuffle_epi8( *(buf+11), shuff_bswap64 ); \
-	M[0xC] = _mm256_shuffle_epi8( *(buf+12), shuff_bswap64 ); \
-	M[0xD] = _mm256_shuffle_epi8( *(buf+13), shuff_bswap64 ); \
-	M[0xE] = _mm256_shuffle_epi8( *(buf+14), shuff_bswap64 ); \
-	M[0xF] = _mm256_shuffle_epi8( *(buf+15), shuff_bswap64 ); \
-	for (r = 0; r < 16; r ++) \
-		ROUND_B_4WAY(r); \
-   H0 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
-   H1 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
-   H2 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S2, V2 ), VA ), H2 ); \
-   H3 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S3, V3 ), VB ), H3 ); \
-   H4 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S0, V4 ), VC ), H4 ); \
-   H5 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S1, V5 ), VD ), H5 ); \
-   H6 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S2, V6 ), VE ), H6 ); \
-   H7 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S3, V7 ), VF ), H7 ); \
-} while (0)
-
-#else
-
-//current impl
-
 #define COMPRESS64_4WAY   do \
 { \
  __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -493,7 +625,8 @@ static const sph_u64 CB[16] = {
                         m256_const1_64( CB6 ) );  \
  VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
                         m256_const1_64( CB7 ) );  \
-  shuf_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+  shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
  M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
  M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
  M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -536,13 +669,8 @@ static const sph_u64 CB[16] = {
  H7 = mm256_xor4( VF, V7, S3, H7 ); \
 } while (0)

-#endif

-static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
-
-static void
-blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
-              const sph_u64 *salt )
+void blake512_4way_init( blake_4way_big_context *sc )
 {
   __m256i zero = m256_zero;
   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
@@ -553,12 +681,10 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
-
   casti_m256i( sc->S, 0 ) = zero;
   casti_m256i( sc->S, 1 ) = zero;
   casti_m256i( sc->S, 2 ) = zero;
   casti_m256i( sc->S, 3 ) = zero;
-
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
 }
@@ -583,7 +709,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
 	return;
   }

-   READ_STATE64_4WAY(sc);
+   READ_STATE64(sc);
   while ( len > 0 )
   {
 	size_t clen;
@@ -603,25 +729,21 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
 		ptr = 0;
 	}
   }
-   WRITE_STATE64_4WAY(sc);
+   WRITE_STATE64(sc);
   sc->ptr = ptr;
 }

 static void
-blake64_4way_close( blake_4way_big_context *sc,
-	unsigned ub, unsigned n, void *dst, size_t out_size_w64)
+blake64_4way_close( blake_4way_big_context *sc, void *dst )
 {
   __m256i buf[16];
   size_t ptr;
   unsigned bit_len;
-   uint64_t z, zz;
   sph_u64 th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   z = 0x80 >> n;
-   zz = ((ub & -z) | z) & 0xFF;
-   buf[ptr>>3] = _mm256_set1_epi64x( zz );
+   buf[ptr>>3] = m256_const1_64( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;
   if (ptr == 0 )
@@ -638,43 +760,44 @@ blake64_4way_close( blake_4way_big_context *sc,
   {
        sc->T0 -= 1024 - bit_len;
   }
+
   if ( ptr <= 104 )
   {
       memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
-       if ( out_size_w64 == 8 )
-          buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
+       buf[104>>3] = _mm256_or_si256( buf[104>>3],
                                 m256_const1_64( 0x0100000000000000ULL ) );
-       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
-       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
+       buf[112>>3] = m256_const1_64( bswap_64( th ) );
+       buf[120>>3] = m256_const1_64( bswap_64( tl ) );

       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
   }
   else
-  {
+   {
       memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
       memset_zero_256( buf, 112>>3 ); 
-       if ( out_size_w64 == 8 )
-           buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
-       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
-       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
+       buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
+       buf[112>>3] = m256_const1_64( bswap_64( th ) );
+       buf[120>>3] = m256_const1_64( bswap_64( tl ) );

       blake64_4way( sc, buf, 128 );
   }
   mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }

+/*
 void
 blake512_4way_init(void *cc)
 {
 	blake64_4way_init(cc, IV512, salt_zero_big);
 }
+*/

 void
-blake512_4way(void *cc, const void *data, size_t len)
+blake512_4way_update(void *cc, const void *data, size_t len)
 {
 	blake64_4way(cc, data, len);
 }
@@ -682,15 +805,18 @@ blake512_4way(void *cc, const void *data, size_t len)
 void
 blake512_4way_close(void *cc, void *dst)
 {
-	blake512_4way_addbits_and_close(cc, 0, 0, dst);
+   blake64_4way_close( cc, dst );
+
+//   blake512_4way_addbits_and_close(cc, dst);
 }

+/*
 void
 blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
 	blake64_4way_close(cc, ub, n, dst, 8);
 }
-
+*/
 #ifdef __cplusplus
 }
 #endif
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -64,7 +64,8 @@ typedef bmw_4way_small_context bmw256_4way_context;

 void bmw256_4way_init( bmw256_4way_context *ctx );

-void bmw256_4way(void *cc, const void *data, size_t len);
+void bmw256_4way_update(void *cc, const void *data, size_t len);
+#define bmw256_4way bmw256_4way_update

 void bmw256_4way_close(void *cc, void *dst);

@@ -78,7 +79,7 @@ void bmw256_4way_addbits_and_close(
 // BMW-256 8 way 32

 typedef struct {
-   __m256i buf[64];
+   __m256i buf[16];
   __m256i H[16];
   size_t ptr;
   uint32_t bit_count;  // assume bit_count fits in 32 bits
@@ -87,11 +88,33 @@ typedef struct {
 typedef bmw_8way_small_context bmw256_8way_context;

 void bmw256_8way_init( bmw256_8way_context *ctx );
-void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len );
+void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
+                         size_t len );
+#define bmw256_8way bmw256_8way_update
 void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );

 #endif

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// BMW-256 16 way 32
+
+typedef struct {   // context for the bmw256_16way_* functions declared below
+   __m512i buf[16];     // 16 x 512-bit vectors; presumably the pending input block -- TODO confirm against the .c file
+   __m512i H[16];       // 16 x 512-bit vectors; presumably the chaining state -- TODO confirm
+   size_t ptr;          // presumably the fill position within buf -- TODO confirm
+   uint32_t bit_count;  // assume bit_count fits in 32 bits
+} bmw_16way_small_context __attribute__ ((aligned (128)));
+
+typedef bmw_16way_small_context bmw256_16way_context;
+
+void bmw256_16way_init( bmw256_16way_context *ctx );
+void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
+                          size_t len );
+void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
+
+#endif
+

 #if defined(__SSE2__)

@@ -107,7 +130,8 @@ typedef struct {
 typedef bmw_2way_big_context bmw512_2way_context;

 void bmw512_2way_init( bmw512_2way_context *ctx );
-void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len );
+void bmw512_2way_update( bmw512_2way_context *ctx, const void *data,
+                         size_t len );
 void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );

 #endif // __SSE2__
@@ -121,14 +145,15 @@ typedef struct {
   __m256i H[16];
   size_t ptr;
   sph_u64 bit_count;
-} bmw_4way_big_context;
+} bmw_4way_big_context __attribute__((aligned(128)));

 typedef bmw_4way_big_context bmw512_4way_context;


 void bmw512_4way_init(void *cc);

-void bmw512_4way(void *cc, const void *data, size_t len);
+void bmw512_4way_update(void *cc, const void *data, size_t len);
+#define bmw512_4way bmw512_4way_update

 void bmw512_4way_close(void *cc, void *dst);

@@ -137,6 +162,22 @@ void bmw512_4way_addbits_and_close(

 #endif  // __AVX2__

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+typedef struct {   // context for the bmw512_8way_* functions declared below
+   __m512i buf[16];      // 16 x 512-bit vectors; presumably the pending input block -- TODO confirm against the .c file
+   __m512i H[16];        // 16 x 512-bit vectors; presumably the chaining state -- TODO confirm
+   size_t ptr;           // presumably the fill position within buf -- TODO confirm
+   uint64_t bit_count;   // 64-bit here, unlike the 32-bit bit_count of bmw_16way_small_context
+} bmw512_8way_context __attribute__((aligned(128)));
+
+void bmw512_8way_init( bmw512_8way_context *ctx );
+void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
+                         size_t len );
+void bmw512_8way_close( bmw512_8way_context *ctx, void *dst );
+
+#endif // AVX512
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -1,13 +1,67 @@
 #include "bmw512-gate.h"
-
-#ifdef BMW512_4WAY
-
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
 //#include "sph_keccak.h"
 #include "bmw-hash-4way.h"

+#if defined(BMW512_8WAY)
+
+void bmw512hash_8way(void *state, const void *input)
+{  // One-shot BMW-512 over 8 lanes: init, update with length 80, close into state.
+    bmw512_8way_context ctx;
+    bmw512_8way_init( &ctx );
+    bmw512_8way_update( &ctx, input, 80 );   // 80 is presumably the per-lane block header size in bytes -- TODO confirm
+    bmw512_8way_close( &ctx, state );
+}
+
+int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scans nonces 8 at a time starting at work->data[19]; stores the count tried in *hashes_done and always returns 0.
+   uint32_t vdata[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash[16*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[49]);   // 3*16+1: presumably the high 32 bits of 64-bit word 3 of lane 0; lanes are 2 words apart
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;
+   // Presumably byte-swaps the 80-byte header and interleaves it into 8 lanes of 64 bits -- helper not visible here.
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   do {
+       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0 ,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+      // Lane i hashes nonce n+i.
+      bmw512hash_8way( hash, vdata );
+      // Cheap prefilter on the top hash word; fulltest() makes the final decision.
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
+      // <= rather than <: a top word equal to the target's can still meet the full target.
+      {
+          extr_lane_8x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) )
+          {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+      }
+      n += 8;
+
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart) );
+   // n advanced by 8 per iteration, so this is the exact number of nonces hashed.
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+   
+#elif defined(BMW512_4WAY)
+
+//#ifdef BMW512_4WAY
+
 void bmw512hash_4way(void *state, const void *input)
 {
    bmw512_4way_context ctx;
@@ -19,16 +73,17 @@ void bmw512hash_4way(void *state, const void *input)
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-   uint32_t hash[16*4] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (128)));
+   uint32_t hash[16*4] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[25]);   // 3*8+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce -  4;
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-//   const uint32_t Htarg = ptarget[7];
+   const uint32_t Htarg = ptarget[7];
    int thr_id = mythr->id;  // thr_id arg is deprecated

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
@@ -39,7 +94,8 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
      bmw512hash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
+      if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
+//      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
@@ -50,9 +106,9 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
      }
      n += 4;

-   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/algo/bmw/bmw512-gate.c
+++ b/algo/bmw/bmw512-gate.c
@@ -2,9 +2,12 @@

 bool register_bmw512_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
  opt_target_factor = 256.0;
-#if defined (BMW512_4WAY)
+#if defined (BMW512_8WAY)
+  gate->scanhash  = (void*)&scanhash_bmw512_8way;
+  gate->hash      = (void*)&bmw512hash_8way;
+#elif defined (BMW512_4WAY)
  gate->scanhash  = (void*)&scanhash_bmw512_4way;
  gate->hash      = (void*)&bmw512hash_4way;
 #else
--- a/algo/bmw/bmw512-gate.h
+++ b/algo/bmw/bmw512-gate.h
@@ -1,23 +1,33 @@
 #ifndef BMW512_GATE_H__
-#define BMW512_GATE_H__
+#define BMW512_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define BMW512_8WAY 1
+#elif defined(__AVX2__)
  #define BMW512_4WAY 1
 #endif

-#if defined(BMW512_4WAY)
+#if defined(BMW512_8WAY)
+
+void bmw512hash_8way( void *state, const void *input );
+int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(BMW512_4WAY)

 void bmw512hash_4way( void *state, const void *input );
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

-#endif
+#else

 void bmw512hash( void *state, const void *input );
 int scanhash_bmw512( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );

 #endif
+
+#endif
--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -58,8 +58,7 @@ static const sph_u64 IV512[] = {

 #if defined(__SSE2__)

-// BMW-512 2 way 64
-
+// BMW-512 2 way 64 

 #define s2b0(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \
@@ -556,18 +555,15 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
   compress_big_2way( buf, h, h2 );
   memcpy_128( buf, h2, 16 );
   compress_big_2way( buf, final_b2, h1 );
-   memcpy( (__m128i*)dst, h1+16, 8 );
+   memcpy( (__m128i*)dst, h1+8, 8 );
 }

 #endif  // __SSE2__

-
-
 #if defined(__AVX2__)

 // BMW-512 4 way 64

-
 #define sb0(x) \
   mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 3), \
                mm256_rol_64(     (x), 4),  mm256_rol_64(     (x),37) )
@@ -636,165 +632,152 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
                     sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
      add_elt_b( M, H, (i)-16 ) )

+// Wb0..Wb15: signed sums of five ( M[i] ^ H[i] ) terms per 64-bit lane. The first
+// three terms form one chain; the last two are paired and combined at the end.
 #define Wb0 \
   _mm256_add_epi64( \
-       _mm256_add_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
-                               _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-             _mm256_xor_si256( M[10], H[10] ) ), \
-          _mm256_xor_si256( M[13], H[13] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+      _mm256_add_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
+                           _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+         _mm256_xor_si256( M[10], H[10] ) ), \
+      _mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define Wb1 \
-   _mm256_sub_epi64( \
+   _mm256_add_epi64( \
       _mm256_add_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
-                               _mm256_xor_si256( M[ 8], H[ 8] ) ), \
-             _mm256_xor_si256( M[11], H[11] ) ), \
-          _mm256_xor_si256( M[14], H[14] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+          _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
+                            _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+          _mm256_xor_si256( M[11], H[11] ) ), \
+       _mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \
+                         _mm256_xor_si256( M[15], H[15] ) ) )

 #define Wb2 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
-                               _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-             _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-          _mm256_xor_si256( M[12], H[12] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_sub_epi64( \
+      _mm256_add_epi64( \
+         _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                           _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+         _mm256_xor_si256( M[ 9], H[ 9] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define Wb3 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
-                               _mm256_xor_si256( M[ 1], H[ 1] ) ), \
-             _mm256_xor_si256( M[ 8], H[ 8] ) ), \
-          _mm256_xor_si256( M[10], H[10] ) ), \
-       _mm256_xor_si256( M[13], H[13] ) )
+   _mm256_sub_epi64( \
+      _mm256_add_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                           _mm256_xor_si256( M[ 1], H[ 1] ) ), \
+         _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \
+                        _mm256_xor_si256( M[13], H[13] ) ) )

 #define Wb4 \
   _mm256_sub_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
-                               _mm256_xor_si256( M[ 2], H[ 2] ) ), \
-             _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-          _mm256_xor_si256( M[11], H[11] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+      _mm256_add_epi64( \
+         _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                           _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+         _mm256_xor_si256( M[ 9], H[ 9] ) ), \
+      _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define Wb5 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
-                               _mm256_xor_si256( M[ 2], H[ 2] ) ), \
-             _mm256_xor_si256( M[10], H[10] ) ), \
-          _mm256_xor_si256( M[12], H[12] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_sub_epi64( \
+      _mm256_add_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
+                           _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+         _mm256_xor_si256( M[10], H[10] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define Wb6 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
-                               _mm256_xor_si256( M[ 0], H[ 0] ) ), \
-             _mm256_xor_si256( M[ 3], H[ 3] ) ), \
-          _mm256_xor_si256( M[11], H[11] ) ), \
-       _mm256_xor_si256( M[13], H[13] ) )
+   _mm256_sub_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
+                           _mm256_xor_si256( M[ 0], H[ 0] ) ), \
+         _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \
+                        _mm256_xor_si256( M[13], H[13] ) ) )

 #define Wb7 \
   _mm256_sub_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
-                               _mm256_xor_si256( M[ 4], H[ 4] ) ), \
-             _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-          _mm256_xor_si256( M[12], H[12] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                           _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+         _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+      _mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define Wb8 \
-   _mm256_sub_epi64( \
-       _mm256_add_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
-                               _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[13], H[13] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_add_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
+                           _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define Wb9 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
-                               _mm256_xor_si256( M[ 3], H[ 3] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-       _mm256_xor_si256( M[14], H[14] ) )
+   _mm256_sub_epi64( \
+      _mm256_add_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                           _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
+                        _mm256_xor_si256( M[14], H[14] ) ) )

 #define Wb10 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
-                               _mm256_xor_si256( M[ 1], H[ 1] ) ), \
-             _mm256_xor_si256( M[ 4], H[ 4] ) ), \
-          _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-       _mm256_xor_si256( M[15], H[15] ) )
+   _mm256_sub_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
+                           _mm256_xor_si256( M[ 1], H[ 1] ) ), \
+         _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
+                        _mm256_xor_si256( M[15], H[15] ) ) )

 #define Wb11 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
-                               _mm256_xor_si256( M[ 0], H[ 0] ) ), \
-             _mm256_xor_si256( M[ 2], H[ 2] ) ), \
-          _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-       _mm256_xor_si256( M[ 9], H[ 9] ) )
+   _mm256_sub_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
+                           _mm256_xor_si256( M[ 0], H[ 0] ) ), \
+         _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
+                        _mm256_xor_si256( M[ 9], H[ 9] ) ) )

 #define Wb12 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
-                               _mm256_xor_si256( M[ 3], H[ 3] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-       _mm256_xor_si256( M[10], H[10] ) )
+   _mm256_sub_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                           _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
+                        _mm256_xor_si256( M[10], H[10] ) ) )

 #define Wb13 \
   _mm256_add_epi64( \
-       _mm256_add_epi64( \
-          _mm256_add_epi64( \
-             _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
-                               _mm256_xor_si256( M[ 4], H[ 4] ) ), \
-             _mm256_xor_si256( M[ 7], H[ 7] ) ), \
-          _mm256_xor_si256( M[10], H[10] ) ), \
-       _mm256_xor_si256( M[11], H[11] ) )
+      _mm256_add_epi64( \
+         _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
+                           _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+         _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+      _mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \
+                        _mm256_xor_si256( M[11], H[11] ) ) )

 #define Wb14 \
   _mm256_sub_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_add_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
-                               _mm256_xor_si256( M[ 5], H[ 5] ) ), \
-             _mm256_xor_si256( M[ 8], H[ 8] ) ), \
-          _mm256_xor_si256( M[11], H[11] ) ), \
-       _mm256_xor_si256( M[12], H[12] ) )
+      _mm256_add_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
+                           _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+         _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+      _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
+                        _mm256_xor_si256( M[12], H[12] ) ) )

 #define Wb15 \
-   _mm256_add_epi64( \
-       _mm256_sub_epi64( \
-          _mm256_sub_epi64( \
-             _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
-                               _mm256_xor_si256( M[ 4], H[4] ) ), \
-             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
-          _mm256_xor_si256( M[ 9], H[ 9] ) ), \
-       _mm256_xor_si256( M[13], H[13] ) )
+   _mm256_sub_epi64( \
+      _mm256_sub_epi64( \
+         _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
+                           _mm256_xor_si256( M[ 4], H[4] ) ), \
+         _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+      _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
+                        _mm256_xor_si256( M[13], H[13] ) ) )
+

 void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
 {
@@ -840,87 +823,57 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
           mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
           mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

-   dH[ 0] = _mm256_add_epi64(
-               _mm256_xor_si256( M[0],
-                  _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
-                                    _mm256_srli_epi64( qt[16], 5 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ) );
-   dH[ 1] = _mm256_add_epi64(
-               _mm256_xor_si256( M[1],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
-                                    _mm256_slli_epi64( qt[17], 8 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ) );
-   dH[ 2] = _mm256_add_epi64(
-               _mm256_xor_si256( M[2],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
-                                    _mm256_slli_epi64( qt[18], 5 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ) );
-   dH[ 3] = _mm256_add_epi64(
-               _mm256_xor_si256( M[3],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
-                                    _mm256_slli_epi64( qt[19], 5 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ) );
-   dH[ 4] = _mm256_add_epi64(
-               _mm256_xor_si256( M[4],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
-                                    _mm256_slli_epi64( qt[20], 0 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ) );
-   dH[ 5] = _mm256_add_epi64(
-               _mm256_xor_si256( M[5],
-                  _mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
-                                    _mm256_srli_epi64( qt[21], 6 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ) );
-   dH[ 6] = _mm256_add_epi64(
-               _mm256_xor_si256( M[6],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
-                                    _mm256_slli_epi64( qt[22], 6 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ) );
-   dH[ 7] = _mm256_add_epi64(
-               _mm256_xor_si256( M[7],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
-                                    _mm256_slli_epi64( qt[23], 2 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ) );
-   dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[4], 9 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
-                 _mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
-                                   _mm256_xor_si256( qt[23], qt[ 8] ) ) );
-   dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[5], 10 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
-                                   _mm256_xor_si256( qt[16], qt[ 9] ) ) );
-   dH[10] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[6], 11 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
-                 _mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
-                                   _mm256_xor_si256( qt[17], qt[10] ) ) );
-   dH[11] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[7], 12 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
-                 _mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
-                                   _mm256_xor_si256( qt[18], qt[11] ) ) );
-   dH[12] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[0], 13 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
-                                   _mm256_xor_si256( qt[19], qt[12] ) ) );
-   dH[13] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[1], 14 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
-                                   _mm256_xor_si256( qt[20], qt[13] ) ) );
-   dH[14] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[2], 15 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
-                                   _mm256_xor_si256( qt[21], qt[14] ) ) );
-   dH[15] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[3], 16 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
-                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
-} 
+
+   // ( M[w] ^ (xh << lsh) ^ (qt[qa] >> rsh) ) + ( xl ^ qt[qb] ^ qt[qc] )
+#define DH1L( w, lsh, rsh, qa, qb, qc ) \
+   _mm256_add_epi64( _mm256_xor_si256( M[w], \
+         _mm256_xor_si256( _mm256_slli_epi64( xh, lsh ), \
+                           _mm256_srli_epi64( qt[qa], rsh ) ) ), \
+      _mm256_xor_si256( _mm256_xor_si256( xl, qt[qb] ), qt[qc] ) )
+
+   // ( M[m] ^ (xh >> sr) ^ (qt[a] << sl) ) + ( xl ^ qt[b] ^ qt[c] )
+#define DH1R( m, sr, sl, a, b, c ) \
+   _mm256_add_epi64( _mm256_xor_si256( M[m], \
+         _mm256_xor_si256( _mm256_srli_epi64( xh, sr ), \
+                           _mm256_slli_epi64( qt[a], sl ) ) ), \
+      _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+
+   // Expression macro, no trailing ';': mm256_rol_64( dH[h], rl ) + ( xh ^ qt[a] ^ M[m] ) + ( (xl << sl) ^ qt[b] ^ qt[c] )
+#define DH2L( m, rl, sl, h, a, b, c ) \
+   _mm256_add_epi64( _mm256_add_epi64( mm256_rol_64( dH[h], rl ), \
+         _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] ) ), \
+      _mm256_xor_si256( _mm256_slli_epi64( xl, sl ), \
+                        _mm256_xor_si256( qt[b], qt[c] ) ) )
+
+   // Expression macro, no trailing ';': mm256_rol_64( dH[h], rl ) + ( xh ^ qt[a] ^ M[m] ) + ( (xl >> sr) ^ qt[b] ^ qt[c] )
+#define DH2R( m, rl, sr, h, a, b, c ) \
+   _mm256_add_epi64( _mm256_add_epi64( mm256_rol_64( dH[h], rl ), \
+         _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] ) ), \
+      _mm256_xor_si256( _mm256_srli_epi64( xl, sr ), \
+                        _mm256_xor_si256( qt[b], qt[c] ) ) )
+
+   dH[ 0] = DH1L(  0,  5,  5, 16, 24, 0 );
+   dH[ 1] = DH1R(  1,  7,  8, 17, 25, 1 );
+   dH[ 2] = DH1R(  2,  5,  5, 18, 26, 2 );
+   dH[ 3] = DH1R(  3,  1,  5, 19, 27, 3 );
+   dH[ 4] = DH1R(  4,  3,  0, 20, 28, 4 );
+   dH[ 5] = DH1L(  5,  6,  6, 21, 29, 5 );
+   dH[ 6] = DH1R(  6,  4,  6, 22, 30, 6 );
+   dH[ 7] = DH1R(  7, 11,  2, 23, 31, 7 );
+   dH[ 8] = DH2L(  8,  9,  8,  4, 24, 23,  8 );
+   dH[ 9] = DH2R(  9, 10,  6,  5, 25, 16,  9 );
+   dH[10] = DH2L( 10, 11,  6,  6, 26, 17, 10 );
+   dH[11] = DH2L( 11, 12,  4,  7, 27, 18, 11 );
+   dH[12] = DH2R( 12, 13,  3,  0, 28, 19, 12 );
+   dH[13] = DH2R( 13, 14,  4,  1, 29, 20, 13 );
+   dH[14] = DH2R( 14, 15,  7,  2, 30, 21, 14 );
+   dH[15] = DH2R( 15, 16,  2,  3, 31, 22, 15 );
+
+#undef DH1L
+#undef DH1R
+#undef DH2L
+#undef DH2R
+}

 static const __m256i final_b[16] =
 {
@@ -1060,7 +1013,7 @@ bmw512_4way_init(void *cc)
 }

 void
-bmw512_4way(void *cc, const void *data, size_t len)
+bmw512_4way_update(void *cc, const void *data, size_t len)  /* forwards all three arguments to bmw64_4way() */
 {
 	bmw64_4way(cc, data, len);
 }
@@ -1079,6 +1032,483 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 #endif  // __AVX2__

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// BMW-512 8 WAY
+// s8b0..s8b3: mm512_xor4 of x>>a, x<<b and two mm512_rol_64 terms; s8b4/s8b5: x ^ (x>>1), x ^ (x>>2).
+#define s8b0(x) \
+   mm512_xor4( _mm512_srli_epi64( (x), 1), _mm512_slli_epi64( (x), 3), \
+                mm512_rol_64(     (x), 4),  mm512_rol_64(     (x),37) )
+
+#define s8b1(x) \
+   mm512_xor4( _mm512_srli_epi64( (x), 1), _mm512_slli_epi64( (x), 2), \
+                mm512_rol_64(     (x),13),  mm512_rol_64(     (x),43) )
+
+#define s8b2(x) \
+   mm512_xor4( _mm512_srli_epi64( (x), 2), _mm512_slli_epi64( (x), 1), \
+                mm512_rol_64(     (x),19),  mm512_rol_64(     (x),53) )
+
+#define s8b3(x) \
+   mm512_xor4( _mm512_srli_epi64( (x), 2), _mm512_slli_epi64( (x), 2), \
+                mm512_rol_64(     (x),28),  mm512_rol_64(     (x),59) )
+
+#define s8b4(x) \
+  _mm512_xor_si512( (x), _mm512_srli_epi64( (x), 1 ) )
+
+#define s8b5(x) \
+  _mm512_xor_si512( (x), _mm512_srli_epi64( (x), 2 ) )
+// r8b1..r8b7: mm512_rol_64 of x by 5, 11, 27, 32, 37, 43, 53 (used by expand2b8).
+#define r8b1(x)    mm512_rol_64( (x),  5 )
+#define r8b2(x)    mm512_rol_64( (x), 11 )
+#define r8b3(x)    mm512_rol_64( (x), 27 )
+#define r8b4(x)    mm512_rol_64( (x), 32 )
+#define r8b5(x)    mm512_rol_64( (x), 37 )
+#define r8b6(x)    mm512_rol_64( (x), 43 )
+#define r8b7(x)    mm512_rol_64( (x), 53 )
+// Passes M[ (j+off) & 15 ] to mm512_rol_64 with count ( (j+off) & 15 ) + 1.
+#define rol8w_off_64( M, j, off ) \
+   mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
+                  ( ( (j) + (off) ) & 0xF ) + 1 )
+// ( r(j,0) + r(j,3) - r(j,10) + (j+16)*0x0555555555555555 ) ^ H[ (j+7) & 15 ], with r = rol8w_off_64.
+#define add_elt_b8( M, H, j ) \
+   _mm512_xor_si512( \
+      _mm512_add_epi64( \
+            _mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \
+                                                rol8w_off_64( M, j, 3 ) ), \
+                             rol8w_off_64( M, j, 10 ) ), \
+            _mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
+       H[ ( (j)+7 ) & 0xF ] )
+// Value for qt[i]: qt[i-16..i-1] through s8b1, s8b2, s8b3, s8b0 (cycled), combined via mm512_add4_64, plus add_elt_b8( M, H, i-16 ).
+#define expand1b8( qt, M, H, i ) \
+   _mm512_add_epi64( mm512_add4_64( \
+      mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \
+                     s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \
+      mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \
+                     s8b3( qt[ (i)-10 ] ), s8b0( qt[ (i)- 9 ] )), \
+      mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \
+                     s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \
+      mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \
+                     s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \
+      add_elt_b8( M, H, (i)-16 ) )
+// Value for qt[i]: qt[i-16], qt[i-14], .. qt[i-4] raw; qt[i-15], qt[i-13], .. qt[i-3] via r8b1..r8b7; qt[i-2], qt[i-1] via s8b4, s8b5; plus add_elt_b8.
+#define expand2b8( qt, M, H, i) \
+   _mm512_add_epi64( mm512_add4_64( \
+      mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \
+                     qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \
+      mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \
+                     qt[ (i)-10 ], r8b4( qt[ (i)- 9 ] ) ), \
+      mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \
+                     qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \
+      mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \
+                     s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
+      add_elt_b8( M, H, (i)-16 ) )
+// W8b0..W8b15: signed sums of five ( M[i] ^ H[i] ) terms per 64-bit lane, grouped as (3-term chain) op (2-term pair).
+#define W8b0 \
+   _mm512_add_epi64( \
+      _mm512_add_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
+                           _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+         _mm512_xor_si512( M[10], H[10] ) ), \
+      _mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W8b1 \
+   _mm512_add_epi64( \
+       _mm512_add_epi64( \
+          _mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \
+                            _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+          _mm512_xor_si512( M[11], H[11] ) ), \
+       _mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \
+                         _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W8b2 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+         _mm512_xor_si512( M[ 9], H[ 9] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W8b3 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 1], H[ 1] ) ), \
+         _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+#define W8b4 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+         _mm512_xor_si512( M[ 9], H[ 9] ) ), \
+      _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W8b5 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
+                           _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+         _mm512_xor_si512( M[10], H[10] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W8b6 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \
+                           _mm512_xor_si512( M[ 0], H[ 0] ) ), \
+         _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+#define W8b7 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+         _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+      _mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W8b8 \
+   _mm512_add_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
+                           _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W8b9 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W8b10 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
+                           _mm512_xor_si512( M[ 1], H[ 1] ) ), \
+         _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W8b11 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
+                           _mm512_xor_si512( M[ 0], H[ 0] ) ), \
+         _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
+                        _mm512_xor_si512( M[ 9], H[ 9] ) ) )
+
+#define W8b12 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
+                        _mm512_xor_si512( M[10], H[10] ) ) )
+
+#define W8b13 \
+   _mm512_add_epi64( \
+      _mm512_add_epi64( \
+         _mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
+                           _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+         _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+      _mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \
+                        _mm512_xor_si512( M[11], H[11] ) ) )
+
+#define W8b14 \
+   _mm512_sub_epi64( \
+      _mm512_add_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
+                           _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+         _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+      _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[12], H[12] ) ) )
+
+#define W8b15 \
+   _mm512_sub_epi64( \
+      _mm512_sub_epi64( \
+         _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
+                           _mm512_xor_si512( M[ 4], H[4] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+// 8-way BMW-512 compression on 64-bit lanes: builds qt[0..31] from M and H,
+// folds qt[16..31] into xl/xh, then writes dH[0..15]. dH[8..15] read back
+// dH[0..7], so the dH assignments must stay in order.
+void compress_big_8way( const __m512i *M, const __m512i H[16],
+                        __m512i dH[16] )
+{
+   __m512i qt[32], xl, xh;
+   // qt[0..15] from W8b*/H; qt[16..17] via expand1b8; qt[18..31] via expand2b8.
+   qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] );
+   qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] );
+   qt[ 2] = _mm512_add_epi64( s8b2( W8b2 ), H[ 3] );
+   qt[ 3] = _mm512_add_epi64( s8b3( W8b3 ), H[ 4] );
+   qt[ 4] = _mm512_add_epi64( s8b4( W8b4 ), H[ 5] );
+   qt[ 5] = _mm512_add_epi64( s8b0( W8b5 ), H[ 6] );
+   qt[ 6] = _mm512_add_epi64( s8b1( W8b6 ), H[ 7] );
+   qt[ 7] = _mm512_add_epi64( s8b2( W8b7 ), H[ 8] );
+   qt[ 8] = _mm512_add_epi64( s8b3( W8b8 ), H[ 9] );
+   qt[ 9] = _mm512_add_epi64( s8b4( W8b9 ), H[10] );
+   qt[10] = _mm512_add_epi64( s8b0( W8b10), H[11] );
+   qt[11] = _mm512_add_epi64( s8b1( W8b11), H[12] );
+   qt[12] = _mm512_add_epi64( s8b2( W8b12), H[13] );
+   qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] );
+   qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
+   qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
+   qt[16] = expand1b8( qt, M, H, 16 );
+   qt[17] = expand1b8( qt, M, H, 17 );
+   qt[18] = expand2b8( qt, M, H, 18 );
+   qt[19] = expand2b8( qt, M, H, 19 );
+   qt[20] = expand2b8( qt, M, H, 20 );
+   qt[21] = expand2b8( qt, M, H, 21 );
+   qt[22] = expand2b8( qt, M, H, 22 );
+   qt[23] = expand2b8( qt, M, H, 23 );
+   qt[24] = expand2b8( qt, M, H, 24 );
+   qt[25] = expand2b8( qt, M, H, 25 );
+   qt[26] = expand2b8( qt, M, H, 26 );
+   qt[27] = expand2b8( qt, M, H, 27 );
+   qt[28] = expand2b8( qt, M, H, 28 );
+   qt[29] = expand2b8( qt, M, H, 29 );
+   qt[30] = expand2b8( qt, M, H, 30 );
+   qt[31] = expand2b8( qt, M, H, 31 );
+
+   xl = _mm512_xor_si512(
+           mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
+           mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
+   xh = _mm512_xor_si512( xl, _mm512_xor_si512(
+           mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
+           mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+
+   // dH[0..7]: ( M[m] ^ shift(xh) ^ shift(qt[a]) ) + ( xl ^ qt[b] ^ qt[c] ).
+   // DH1L shifts xh left and qt[a] right; DH1R shifts xh right and qt[a] left.
+#define DH1L( m, sl, sr, a, b, c ) \
+   _mm512_add_epi64( _mm512_xor_si512( M[m], \
+         _mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
+                           _mm512_srli_epi64( qt[a], sr ) ) ), \
+      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+
+#define DH1R( m, sr, sl, a, b, c ) \
+   _mm512_add_epi64( _mm512_xor_si512( M[m], \
+         _mm512_xor_si512( _mm512_srli_epi64( xh, sr ), \
+                           _mm512_slli_epi64( qt[a], sl ) ) ), \
+      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+
+   // dH[8..15]: mm512_rol_64( dH[h], rl ) + ( xh ^ qt[a] ^ M[m] ) + ( shift(xl) ^ qt[b] ^ qt[c] ).
+#define DH2L( m, rl, sl, h, a, b, c ) \
+   _mm512_add_epi64( _mm512_add_epi64( mm512_rol_64( dH[h], rl ), \
+         _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] ) ), \
+      _mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
+                        _mm512_xor_si512( qt[b], qt[c] ) ) )
+
+#define DH2R( m, rl, sr, h, a, b, c ) \
+   _mm512_add_epi64( _mm512_add_epi64( mm512_rol_64( dH[h], rl ), \
+         _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] ) ), \
+      _mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \
+                        _mm512_xor_si512( qt[b], qt[c] ) ) )
+
+   dH[ 0] = DH1L(  0,  5,  5, 16, 24, 0 );
+   dH[ 1] = DH1R(  1,  7,  8, 17, 25, 1 );
+   dH[ 2] = DH1R(  2,  5,  5, 18, 26, 2 );
+   dH[ 3] = DH1R(  3,  1,  5, 19, 27, 3 );
+   dH[ 4] = DH1R(  4,  3,  0, 20, 28, 4 );
+   dH[ 5] = DH1L(  5,  6,  6, 21, 29, 5 );
+   dH[ 6] = DH1R(  6,  4,  6, 22, 30, 6 );
+   dH[ 7] = DH1R(  7, 11,  2, 23, 31, 7 );
+   dH[ 8] = DH2L(  8,  9,  8,  4, 24, 23,  8 );
+   dH[ 9] = DH2R(  9, 10,  6,  5, 25, 16,  9 );
+   dH[10] = DH2L( 10, 11,  6,  6, 26, 17, 10 );
+   dH[11] = DH2L( 11, 12,  4,  7, 27, 18, 11 );
+   dH[12] = DH2R( 12, 13,  3,  0, 28, 19, 12 );
+   dH[13] = DH2R( 13, 14,  4,  1, 29, 20, 13 );
+   dH[14] = DH2R( 14, 15,  7,  2, 30, 21, 14 );
+   dH[15] = DH2R( 15, 16,  2,  3, 31, 22, 15 );
+
+#undef DH1L
+#undef DH1R
+#undef DH2L
+#undef DH2R
+}
+
+static const __m512i final_b8[16] =   // entry i is 0xaaaaaaaaaaaaaaa0 + i in each of the 8 elements
+{  // passed as the second argument of the last compress_big_8way call in bmw512_8way_close
+   { 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
+     0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
+     0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
+     0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
+   { 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
+     0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
+     0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
+     0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
+   { 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
+     0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
+     0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
+     0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
+   { 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
+     0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
+     0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
+     0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
+   { 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
+     0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
+     0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
+     0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
+   { 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
+     0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
+     0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
+     0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
+   { 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
+     0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
+     0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
+     0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
+   { 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
+     0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
+     0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
+     0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 },
+   { 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
+     0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
+     0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
+     0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 },
+   { 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
+     0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
+     0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
+     0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 },
+   { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+     0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+     0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+     0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
+   { 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
+     0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
+     0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
+     0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab },
+   { 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
+     0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
+     0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
+     0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac },
+   { 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
+     0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
+     0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
+     0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad },
+   { 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
+     0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
+     0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
+     0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae },
+   { 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
+     0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
+     0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
+     0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf }
+};
+
+
+// Load H[0..15] with the fixed start constants and clear ptr and bit_count.
+void bmw512_8way_init( bmw512_8way_context *ctx )
+{
+   ctx->H[ 0] = m512_const1_64( 0x8081828384858687 );
+   ctx->H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
+   ctx->H[ 2] = m512_const1_64( 0x9091929394959697 );
+   ctx->H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
+   ctx->H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
+   ctx->H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
+   ctx->H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
+   ctx->H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
+   ctx->H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
+   ctx->H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
+   ctx->H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
+   ctx->H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
+   ctx->H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
+   ctx->H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
+   ctx->H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
+   ctx->H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
+   ctx->ptr = 0;
+   ctx->bit_count = 0;
+}
+
+void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
+                                size_t len )
+{  // Buffer len bytes of data and compress each time the 128 byte buffer fills.
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf;
+   __m512i htmp[16];
+   __m512i *h1, *h2;
+   size_t ptr;
+   const int buf_size = 128;  // bytes of one lane, compatible with len
+   // NOTE(review): lengths are shifted right by 3 below, so len is presumably a multiple of 8.
+   ctx->bit_count += len << 3;
+   buf = ctx->buf;
+   ptr = ctx->ptr;
+   h1 = ctx->H;   // chaining value fed to the next compression
+   h2 = htmp;     // receives the result of the next compression
+   while ( len > 0 )
+   {
+      size_t clen;
+      clen = buf_size - ptr;   // room left in the buffer
+      if ( clen > len )
+         clen = len;
+      memcpy_512( buf + (ptr>>3), vdata, clen >> 3 );
+      vdata = vdata + (clen>>3);
+      len -= clen;
+      ptr += clen;
+      if ( ptr == buf_size )
+      {
+         __m512i *ht;
+         compress_big_8way( buf, h1, h2 );
+         ht = h1;   // swap so the fresh result is the input of the next block
+         h1 = h2;
+         h2 = ht;
+         ptr = 0;
+      }
+   }
+   ctx->ptr = ptr;
+   if ( h1 != ctx->H )   // latest result is in htmp, copy it back to the context
+        memcpy_512( ctx->H, h1, 16 );
+}
+
+void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
+{  // Pad the buffered data, run the final compressions, store 8 vectors to dst.
+   __m512i *buf;
+   __m512i h1[16], h2[16], *h;
+   size_t ptr, u, v;
+   const int buf_size = 128;  // bytes of one lane, compatible with len
+   // Write the 0x80 padding word right after the buffered data.
+   buf = ctx->buf;
+   ptr = ctx->ptr;
+   buf[ ptr>>3 ] = m512_const1_64( 0x80 );
+   ptr += 8;
+   h = ctx->H;
+   // No room left for the 8 byte bit count: zero-fill, compress, start a new block.
+   if (  ptr > (buf_size - 8) )
+   {
+      memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+      compress_big_8way( buf, h, h1 );
+      ptr = 0;
+      h = h1;
+   }
+   memset_zero_512( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
+   buf[ (buf_size - 8) >> 3 ] = _mm512_set1_epi64( ctx->bit_count );   // last word of the block
+   compress_big_8way( buf, h, h2 );
+   for ( u = 0; u < 16; u ++ )   // the result becomes the message of one more compression
+      buf[ u ] = h2[ u ];
+   compress_big_8way( buf, final_b8, h1 );   // final_b8 takes the place of the chaining value
+   for (u = 0, v = 8; u < 8; u ++, v ++)   // output is h1[8..15]
+      casti_m512i( dst, u ) = h1[ v ];
+}
+
+#endif // AVX512
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -26,6 +26,186 @@ static const uint64_t IV512[] =
 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
 };

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// 4 way 128 is handy to avoid reinterleaving in many algos.
+// If reinterleaving is necessary it may be more efficient to use
+// 2 way 256. The same transform code should work for both.
+
+static void transform_4way( cube_4way_context *sp )
+{
+    int r;
+    const int rounds = sp->rounds;
+    // Runs sp->rounds rounds over the state vectors sp->h[0..7], in place.
+    __m512i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
+
+    x0 = _mm512_load_si512( (__m512i*)sp->h     );
+    x1 = _mm512_load_si512( (__m512i*)sp->h + 1 );
+    x2 = _mm512_load_si512( (__m512i*)sp->h + 2 );
+    x3 = _mm512_load_si512( (__m512i*)sp->h + 3 );
+    x4 = _mm512_load_si512( (__m512i*)sp->h + 4 );
+    x5 = _mm512_load_si512( (__m512i*)sp->h + 5 );
+    x6 = _mm512_load_si512( (__m512i*)sp->h + 6 );
+    x7 = _mm512_load_si512( (__m512i*)sp->h + 7 );
+
+    for ( r = 0; r < rounds; ++r )
+    {
+        x4 = _mm512_add_epi32( x0, x4 );   // x4..x7 += x0..x3, 32-bit adds
+        x5 = _mm512_add_epi32( x1, x5 );
+        x6 = _mm512_add_epi32( x2, x6 );
+        x7 = _mm512_add_epi32( x3, x7 );
+        y0 = x0;   // saved because the rotate below also exchanges (x0,x1) with (x2,x3)
+        y1 = x1;
+        x0 = mm512_rol_32( x2, 7 );
+        x1 = mm512_rol_32( x3, 7 );
+        x2 = mm512_rol_32( y0, 7 );
+        x3 = mm512_rol_32( y1, 7 );
+        x0 = _mm512_xor_si512( x0, x4 );   // x0..x3 ^= x4..x7
+        x1 = _mm512_xor_si512( x1, x5 );
+        x2 = _mm512_xor_si512( x2, x6 );
+        x3 = _mm512_xor_si512( x3, x7 );
+        x4 = mm512_swap64_128( x4 );   // presumably swaps the 64-bit halves of each 128-bit lane
+        x5 = mm512_swap64_128( x5 );
+        x6 = mm512_swap64_128( x6 );
+        x7 = mm512_swap64_128( x7 );
+        x4 = _mm512_add_epi32( x0, x4 );   // second add/rotate/xor half of the round
+        x5 = _mm512_add_epi32( x1, x5 );
+        x6 = _mm512_add_epi32( x2, x6 );
+        x7 = _mm512_add_epi32( x3, x7 );
+        y0 = x0;   // this rotate exchanges x0 with x1 and x2 with x3
+        y1 = x2;
+        x0 = mm512_rol_32( x1, 11 );
+        x1 = mm512_rol_32( y0, 11 );
+        x2 = mm512_rol_32( x3, 11 );
+        x3 = mm512_rol_32( y1, 11 );
+        x0 = _mm512_xor_si512( x0, x4 );
+        x1 = _mm512_xor_si512( x1, x5 );
+        x2 = _mm512_xor_si512( x2, x6 );
+        x3 = _mm512_xor_si512( x3, x7 );
+        x4 = mm512_swap32_64( x4 );   // presumably swaps the 32-bit halves of each 64-bit element
+        x5 = mm512_swap32_64( x5 );
+        x6 = mm512_swap32_64( x6 );
+        x7 = mm512_swap32_64( x7 );
+    }
+
+    _mm512_store_si512( (__m512i*)sp->h,     x0 );
+    _mm512_store_si512( (__m512i*)sp->h + 1, x1 );
+    _mm512_store_si512( (__m512i*)sp->h + 2, x2 );
+    _mm512_store_si512( (__m512i*)sp->h + 3, x3 );
+    _mm512_store_si512( (__m512i*)sp->h + 4, x4 );
+    _mm512_store_si512( (__m512i*)sp->h + 5, x5 );
+    _mm512_store_si512( (__m512i*)sp->h + 6, x6 );
+    _mm512_store_si512( (__m512i*)sp->h + 7, x7 );
+}
+
+// Set up a 4-way CubeHash context.
+//   hashbitlen  512 selects IV512, any other value selects IV256
+//   rounds      number of rounds run by each transform_4way call
+//   blockbytes  input block size in bytes, stored in 16 byte units
+// Always returns 0.
+int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
+                    int blockbytes )
+{
+    __m512i *h = (__m512i*)sp->h;
+    __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
+                                                : (__m128i*)IV256 );
+    sp->hashlen   = hashbitlen/128;
+    sp->blocksize = blockbytes/16;
+    sp->rounds    = rounds;
+    sp->pos       = 0;
+
+    // One state vector per 128-bit IV word (m512_const1_128 presumably
+    // replicates it to every 128-bit lane). The stores are done once; a
+    // second, identical set of eight stores was redundant and is removed.
+    h[ 0] = m512_const1_128( iv[0] );
+    h[ 1] = m512_const1_128( iv[1] );
+    h[ 2] = m512_const1_128( iv[2] );
+    h[ 3] = m512_const1_128( iv[3] );
+    h[ 4] = m512_const1_128( iv[4] );
+    h[ 5] = m512_const1_128( iv[5] );
+    h[ 6] = m512_const1_128( iv[6] );
+    h[ 7] = m512_const1_128( iv[7] );
+
+    return 0;
+}
+
+int cube_4way_update( cube_4way_context *sp, const void *data, size_t size )
+{
+    const int len = size >> 4;   // number of input vectors consumed, one per 16 bytes of size
+    const __m512i *in = (__m512i*)data;
+    int i;
+    // XOR each input vector into the state and transform whenever a block is complete.
+    for ( i = 0; i < len; i++ )
+    {
+        sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )
+        {
+           transform_4way( sp );
+           sp->pos = 0;
+        }
+    }
+    return 0;   // always 0
+}
+
+int cube_4way_close( cube_4way_context *sp, void *output )
+{
+    __m512i *hash = (__m512i*)output;
+    int i;
+    // Padding: XOR a 0x80 constant into the next input position, then transform.
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
+                                 m512_const2_64( 0, 0x0000000000000080 ) );
+    transform_4way( sp );
+    // Finalization: XOR a constant into h[7], then run 10 more transforms.
+    sp->h[7] = _mm512_xor_si512( sp->h[7],
+                                 m512_const2_64( 0x0000000100000000, 0 ) );
+
+    for ( i = 0; i < 10; ++i ) 
+       transform_4way( sp );
+    // hashlen counts 128-bit units (hashbitlen/128); <<6 copies 64 bytes of state per unit.
+    memcpy( hash, sp->h, sp->hashlen<<6 );
+    return 0;
+}
+
+int cube_4way_update_close( cube_4way_context *sp, void *output,
+                               const void *data, size_t size )
+{
+    const int len = size >> 4;   // number of input vectors consumed, one per 16 bytes of size
+    const __m512i *in = (__m512i*)data;
+    __m512i *hash = (__m512i*)output;
+    int i;
+    // Same steps as cube_4way_update followed by cube_4way_close, in a single call.
+    for ( i = 0; i < len; i++ )
+    {
+        sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )
+        {
+           transform_4way( sp );
+           sp->pos = 0;
+        }
+    }
+
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
+                                    m512_const2_64( 0, 0x0000000000000080 ) );
+    transform_4way( sp );
+    // Finalization: XOR a constant into h[7], then run 10 more transforms.
+    sp->h[7] = _mm512_xor_si512( sp->h[7],
+                                    m512_const2_64( 0x0000000100000000, 0 ) );
+
+    for ( i = 0; i < 10; ++i )
+       transform_4way( sp );
+
+    memcpy( hash, sp->h, sp->hashlen<<6);   // 64 bytes of state per 128-bit unit of hashlen
+    return 0;
+}
+
+
+#endif // AVX512
+
+// 2 way 128 

 static void transform_2way( cube_2way_context *sp )
 {
@@ -91,7 +271,6 @@ static void transform_2way( cube_2way_context *sp )
    _mm256_store_si256( (__m256i*)sp->h + 5, x5 );
    _mm256_store_si256( (__m256i*)sp->h + 6, x6 );
    _mm256_store_si256( (__m256i*)sp->h + 7, x7 );
-
 }

 int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
@@ -132,9 +311,6 @@ int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
    const __m256i *in = (__m256i*)data;
    int i;

-    // It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
-    // Current usage sata is either 64 or 80 bytes.
-
    for ( i = 0; i < len; i++ )
    {
        sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -1,11 +1,38 @@
 #ifndef CUBE_HASH_2WAY_H__
-#define CUBE_HASH_2WAY_H__
-
-#if defined(__AVX2__)
+#define CUBE_HASH_2WAY_H__ 1

 #include <stdint.h>
 #include "simd-utils.h"

+#if defined(__AVX2__)
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+struct _cube_4way_context
+{
+    __m512i h[8];          // state, loaded and stored whole by transform_4way
+    int hashlen;           // hashbitlen/128, set by cube_4way_init
+    int rounds;            // rounds run by each transform_4way call
+    int blocksize;         // blockbytes/16, set by cube_4way_init
+    int pos;               // index into h where the next input vector is XORed
+} __attribute__ ((aligned (128)));
+
+typedef struct _cube_4way_context cube_4way_context;
+
+int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
+                       int blockbytes );
+// Reinitialize context with same parameters, much faster. NOTE(review): no definition is visible in the cube-hash-2way.c changes; confirm one exists.
+int cube_4way_reinit( cube_4way_context *sp );
+
+int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
+
+int cube_4way_close( cube_4way_context *sp, void *output );
+
+int cube_4way_update_close( cube_4way_context *sp, void *output,
+                            const void *data, size_t size );
+
+#endif
+
 // 2x128, 2 way parallel SSE2

 struct _cube_2way_context
@@ -15,7 +42,7 @@ struct _cube_2way_context
    int rounds;
    int blocksize;         // __m128i
    int pos;               // number of __m128i read into x from current block
-} __attribute__ ((aligned (64)));
+} __attribute__ ((aligned (128)));

 typedef struct _cube_2way_context cube_2way_context;

--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -92,6 +92,38 @@ extern "C"{

 #endif

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define Sb_8W(x0, x1, x2, x3, c) \
+do { /* updates x0..x3 in place from constant c; writes a variable named tmp that the caller must declare */ \
+   __m512i cc = _mm512_set1_epi64( c ); \
+    x3 = mm512_not( x3 ); \
+    x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \
+    tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \
+    x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \
+    x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \
+    x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \
+    x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \
+    x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \
+    x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \
+    x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \
+    x2 = _mm512_xor_si512( x2, tmp ); \
+} while (0)
+
+#define Lb_8W(x0, x1, x2, x3, x4, x5, x6, x7) \
+do { /* XOR-only mixing: x4..x7 are updated from x0..x3, then x0..x3 from the new x4..x7 */ \
+    x4 = _mm512_xor_si512( x4, x1 ); \
+    x5 = _mm512_xor_si512( x5, x2 ); \
+    x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \
+    x7 = _mm512_xor_si512( x7, x0 ); \
+    x0 = _mm512_xor_si512( x0, x5 ); \
+    x1 = _mm512_xor_si512( x1, x6 ); \
+    x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \
+    x3 = _mm512_xor_si512( x3, x4 ); \
+} while (0)
+
+#endif
+
 #define Sb(x0, x1, x2, x3, c) \
 do { \
   __m256i cc = _mm256_set1_epi64x( c ); \
@@ -226,6 +258,48 @@ static const sph_u64 C[] = {
 			x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
 	} while (0)

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define S_8W(x0, x1, x2, x3, cb, r)   do { /* Sb_8W on the h halves with cb hi(r), then on the l halves with cb lo(r) */ \
+      Sb_8W(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
+      Sb_8W(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
+   } while (0)
+// L_8W: Lb_8W on the h halves, then on the l halves, of the eight named words.
+#define L_8W(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+      Lb_8W(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
+         x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
+      Lb_8W(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
+         x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
+   } while (0)
+
+// For v = x##h and v = x##l: v = ( ( v & c ) << n ) | ( ( v >> n ) & c ), logical shifts on 64-bit elements.
+#define Wz_8W(x, c, n) \
+do { \
+   __m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \
+   x ## h = _mm512_or_si512( _mm512_and_si512( _mm512_srli_epi64(x ## h, (n)), (c)), t ); \
+   t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \
+   x ## l = _mm512_or_si512( _mm512_and_si512( _mm512_srli_epi64(x ## l, (n)), (c)), t ); \
+} while (0)
+
+#define W80(x)   Wz_8W(x, m512_const1_64( 0x5555555555555555 ),  1 )   // exchange adjacent bits
+#define W81(x)   Wz_8W(x, m512_const1_64( 0x3333333333333333 ),  2 )   // exchange adjacent 2-bit groups
+#define W82(x)   Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ),  4 )   // exchange adjacent nibbles
+#define W83(x)   Wz_8W(x, m512_const1_64( 0x00FF00FF00FF00FF ),  8 )   // exchange adjacent bytes
+#define W84(x)   Wz_8W(x, m512_const1_64( 0x0000FFFF0000FFFF ), 16 )   // exchange adjacent 16-bit groups
+#define W85(x)   Wz_8W(x, m512_const1_64( 0x00000000FFFFFFFF ), 32 )   // exchange the 32-bit halves
+#define W86(x) \
+do { /* exchange x##h and x##l */ \
+   __m512i t = x ## h; \
+   x ## h = x ## l; \
+   x ## l = t; \
+} while (0)
+
+#define DECL_STATE_8W /* locals for the 16 state words (h and l halves of h0..h7) plus the tmp written by Sb_8W */ \
+   __m512i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
+   __m512i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
+   __m512i tmp;
+
+#endif

 #define Wz(x, c, n) \
 do { \
@@ -236,16 +310,6 @@ do { \
   x ## l = _mm256_or_si256( _mm256_and_si256((x ## l >> (n)), (c)), t ); \
 } while (0)

-
-/*
-#define Wz(x, c, n)   do { \
-		sph_u64 t = (x ## h & (c)) << (n); \
-		x ## h = ((x ## h >> (n)) & (c)) | t; \
-		t = (x ## l & (c)) << (n); \
-		x ## l = ((x ## l >> (n)) & (c)) | t; \
-	} while (0)
-*/
-
 #define W0(x)   Wz(x, m256_const1_64( 0x5555555555555555 ),  1 )
 #define W1(x)   Wz(x, m256_const1_64( 0x3333333333333333 ),  2 )
 #define W2(x)   Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ),  4 )
@@ -259,25 +323,12 @@ do { \
   x ## l = t; \
 } while (0)

-/*
-#define W0(x)   Wz(x, SPH_C64(0x5555555555555555),  1)
-#define W1(x)   Wz(x, SPH_C64(0x3333333333333333),  2)
-#define W2(x)   Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F),  4)
-#define W3(x)   Wz(x, SPH_C64(0x00FF00FF00FF00FF),  8)
-#define W4(x)   Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
-#define W5(x)   Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
-#define W6(x)   do { \
-		sph_u64 t = x ## h; \
-		x ## h = x ## l; \
-		x ## l = t; \
-	} while (0)
-*/
-
 #define DECL_STATE \
 	__m256i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
 	__m256i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
 	__m256i tmp;

+
 #define READ_STATE(state)   do { \
 		h0h = (state)->H[ 0]; \
 		h0l = (state)->H[ 1]; \
@@ -316,6 +367,38 @@ do { \
 		(state)->H[15] = h7l; \
 	} while (0)

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define INPUT_BUF1_8W /* declares m0h..m3l from buf[0..7] and XORs them into h0..h3; needs buf and the state locals in scope */ \
+   __m512i m0h = buf[0]; \
+   __m512i m0l = buf[1]; \
+   __m512i m1h = buf[2]; \
+   __m512i m1l = buf[3]; \
+   __m512i m2h = buf[4]; \
+   __m512i m2l = buf[5]; \
+   __m512i m3h = buf[6]; \
+   __m512i m3l = buf[7]; \
+   h0h = _mm512_xor_si512( h0h, m0h ); \
+   h0l = _mm512_xor_si512( h0l, m0l ); \
+   h1h = _mm512_xor_si512( h1h, m1h ); \
+   h1l = _mm512_xor_si512( h1l, m1l ); \
+   h2h = _mm512_xor_si512( h2h, m2h ); \
+   h2l = _mm512_xor_si512( h2l, m2l ); \
+   h3h = _mm512_xor_si512( h3h, m3h ); \
+   h3l = _mm512_xor_si512( h3l, m3l );
+
+#define INPUT_BUF2_8W /* XORs the m0h..m3l declared by INPUT_BUF1_8W into h4..h7 */ \
+   h4h = _mm512_xor_si512( h4h, m0h ); \
+   h4l = _mm512_xor_si512( h4l, m0l ); \
+   h5h = _mm512_xor_si512( h5h, m1h ); \
+   h5l = _mm512_xor_si512( h5l, m1l ); \
+   h6h = _mm512_xor_si512( h6h, m2h ); \
+   h6l = _mm512_xor_si512( h6l, m2l ); \
+   h7h = _mm512_xor_si512( h7h, m3h ); \
+   h7l = _mm512_xor_si512( h7l, m3l );
+
+#endif
+
 #define INPUT_BUF1 \
 	__m256i m0h = buf[0]; \
 	__m256i m0l = buf[1]; \
@@ -344,6 +427,7 @@ do { \
   h7h = _mm256_xor_si256( h7h, m3h ); \
   h7l = _mm256_xor_si256( h7l, m3l ); \

+
 static const sph_u64 IV256[] = {
 	C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
 	C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
@@ -370,6 +454,22 @@ static const sph_u64 IV512[] = {
 #else


+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define SL_8W(ro)   SLu_8W(r + ro, ro)   // r is taken from the scope where SL_8W is expanded
+// SLu_8W: S_8W on the even then the odd words, L_8W across both, then W8<ro> on the odd words.
+#define SLu_8W(r, ro)   do { \
+      S_8W(h0, h2, h4, h6, Ceven_, r); \
+      S_8W(h1, h3, h5, h7, Codd_, r); \
+      L_8W(h0, h2, h4, h6, h1, h3, h5, h7); \
+      W8 ## ro(h1); \
+      W8 ## ro(h3); \
+      W8 ## ro(h5); \
+      W8 ## ro(h7); \
+   } while (0)
+
 #endif

 #define SL(ro)   SLu(r + ro, ro)
@@ -393,6 +493,23 @@ static const sph_u64 IV512[] = {
 * loop.
 */

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define E8_8W   do { /* 42 rounds: 6 loop iterations of 7 rounds, round index r + ro with ro = 0..6 */ \
+      unsigned r; \
+      for (r = 0; r < 42; r += 7) { \
+         SL_8W(0); \
+         SL_8W(1); \
+         SL_8W(2); \
+         SL_8W(3); \
+         SL_8W(4); \
+         SL_8W(5); \
+         SL_8W(6); \
+      } \
+   } while (0)
+
+#endif
+
 #define E8   do { \
 		unsigned r; \
 		for (r = 0; r < 42; r += 7) { \
@@ -419,51 +536,100 @@ static const sph_u64 IV512[] = {
 * On a "true 64-bit" architecture, we can unroll at will.
 */

-#define E8   do { \
-		SLu( 0, 0); \
-		SLu( 1, 1); \
-		SLu( 2, 2); \
-		SLu( 3, 3); \
-		SLu( 4, 4); \
-		SLu( 5, 5); \
-		SLu( 6, 6); \
-		SLu( 7, 0); \
-		SLu( 8, 1); \
-		SLu( 9, 2); \
-		SLu(10, 3); \
-		SLu(11, 4); \
-		SLu(12, 5); \
-		SLu(13, 6); \
-		SLu(14, 0); \
-		SLu(15, 1); \
-		SLu(16, 2); \
-		SLu(17, 3); \
-		SLu(18, 4); \
-		SLu(19, 5); \
-		SLu(20, 6); \
-		SLu(21, 0); \
-		SLu(22, 1); \
-		SLu(23, 2); \
-		SLu(24, 3); \
-		SLu(25, 4); \
-		SLu(26, 5); \
-		SLu(27, 6); \
-		SLu(28, 0); \
-		SLu(29, 1); \
-		SLu(30, 2); \
-		SLu(31, 3); \
-		SLu(32, 4); \
-		SLu(33, 5); \
-		SLu(34, 6); \
-		SLu(35, 0); \
-		SLu(36, 1); \
-		SLu(37, 2); \
-		SLu(38, 3); \
-		SLu(39, 4); \
-		SLu(40, 5); \
-		SLu(41, 6); \
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define E8_8W   do { /* rounds 0..41 fully unrolled; the second argument cycles 0..6 */ \
+		SLu_8W( 0, 0); \
+		SLu_8W( 1, 1); \
+		SLu_8W( 2, 2); \
+		SLu_8W( 3, 3); \
+		SLu_8W( 4, 4); \
+		SLu_8W( 5, 5); \
+		SLu_8W( 6, 6); \
+		SLu_8W( 7, 0); \
+		SLu_8W( 8, 1); \
+		SLu_8W( 9, 2); \
+		SLu_8W(10, 3); \
+		SLu_8W(11, 4); \
+		SLu_8W(12, 5); \
+		SLu_8W(13, 6); \
+		SLu_8W(14, 0); \
+		SLu_8W(15, 1); \
+		SLu_8W(16, 2); \
+		SLu_8W(17, 3); \
+		SLu_8W(18, 4); \
+		SLu_8W(19, 5); \
+		SLu_8W(20, 6); \
+		SLu_8W(21, 0); \
+		SLu_8W(22, 1); \
+		SLu_8W(23, 2); \
+		SLu_8W(24, 3); \
+		SLu_8W(25, 4); \
+		SLu_8W(26, 5); \
+		SLu_8W(27, 6); \
+		SLu_8W(28, 0); \
+		SLu_8W(29, 1); \
+		SLu_8W(30, 2); \
+		SLu_8W(31, 3); \
+		SLu_8W(32, 4); \
+		SLu_8W(33, 5); \
+		SLu_8W(34, 6); \
+		SLu_8W(35, 0); \
+		SLu_8W(36, 1); \
+		SLu_8W(37, 2); \
+		SLu_8W(38, 3); \
+		SLu_8W(39, 4); \
+		SLu_8W(40, 5); \
+		SLu_8W(41, 6); \
 	} while (0)

+#endif  // AVX512
+
+#define E8   do { \
+      SLu( 0, 0); \
+      SLu( 1, 1); \
+      SLu( 2, 2); \
+      SLu( 3, 3); \
+      SLu( 4, 4); \
+      SLu( 5, 5); \
+      SLu( 6, 6); \
+      SLu( 7, 0); \
+      SLu( 8, 1); \
+      SLu( 9, 2); \
+      SLu(10, 3); \
+      SLu(11, 4); \
+      SLu(12, 5); \
+      SLu(13, 6); \
+      SLu(14, 0); \
+      SLu(15, 1); \
+      SLu(16, 2); \
+      SLu(17, 3); \
+      SLu(18, 4); \
+      SLu(19, 5); \
+      SLu(20, 6); \
+      SLu(21, 0); \
+      SLu(22, 1); \
+      SLu(23, 2); \
+      SLu(24, 3); \
+      SLu(25, 4); \
+      SLu(26, 5); \
+      SLu(27, 6); \
+      SLu(28, 0); \
+      SLu(29, 1); \
+      SLu(30, 2); \
+      SLu(31, 3); \
+      SLu(32, 4); \
+      SLu(33, 5); \
+      SLu(34, 6); \
+      SLu(35, 0); \
+      SLu(36, 1); \
+      SLu(37, 2); \
+      SLu(38, 3); \
+      SLu(39, 4); \
+      SLu(40, 5); \
+      SLu(41, 6); \
+   } while (0)
+
 #else


@@ -471,6 +637,158 @@ static const sph_u64 IV512[] = {

 #endif

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+void jh256_8way_init( jh_8way_context *sc )
+{  // Load H[0..15] with the JH-256 start values and reset the counters.
+    // bswapped IV256
+    sc->H[ 0] = m512_const1_64( 0xebd3202c41a398eb );
+    sc->H[ 1] = m512_const1_64( 0xc145b29c7bbecd92 );
+    sc->H[ 2] = m512_const1_64( 0xfac7d4609151931c );
+    sc->H[ 3] = m512_const1_64( 0x038a507ed6820026 );
+    sc->H[ 4] = m512_const1_64( 0x45b92677269e23a4 );
+    sc->H[ 5] = m512_const1_64( 0x77941ad4481afbe0 );
+    sc->H[ 6] = m512_const1_64( 0x7a176b0226abb5cd );
+    sc->H[ 7] = m512_const1_64( 0xa82fff0f4224f056 );
+    sc->H[ 8] = m512_const1_64( 0x754d2e7f8996a371 );
+    sc->H[ 9] = m512_const1_64( 0x62e27df70849141d );
+    sc->H[10] = m512_const1_64( 0x948f2476f7957627 );
+    sc->H[11] = m512_const1_64( 0x6c29804757b6d587 );
+    sc->H[12] = m512_const1_64( 0x6c0d8eac2d275e5c );
+    sc->H[13] = m512_const1_64( 0x0f7a0557c6508451 );
+    sc->H[14] = m512_const1_64( 0xea12247067d3e47b );
+    sc->H[15] = m512_const1_64( 0x69d71cd313abe389 );
+    sc->ptr = 0;           // nothing buffered
+    sc->block_count = 0;   // no blocks processed
+}
+
+void jh512_8way_init( jh_8way_context *sc )
+{  // Load H[0..15] with the JH-512 start values and reset the counters.
+    // bswapped IV512
+    sc->H[ 0] = m512_const1_64( 0x17aa003e964bd16f );
+    sc->H[ 1] = m512_const1_64( 0x43d5157a052e6a63 );
+    sc->H[ 2] = m512_const1_64( 0x0bef970c8d5e228a );
+    sc->H[ 3] = m512_const1_64( 0x61c3b3f2591234e9 );
+    sc->H[ 4] = m512_const1_64( 0x1e806f53c1a01d89 );
+    sc->H[ 5] = m512_const1_64( 0x806d2bea6b05a92a );
+    sc->H[ 6] = m512_const1_64( 0xa6ba7520dbcc8e58 );
+    sc->H[ 7] = m512_const1_64( 0xf73bf8ba763a0fa9 );
+    sc->H[ 8] = m512_const1_64( 0x694ae34105e66901 );
+    sc->H[ 9] = m512_const1_64( 0x5ae66f2e8e8ab546 );
+    sc->H[10] = m512_const1_64( 0x243c84c1d0a74710 );
+    sc->H[11] = m512_const1_64( 0x99c15a2db1716e3b );
+    sc->H[12] = m512_const1_64( 0x56f8b19decf657cf );
+    sc->H[13] = m512_const1_64( 0x56b116577c8806a7 );
+    sc->H[14] = m512_const1_64( 0xfb1785e6dffcc2e3 );
+    sc->H[15] = m512_const1_64( 0x4bdd8ccc78465a54 );
+    sc->ptr = 0;           // nothing buffered
+    sc->block_count = 0;   // no blocks processed
+}
+
+static void
+jh_8way_core( jh_8way_context *sc, const void *data, size_t len )
+{
+    __m512i *buf;
+    __m512i *vdata = (__m512i*)data;
+   const int buf_size = 64;   // bytes of one lane, held in 8 __m512i (see ptr>>3 below)
+   size_t ptr;
+   DECL_STATE_8W
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+   // Not enough data to complete a block: buffer it and return.
+   if ( len < (buf_size - ptr) )
+   {
+       memcpy_512( buf + (ptr>>3), vdata, len>>3 );
+       ptr += len;
+       sc->ptr = ptr;
+       return;
+   }
+   // At least one block completes: work on local copies of the state words.
+   READ_STATE(sc);
+   while ( len > 0 )
+   {
+       size_t clen;
+       clen = buf_size - ptr;   // room left in the buffer
+       if ( clen > len )
+          clen = len;
+
+       memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
+       ptr += clen;
+       vdata += (clen>>3);
+       len -= clen;
+       if ( ptr == buf_size )
+       {
+          INPUT_BUF1_8W;   // XOR the block into h0..h3
+          E8_8W;
+          INPUT_BUF2_8W;   // XOR the same block into h4..h7
+          sc->block_count ++;
+          ptr = 0;
+       }
+   }
+   WRITE_STATE(sc);
+   sc->ptr = ptr;
+}
+
+static void
+jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
+               size_t out_size_w32, const void *iv )
+{
+   __m512i buf[16*4];
+   __m512i *dst512 = (__m512i*)dst;
+   size_t numz, u;
+   sph_u64 l0, l1, l0e, l1e;
+   // Padding: a 0x80 word, zeros, then the 128-bit message bit length.
+   buf[0] = m512_const1_64( 0x80ULL );
+
+   if ( sc->ptr == 0 )
+       numz = 48;
+   else
+       numz = 112 - sc->ptr;
+
+   memset_zero_512( buf+1, (numz>>3) - 1 );
+   // Bit length = block_count * 512 + ptr * 8, big endian: l1 is the high word, l0 the low word.
+   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
+   l1 = SPH_T64(sc->block_count >> 55);
+   sph_enc64be( &l0e, l0 );
+   sph_enc64be( &l1e, l1 );
+   *(buf + (numz>>3)    ) = _mm512_set1_epi64( l1e );
+   *(buf + (numz>>3) + 1) = _mm512_set1_epi64( l0e );
+   // ptr + numz + 16 is 64 when ptr is 0 and 128 otherwise, i.e. one or two whole blocks.
+   jh_8way_core( sc, buf, numz + 16 );
+   // Output is H[8..15]; ub, n, out_size_w32 and iv are not used in this function.
+   for ( u=0; u < 8; u++ )
+       buf[u] = sc->H[u+8];
+
+    memcpy_512( dst512, buf, 8 );
+}
+
+void
+jh256_8way_update(void *cc, const void *data, size_t len)
+{  // cc is passed to jh_8way_core as its jh_8way_context pointer
+   jh_8way_core(cc, data, len);
+}
+
+void
+jh256_8way_close(void *cc, void *dst)
+{  // the 8 and IV256 arguments are not used by jh_8way_close, which always stores H[8..15]
+   jh_8way_close(cc, 0, 0, dst, 8, IV256);
+}
+
+void
+jh512_8way_update(void *cc, const void *data, size_t len)
+{  // cc is passed to jh_8way_core as its jh_8way_context pointer
+   jh_8way_core(cc, data, len);
+}
+
+void
+jh512_8way_close(void *cc, void *dst)
+{  // the 16 and IV512 arguments are not used by jh_8way_close, which always stores H[8..15]
+   jh_8way_close(cc, 0, 0, dst, 16, IV512);
+}
+
+#endif
+
 void jh256_4way_init( jh_4way_context *sc )
 {
    // bswapped IV256
@@ -595,16 +913,8 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
    memcpy_256( dst256, buf, 8 );
 }

-/*
 void
-jh256_4way_init(void *cc)
-{
-	jhs_4way_init(cc, IV256);
-}
-*/
-
-void
-jh256_4way(void *cc, const void *data, size_t len)
+jh256_4way_update(void *cc, const void *data, size_t len)
 {
 	jh_4way_core(cc, data, len);
 }
@@ -615,16 +925,8 @@ jh256_4way_close(void *cc, void *dst)
 	jh_4way_close(cc, 0, 0, dst, 8, IV256);
 }

-/*
 void
-jh512_4way_init(void *cc)
-{
-	jhb_4way_init(cc, IV512);
-}
-*/
-
-void
-jh512_4way(void *cc, const void *data, size_t len)
+jh512_4way_update(void *cc, const void *data, size_t len)
 {
 	jh_4way_core(cc, data, len);
 }
@@ -635,6 +937,7 @@ jh512_4way_close(void *cc, void *dst)
 	jh_4way_close(cc, 0, 0, dst, 16, IV512);
 }

+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -60,20 +60,41 @@ extern "C"{
 * can be cloned by copying the context (e.g. with a simple
 * <code>memcpy()</code>).
 */
+
+ 
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 typedef struct {
-    __m256i buf[8] __attribute__ ((aligned (64)));
+    __m512i buf[8];          // input buffer, 64 bytes per lane
+    __m512i H[16];           // state words, read and written by jh_8way_core
+    size_t ptr;              // bytes currently buffered per lane
+    uint64_t block_count;    // incremented once per processed block
+} jh_8way_context __attribute__ ((aligned (128)));
+
+typedef jh_8way_context jh256_8way_context;
+
+typedef jh_8way_context jh512_8way_context;
+
+void jh256_8way_init( jh_8way_context *sc);
+
+void jh256_8way_update(void *cc, const void *data, size_t len);
+
+void jh256_8way_close(void *cc, void *dst);
+
+void jh512_8way_init( jh_8way_context *sc );
+
+void jh512_8way_update(void *cc, const void *data, size_t len);
+
+void jh512_8way_close(void *cc, void *dst);
+
+#endif
+
+typedef struct {
+    __m256i buf[8];
    __m256i H[16];
    size_t ptr;
    uint64_t block_count;
-/*
-	unsigned char buf[64]; 
-	size_t ptr;
-	union {
-		sph_u64 wide[16];
-	} H;
-	sph_u64 block_count;
-*/
-} jh_4way_context;
+} jh_4way_context __attribute__ ((aligned (128)));

 typedef jh_4way_context jh256_4way_context;

@@ -81,13 +102,15 @@ typedef jh_4way_context jh512_4way_context;

 void jh256_4way_init( jh_4way_context *sc);

-void jh256_4way(void *cc, const void *data, size_t len);
+void jh256_4way_update(void *cc, const void *data, size_t len);
+#define jh256_4way jh256_4way_update

 void jh256_4way_close(void *cc, void *dst);

 void jh512_4way_init( jh_4way_context *sc );

-void jh512_4way(void *cc, const void *data, size_t len);
+void jh512_4way_update(void *cc, const void *data, size_t len);
+#define jh512_4way jh512_4way_update

 void jh512_4way_close(void *cc, void *dst);

@@ -95,6 +118,6 @@ void jh512_4way_close(void *cc, void *dst);
 }
 #endif

-#endif
+#endif // AVX2

 #endif
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -1,18 +1,68 @@
 #include "keccak-gate.h"
-
-#ifdef KECCAK_4WAY
-
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
 #include "sph_keccak.h"
 #include "keccak-hash-4way.h"

+#if defined(KECCAK_8WAY)
+
+void keccakhash_8way(void *state, const void *input)
+{
+    keccak256_8way_context ctx;
+    keccak256_8way_init( &ctx );
+    keccak256_8way_update( &ctx, input, 80 );
+    keccak256_8way_close( &ctx, state );
+}
+
+int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash[16*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[49]);   // 3*16+1
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;  
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   do {
+       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+
+      keccakhash_8way( hash, vdata );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( hash7[ lane<<1 ] < Htarg ) 
+      {
+          extr_lane_8x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+      }
+      n += 8;
+
+   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#elif defined(KECCAK_4WAY)
+
 void keccakhash_4way(void *state, const void *input)
 {
    keccak256_4way_context ctx;
    keccak256_4way_init( &ctx );
-    keccak256_4way( &ctx, input, 80 );
+    keccak256_4way_update( &ctx, input, 80 );
    keccak256_4way_close( &ctx, state );
 }

@@ -28,8 +78,8 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-//   const uint32_t Htarg = ptarget[7];
-    int thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do {
@@ -39,7 +89,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
      keccakhash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 )
+      if ( hash7[ lane<<1 ] < Htarg )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -3,30 +3,36 @@

 bool register_keccak_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  opt_target_factor = 128.0;
-#if defined (KECCAK_4WAY)
+#if defined (KECCAK_8WAY)
+  gate->scanhash  = (void*)&scanhash_keccak_8way;
+  gate->hash      = (void*)&keccakhash_8way;
+#elif defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
 #else
-  gate->scanhash        = (void*)&scanhash_keccak;
-  gate->hash            = (void*)&keccakhash;
+  gate->scanhash  = (void*)&scanhash_keccak;
+  gate->hash      = (void*)&keccakhash;
 #endif
  return true;
 };

 bool register_keccakc_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
  gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
  opt_target_factor = 256.0;
-#if defined (KECCAK_4WAY)
+#if defined (KECCAK_8WAY)
+  gate->scanhash  = (void*)&scanhash_keccak_8way;
+  gate->hash      = (void*)&keccakhash_8way;
+#elif defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
 #else
-  gate->scanhash        = (void*)&scanhash_keccak;
-  gate->hash            = (void*)&keccakhash;
+  gate->scanhash  = (void*)&scanhash_keccak;
+  gate->hash      = (void*)&keccakhash;
 #endif
  return true;
 };
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -1,23 +1,33 @@
 #ifndef KECCAK_GATE_H__
-#define KECCAK_GATE_H__
+#define KECCAK_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
-  #define KECCAK_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define KECCAK_8WAY 1
+#elif defined(__AVX2__)
+  #define KECCAK_4WAY 1
 #endif

-#if defined(KECCAK_4WAY)
+#if defined(KECCAK_8WAY)
+
+void keccakhash_8way( void *state, const void *input );
+int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(KECCAK_4WAY)

 void keccakhash_4way( void *state, const void *input );
 int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

-#endif
+#else

 void keccakhash( void *state, const void *input );
 int scanhash_keccak( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );

 #endif
+
+#endif
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -1,23 +1,24 @@
 #include <stddef.h>
+#include <stdint.h>
 #include "keccak-hash-4way.h"

-#if defined(__AVX2__)
-
-static const sph_u64 RC[] = {
-        SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
-        SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
-        SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
-        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
-        SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
-        SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
-        SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
-        SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
-        SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
-        SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
-        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
-        SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
+static const uint64_t RC[] = {
+        0x0000000000000001, 0x0000000000008082,
+        0x800000000000808A, 0x8000000080008000,
+        0x000000000000808B, 0x0000000080000001,
+        0x8000000080008081, 0x8000000000008009,
+        0x000000000000008A, 0x0000000000000088,
+        0x0000000080008009, 0x000000008000000A,
+        0x000000008000808B, 0x800000000000008B,
+        0x8000000000008089, 0x8000000000008003,
+        0x8000000000008002, 0x8000000000000080,
+        0x000000000000800A, 0x800000008000000A,
+        0x8000000080008081, 0x8000000000008080,
+        0x0000000080000001, 0x8000000080008008
 };

+// generic macros
+
 #define a00   (kc->w[ 0])
 #define a10   (kc->w[ 1])
 #define a20   (kc->w[ 2])
@@ -48,6 +49,197 @@ static const sph_u64 RC[] = {
 #define READ_STATE(sc)
 #define WRITE_STATE(sc)

+#define MOV64(d, s)      (d = s)
+#define XOR64_IOTA       XOR64
+
+#define LPAR   (
+#define RPAR   )
+
+#define DO(x)   x
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define INPUT_BUF(size)   do { \
+    size_t j; \
+    for (j = 0; j < (size>>3); j++ ) \
+        kc->w[j ] = _mm512_xor_si512( kc->w[j], buf[j] ); \
+} while (0)
+
+// Targeted macros; keccak-macros.c is included once for each target.
+
+#define DECL64(x)        __m512i x
+#define XOR64(d, a, b)   (d = _mm512_xor_si512(a,b))
+#define AND64(d, a, b)   (d = _mm512_and_si512(a,b))
+#define OR64(d, a, b)    (d = _mm512_or_si512(a,b))
+#define NOT64(d, s)      (d = _mm512_xor_si512(s,m512_neg1))
+#define ROL64(d, v, n)   (d = mm512_rol_64(v, n))
+
+#include "keccak-macros.c"
+
+#define KECCAK_F_1600   DO(KECCAK_F_1600_512)
+
+#define KECCAK_F_1600_512   do { \
+    int j; \
+    for (j = 0; j < 24; j += 8) \
+    { \
+       KF_ELT( 0,  1, _mm512_set1_epi64( RC[j + 0] ) ); \
+       KF_ELT( 1,  2, _mm512_set1_epi64( RC[j + 1] ) ); \
+       KF_ELT( 2,  3, _mm512_set1_epi64( RC[j + 2] ) ); \
+       KF_ELT( 3,  4, _mm512_set1_epi64( RC[j + 3] ) ); \
+       KF_ELT( 4,  5, _mm512_set1_epi64( RC[j + 4] ) ); \
+       KF_ELT( 5,  6, _mm512_set1_epi64( RC[j + 5] ) ); \
+       KF_ELT( 6,  7, _mm512_set1_epi64( RC[j + 6] ) ); \
+       KF_ELT( 7,  8, _mm512_set1_epi64( RC[j + 7] ) ); \
+       P8_TO_P0; \
+    } \
+} while (0)
+
+static void keccak64_8way_init( keccak64_ctx_m512i *kc, unsigned out_size )
+{
+   __m512i zero = m512_zero;
+   __m512i neg1 = m512_neg1;
+
+   // Initialization for the "lane complement".
+   kc->w[ 0] = zero;   kc->w[ 1] = neg1;
+   kc->w[ 2] = neg1;   kc->w[ 3] = zero;
+   kc->w[ 4] = zero;   kc->w[ 5] = zero;
+   kc->w[ 6] = zero;   kc->w[ 7] = zero;
+   kc->w[ 8] = neg1;   kc->w[ 9] = zero;
+   kc->w[10] = zero;   kc->w[11] = zero;
+   kc->w[12] = neg1;   kc->w[13] = zero;
+   kc->w[14] = zero;   kc->w[15] = zero;
+   kc->w[16] = zero;   kc->w[17] = neg1;
+   kc->w[18] = zero;   kc->w[19] = zero;
+   kc->w[20] = neg1;   kc->w[21] = zero;
+   kc->w[22] = zero;   kc->w[23] = zero;
+   kc->w[24] = zero;   kc->ptr = 0;
+   kc->lim = 200 - (out_size >> 2);
+}
+
+static void
+keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
+               size_t lim )
+{
+    __m512i *buf;
+    __m512i *vdata = (__m512i*)data;
+    size_t ptr;
+    DECL_STATE
+
+    buf = kc->buf;
+    ptr = kc->ptr;
+
+    if ( len < (lim - ptr) )
+    {
+        memcpy_512( buf + (ptr>>3), vdata, len>>3 );
+        kc->ptr = ptr + len;
+        return;
+    }
+    READ_STATE( kc );
+    while ( len > 0 )
+    {
+        size_t clen;
+
+        clen = (lim - ptr);
+        if ( clen > len )
+             clen = len;
+        memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
+        ptr += clen;
+        vdata = vdata + (clen>>3);
+        len -= clen;
+        if ( ptr == lim )
+        {
+            INPUT_BUF( lim );
+            KECCAK_F_1600;
+            ptr = 0;
+        }
+    }
+    WRITE_STATE( kc );
+    kc->ptr = ptr;
+}
+
+static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
+                                 size_t byte_len, size_t lim )
+{
+    unsigned eb;
+    union {
+       __m512i tmp[lim + 1];
+       sph_u64 dummy;   /* for alignment */
+    } u;
+    size_t j;
+    size_t m512_len = byte_len >> 3;
+
+    eb = 0x100  >> 8;
+    if ( kc->ptr == (lim - 8) )
+    {
+        const uint64_t t = eb | 0x8000000000000000;
+        u.tmp[0] = m512_const1_64( t );
+        j = 8;
+    }
+    else
+    {
+        j = lim - kc->ptr;
+        u.tmp[0] = m512_const1_64( eb );
+        memset_zero_512( u.tmp + 1, (j>>3) - 2 );
+        u.tmp[ (j>>3) - 1] = m512_const1_64( 0x8000000000000000 );
+    }
+    keccak64_8way_core( kc, u.tmp, j, lim );
+    /* Finalize the "lane complement" */
+    NOT64( kc->w[ 1], kc->w[ 1] );
+    NOT64( kc->w[ 2], kc->w[ 2] );
+    NOT64( kc->w[ 8], kc->w[ 8] );
+    NOT64( kc->w[12], kc->w[12] );
+    NOT64( kc->w[17], kc->w[17] );
+    NOT64( kc->w[20], kc->w[20] );
+    memcpy_512( dst, kc->w, m512_len );
+}
+
+void keccak256_8way_init( void *kc )
+{
+   keccak64_8way_init( kc, 256 );
+}
+
+void
+keccak256_8way_update(void *cc, const void *data, size_t len)
+{
+    keccak64_8way_core(cc, data, len, 136);
+}
+
+void
+keccak256_8way_close(void *cc, void *dst)
+{
+    keccak64_8way_close(cc, dst, 32, 136);
+}
+
+void keccak512_8way_init( void *kc )
+{
+   keccak64_8way_init( kc, 512 );
+}
+
+void
+keccak512_8way_update(void *cc, const void *data, size_t len)
+{
+        keccak64_8way_core(cc, data, len, 72);
+}
+
+void
+keccak512_8way_close(void *cc, void *dst)
+{
+        keccak64_8way_close(cc, dst, 64, 72);
+}
+
+#undef INPUT_BUF
+#undef DECL64
+#undef XOR64
+#undef AND64
+#undef OR64
+#undef NOT64
+#undef ROL64
+#undef KECCAK_F_1600
+
+#endif  // AVX512
+
+#if defined(__AVX2__)
+
 #define INPUT_BUF(size)   do { \
    size_t j; \
    for (j = 0; j < (size>>3); j++ ) \
@@ -55,314 +247,28 @@ static const sph_u64 RC[] = {
 } while (0)

 #define DECL64(x)        __m256i x
-#define MOV64(d, s)      (d = s)
 #define XOR64(d, a, b)   (d = _mm256_xor_si256(a,b))
 #define AND64(d, a, b)   (d = _mm256_and_si256(a,b))
 #define OR64(d, a, b)    (d = _mm256_or_si256(a,b))
 #define NOT64(d, s)      (d = _mm256_xor_si256(s,m256_neg1))
 #define ROL64(d, v, n)   (d = mm256_rol_64(v, n))
-#define XOR64_IOTA       XOR64

-#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
-                DECL64(tt0); \
-                DECL64(tt1); \
-                DECL64(tt2); \
-                DECL64(tt3); \
-                XOR64(tt0, d0, d1); \
-                XOR64(tt1, d2, d3); \
-                XOR64(tt0, tt0, d4); \
-                XOR64(tt0, tt0, tt1); \
-                ROL64(tt0, tt0, 1); \
-                XOR64(tt2, c0, c1); \
-                XOR64(tt3, c2, c3); \
-                XOR64(tt0, tt0, c4); \
-                XOR64(tt2, tt2, tt3); \
-                XOR64(t, tt0, tt2); \
-        } while (0)
+#include "keccak-macros.c"

-#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
-        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
-        b40, b41, b42, b43, b44) \
-        do { \
-                DECL64(t0); \
-                DECL64(t1); \
-                DECL64(t2); \
-                DECL64(t3); \
-                DECL64(t4); \
-                TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
-                TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
-                TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
-                TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
-                TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
-                XOR64(b00, b00, t0); \
-                XOR64(b01, b01, t0); \
-                XOR64(b02, b02, t0); \
-                XOR64(b03, b03, t0); \
-                XOR64(b04, b04, t0); \
-                XOR64(b10, b10, t1); \
-                XOR64(b11, b11, t1); \
-                XOR64(b12, b12, t1); \
-                XOR64(b13, b13, t1); \
-                XOR64(b14, b14, t1); \
-                XOR64(b20, b20, t2); \
-                XOR64(b21, b21, t2); \
-                XOR64(b22, b22, t2); \
-                XOR64(b23, b23, t2); \
-                XOR64(b24, b24, t2); \
-                XOR64(b30, b30, t3); \
-                XOR64(b31, b31, t3); \
-                XOR64(b32, b32, t3); \
-                XOR64(b33, b33, t3); \
-                XOR64(b34, b34, t3); \
-                XOR64(b40, b40, t4); \
-                XOR64(b41, b41, t4); \
-                XOR64(b42, b42, t4); \
-                XOR64(b43, b43, t4); \
-                XOR64(b44, b44, t4); \
-        } while (0)
+#define KECCAK_F_1600   DO(KECCAK_F_1600_256)

-#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
-        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
-        b40, b41, b42, b43, b44) \
-        do { \
-                /* ROL64(b00, b00,  0); */ \
-                ROL64(b01, b01, 36); \
-                ROL64(b02, b02,  3); \
-                ROL64(b03, b03, 41); \
-                ROL64(b04, b04, 18); \
-                ROL64(b10, b10,  1); \
-                ROL64(b11, b11, 44); \
-                ROL64(b12, b12, 10); \
-                ROL64(b13, b13, 45); \
-                ROL64(b14, b14,  2); \
-                ROL64(b20, b20, 62); \
-                ROL64(b21, b21,  6); \
-                ROL64(b22, b22, 43); \
-                ROL64(b23, b23, 15); \
-                ROL64(b24, b24, 61); \
-                ROL64(b30, b30, 28); \
-                ROL64(b31, b31, 55); \
-                ROL64(b32, b32, 25); \
-                ROL64(b33, b33, 21); \
-                ROL64(b34, b34, 56); \
-                ROL64(b40, b40, 27); \
-                ROL64(b41, b41, 20); \
-                ROL64(b42, b42, 39); \
-                ROL64(b43, b43,  8); \
-                ROL64(b44, b44, 14); \
-        } while (0)
-
-/*
- * The KHI macro integrates the "lane complement" optimization. On input,
- * some words are complemented:
- *    a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
- * On output, the following words are complemented:
- *    a04 a10 a20 a22 a23 a31
- *
- * The (implicit) permutation and the theta expansion will bring back
- * the input mask for the next round.
- */
-
-#define KHI_XO(d, a, b, c)   do { \
-                DECL64(kt); \
-                OR64(kt, b, c); \
-                XOR64(d, a, kt); \
-        } while (0)
-
-#define KHI_XA(d, a, b, c)   do { \
-                DECL64(kt); \
-                AND64(kt, b, c); \
-                XOR64(d, a, kt); \
-        } while (0)
-
-#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
-        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
-        b40, b41, b42, b43, b44) \
-        do { \
-                DECL64(c0); \
-                DECL64(c1); \
-                DECL64(c2); \
-                DECL64(c3); \
-                DECL64(c4); \
-                DECL64(bnn); \
-                NOT64(bnn, b20); \
-                KHI_XO(c0, b00, b10, b20); \
-                KHI_XO(c1, b10, bnn, b30); \
-                KHI_XA(c2, b20, b30, b40); \
-                KHI_XO(c3, b30, b40, b00); \
-                KHI_XA(c4, b40, b00, b10); \
-                MOV64(b00, c0); \
-                MOV64(b10, c1); \
-                MOV64(b20, c2); \
-                MOV64(b30, c3); \
-                MOV64(b40, c4); \
-                NOT64(bnn, b41); \
-                KHI_XO(c0, b01, b11, b21); \
-                KHI_XA(c1, b11, b21, b31); \
-                KHI_XO(c2, b21, b31, bnn); \
-                KHI_XO(c3, b31, b41, b01); \
-                KHI_XA(c4, b41, b01, b11); \
-                MOV64(b01, c0); \
-                MOV64(b11, c1); \
-                MOV64(b21, c2); \
-                MOV64(b31, c3); \
-                MOV64(b41, c4); \
-                NOT64(bnn, b32); \
-                KHI_XO(c0, b02, b12, b22); \
-                KHI_XA(c1, b12, b22, b32); \
-                KHI_XA(c2, b22, bnn, b42); \
-                KHI_XO(c3, bnn, b42, b02); \
-                KHI_XA(c4, b42, b02, b12); \
-                MOV64(b02, c0); \
-                MOV64(b12, c1); \
-                MOV64(b22, c2); \
-                MOV64(b32, c3); \
-                MOV64(b42, c4); \
-                NOT64(bnn, b33); \
-                KHI_XA(c0, b03, b13, b23); \
-                KHI_XO(c1, b13, b23, b33); \
-                KHI_XO(c2, b23, bnn, b43); \
-                KHI_XA(c3, bnn, b43, b03); \
-                KHI_XO(c4, b43, b03, b13); \
-                MOV64(b03, c0); \
-                MOV64(b13, c1); \
-                MOV64(b23, c2); \
-                MOV64(b33, c3); \
-                MOV64(b43, c4); \
-                NOT64(bnn, b14); \
-                KHI_XA(c0, b04, bnn, b24); \
-                KHI_XO(c1, bnn, b24, b34); \
-                KHI_XA(c2, b24, b34, b44); \
-                KHI_XO(c3, b34, b44, b04); \
-                KHI_XA(c4, b44, b04, b14); \
-                MOV64(b04, c0); \
-                MOV64(b14, c1); \
-                MOV64(b24, c2); \
-                MOV64(b34, c3); \
-                MOV64(b44, c4); \
-        } while (0)
-
-#define IOTA(r)   XOR64_IOTA(a00, a00, r)
-
-#define P0    a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
-              a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
-#define P1    a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
-              a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
-#define P2    a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
-              a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
-#define P3    a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
-              a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
-#define P4    a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
-              a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
-#define P5    a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
-              a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
-#define P6    a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
-              a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
-#define P7    a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
-              a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
-#define P8    a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
-              a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
-#define P9    a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
-              a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
-#define P10   a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
-              a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
-#define P11   a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
-              a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
-#define P12   a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
-              a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
-#define P13   a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
-              a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
-#define P14   a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
-              a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
-#define P15   a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
-              a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
-#define P16   a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
-              a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
-#define P17   a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
-              a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
-#define P18   a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
-              a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
-#define P19   a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
-              a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
-#define P20   a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
-              a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
-#define P21   a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
-              a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
-#define P22   a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
-              a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
-#define P23   a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
-              a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
-
-#define P8_TO_P0   do { \
-                DECL64(t); \
-                MOV64(t, a01); \
-                MOV64(a01, a11); \
-                MOV64(a11, a43); \
-                MOV64(a43, t); \
-                MOV64(t, a02); \
-                MOV64(a02, a22); \
-                MOV64(a22, a31); \
-                MOV64(a31, t); \
-                MOV64(t, a03); \
-                MOV64(a03, a33); \
-                MOV64(a33, a24); \
-                MOV64(a24, t); \
-                MOV64(t, a04); \
-                MOV64(a04, a44); \
-                MOV64(a44, a12); \
-                MOV64(a12, t); \
-                MOV64(t, a10); \
-                MOV64(a10, a32); \
-                MOV64(a32, a13); \
-                MOV64(a13, t); \
-                MOV64(t, a14); \
-                MOV64(a14, a21); \
-                MOV64(a21, a20); \
-                MOV64(a20, t); \
-                MOV64(t, a23); \
-                MOV64(a23, a42); \
-                MOV64(a42, a40); \
-                MOV64(a40, t); \
-                MOV64(t, a30); \
-                MOV64(a30, a41); \
-                MOV64(a41, a34); \
-                MOV64(a34, t); \
-        } while (0)
-
-#define LPAR   (
-#define RPAR   )
-
-#define KF_ELT(r, s, k)   do { \
-                THETA LPAR P ## r RPAR; \
-                RHO LPAR P ## r RPAR; \
-                KHI LPAR P ## s RPAR; \
-                IOTA(k); \
-        } while (0)
-
-#define DO(x)   x
-
-#define KECCAK_F_1600   DO(KECCAK_F_1600_)
-
-#define KECCAK_F_1600_   do { \
+#define KECCAK_F_1600_256   do { \
    int j; \
    for (j = 0; j < 24; j += 8) \
    { \
-       KF_ELT( 0,  1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \
-                                       RC[j + 0], RC[j + 0])) ); \
-       KF_ELT( 1,  2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \
-                                       RC[j + 1], RC[j + 1])) ); \
-       KF_ELT( 2,  3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \
-                                       RC[j + 2], RC[j + 2])) ); \
-       KF_ELT( 3,  4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \
-                                       RC[j + 3], RC[j + 3])) ); \
-       KF_ELT( 4,  5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \
-                                       RC[j + 4], RC[j + 4])) ); \
-       KF_ELT( 5,  6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \
-                                       RC[j + 5], RC[j + 5])) ); \
-       KF_ELT( 6,  7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \
-                                       RC[j + 6], RC[j + 6])) ); \
-       KF_ELT( 7,  8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \
-                                       RC[j + 7], RC[j + 7])) ); \
+       KF_ELT( 0,  1, _mm256_set1_epi64x( RC[j + 0] ) ); \
+       KF_ELT( 1,  2, _mm256_set1_epi64x( RC[j + 1] ) ); \
+       KF_ELT( 2,  3, _mm256_set1_epi64x( RC[j + 2] ) ); \
+       KF_ELT( 3,  4, _mm256_set1_epi64x( RC[j + 3] ) ); \
+       KF_ELT( 4,  5, _mm256_set1_epi64x( RC[j + 4] ) ); \
+       KF_ELT( 5,  6, _mm256_set1_epi64x( RC[j + 5] ) ); \
+       KF_ELT( 6,  7, _mm256_set1_epi64x( RC[j + 6] ) ); \
+       KF_ELT( 7,  8, _mm256_set1_epi64x( RC[j + 7] ) ); \
       P8_TO_P0; \
    } \
 } while (0)
@@ -453,7 +359,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    else
    {
        j = lim - kc->ptr;
-        u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
+        u.tmp[0] = m256_const1_64( eb );
        memset_zero_256( u.tmp + 1, (j>>3) - 2 );
        u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 );
    }
@@ -474,7 +380,7 @@ void keccak256_4way_init( void *kc )
 }

 void
-keccak256_4way(void *cc, const void *data, size_t len)
+keccak256_4way_update(void *cc, const void *data, size_t len)
 {
    keccak64_core(cc, data, len, 136);
 }
@@ -491,15 +397,24 @@ void keccak512_4way_init( void *kc )
 }

 void
-keccak512_4way(void *cc, const void *data, size_t len)
+keccak512_4way_update(void *cc, const void *data, size_t len)
 {
-        keccak64_core(cc, data, len, 72);
+   keccak64_core(cc, data, len, 72);
 }

 void
 keccak512_4way_close(void *cc, void *dst)
 {
-        keccak64_close(cc, dst, 64, 72);
+   keccak64_close(cc, dst, 64, 72);
 }

-#endif
+#undef INPUT_BUF
+#undef DECL64
+#undef XOR64
+#undef AND64
+#undef OR64
+#undef NOT64
+#undef ROL64
+#undef KECCAK_F_1600
+
+#endif  // AVX2
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -64,26 +64,49 @@ extern "C"{
 * <code>memcpy()</code>).
 */

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 typedef struct {
-        __m256i buf[144*8];    /* first field, for alignment */
+        __m512i buf[144*8];
+        __m512i w[25];
+        size_t ptr, lim;
+} keccak64_ctx_m512i __attribute__((aligned(128)));
+
+typedef keccak64_ctx_m512i keccak256_8way_context;
+typedef keccak64_ctx_m512i keccak512_8way_context;
+
+void keccak256_8way_init(void *cc);
+void keccak256_8way_update(void *cc, const void *data, size_t len);
+void keccak256_8way_close(void *cc, void *dst);
+
+void keccak512_8way_init(void *cc);
+void keccak512_8way_update(void *cc, const void *data, size_t len);
+void keccak512_8way_close(void *cc, void *dst);
+void keccak512_8way_addbits_and_close(
+        void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif   
+
+typedef struct {
+        __m256i buf[144*8];  
        __m256i w[25];
        size_t ptr, lim;
-//        sph_u64 wide[25];
-} keccak64_ctx_m256i;
+} keccak64_ctx_m256i __attribute__((aligned(128)));

 typedef keccak64_ctx_m256i keccak256_4way_context;
 typedef keccak64_ctx_m256i keccak512_4way_context;

 void keccak256_4way_init(void *cc);
-void keccak256_4way(void *cc, const void *data, size_t len);
+void keccak256_4way_update(void *cc, const void *data, size_t len);
 void keccak256_4way_close(void *cc, void *dst);
-
+#define keccak256_4way keccak256_4way_update

 void keccak512_4way_init(void *cc);
-void keccak512_4way(void *cc, const void *data, size_t len);
+void keccak512_4way_update(void *cc, const void *data, size_t len);
 void keccak512_4way_close(void *cc, void *dst);
 void keccak512_4way_addbits_and_close(
        void *cc, unsigned ub, unsigned n, void *dst);
+#define keccak512_4way keccak512_4way_update

 #endif

--- a/algo/keccak/keccak-macros.c
+++ b/algo/keccak/keccak-macros.c
@@ -0,0 +1,324 @@
+#ifdef TH_ELT   /* drop any earlier definition so TH_ELT can be redefined below */
+#undef TH_ELT
+#endif
+#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
+                DECL64(tt0);   /* scratch words; DECL64/XOR64/ROL64 are not defined in this file */ \
+                DECL64(tt1); \
+                DECL64(tt2); \
+                DECL64(tt3); \
+                XOR64(tt0, d0, d1);    /* assuming XOR64(d, a, b) means d = a ^ b */ \
+                XOR64(tt1, d2, d3); \
+                XOR64(tt0, tt0, d4); \
+                XOR64(tt0, tt0, tt1);  /* tt0 = d0^d1^d2^d3^d4 */ \
+                ROL64(tt0, tt0, 1);    /* assuming ROL64(d, v, n) is a left-rotate of v by n */ \
+                XOR64(tt2, c0, c1); \
+                XOR64(tt3, c2, c3); \
+                XOR64(tt0, tt0, c4); \
+                XOR64(tt2, tt2, tt3); \
+                XOR64(t, tt0, tt2);    /* t = rol(d0^..^d4, 1) ^ (c0^..^c4) */ \
+        } while (0)
+
+#ifdef THETA   /* drop any earlier definition so THETA can be redefined below */
+#undef THETA
+#endif
+#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                DECL64(t0);   /* tN is the correction word XORed into group bN0..bN4 */ \
+                DECL64(t1); \
+                DECL64(t2); \
+                DECL64(t3); \
+                DECL64(t4); \
+                TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14);  /* tN = xor(group N-1) ^ rol1(xor(group N+1)), indices mod 5 */ \
+                TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
+                TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
+                TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
+                TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
+                XOR64(b00, b00, t0);   /* all five tN are computed before any lane is modified */ \
+                XOR64(b01, b01, t0); \
+                XOR64(b02, b02, t0); \
+                XOR64(b03, b03, t0); \
+                XOR64(b04, b04, t0); \
+                XOR64(b10, b10, t1); \
+                XOR64(b11, b11, t1); \
+                XOR64(b12, b12, t1); \
+                XOR64(b13, b13, t1); \
+                XOR64(b14, b14, t1); \
+                XOR64(b20, b20, t2); \
+                XOR64(b21, b21, t2); \
+                XOR64(b22, b22, t2); \
+                XOR64(b23, b23, t2); \
+                XOR64(b24, b24, t2); \
+                XOR64(b30, b30, t3); \
+                XOR64(b31, b31, t3); \
+                XOR64(b32, b32, t3); \
+                XOR64(b33, b33, t3); \
+                XOR64(b34, b34, t3); \
+                XOR64(b40, b40, t4); \
+                XOR64(b41, b41, t4); \
+                XOR64(b42, b42, t4); \
+                XOR64(b43, b43, t4); \
+                XOR64(b44, b44, t4); \
+        } while (0)
+
+#ifdef RHO   /* drop any earlier definition so RHO can be redefined below */
+#undef RHO
+#endif
+#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                /* ROL64(b00, b00,  0); */ \
+                ROL64(b01, b01, 36);   /* each remaining lane is rotated in place by its own fixed count */ \
+                ROL64(b02, b02,  3); \
+                ROL64(b03, b03, 41); \
+                ROL64(b04, b04, 18); \
+                ROL64(b10, b10,  1); \
+                ROL64(b11, b11, 44); \
+                ROL64(b12, b12, 10); \
+                ROL64(b13, b13, 45); \
+                ROL64(b14, b14,  2); \
+                ROL64(b20, b20, 62); \
+                ROL64(b21, b21,  6); \
+                ROL64(b22, b22, 43); \
+                ROL64(b23, b23, 15); \
+                ROL64(b24, b24, 61); \
+                ROL64(b30, b30, 28); \
+                ROL64(b31, b31, 55); \
+                ROL64(b32, b32, 25); \
+                ROL64(b33, b33, 21); \
+                ROL64(b34, b34, 56); \
+                ROL64(b40, b40, 27); \
+                ROL64(b41, b41, 20); \
+                ROL64(b42, b42, 39); \
+                ROL64(b43, b43,  8); \
+                ROL64(b44, b44, 14); \
+        } while (0)
+
+/*
+ * The KHI macro integrates the "lane complement" optimization. On input,
+ * some words are complemented:
+ *    a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
+ * On output, the following words are complemented:
+ *    a04 a10 a20 a22 a23 a31
+ *
+ * The (implicit) permutation and the theta expansion will bring back
+ * the input mask for the next round.
+ */
+
+#ifdef KHI_XO   /* drop any earlier definition so KHI_XO can be redefined below */
+#undef KHI_XO
+#endif
+#define KHI_XO(d, a, b, c)   do { \
+                DECL64(kt);        /* scratch word local to this expansion */ \
+                OR64(kt, b, c);    /* assuming OR64(d, a, b) means d = a | b */ \
+                XOR64(d, a, kt);   /* d = a ^ (b | c) */ \
+        } while (0)
+
+#ifdef KHI_XA   /* drop any earlier definition so KHI_XA can be redefined below */
+#undef KHI_XA
+#endif
+#define KHI_XA(d, a, b, c)   do { \
+                DECL64(kt);        /* scratch word local to this expansion */ \
+                AND64(kt, b, c);   /* assuming AND64(d, a, b) means d = a & b */ \
+                XOR64(d, a, kt);   /* d = a ^ (b & c) */ \
+        } while (0)
+
+#ifdef KHI   /* drop any earlier definition so KHI can be redefined below */
+#undef KHI
+#endif
+#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                DECL64(c0);   /* c0..c4 stage one row's results so every input is read before it is overwritten */ \
+                DECL64(c1); \
+                DECL64(c2); \
+                DECL64(c3); \
+                DECL64(c4); \
+                DECL64(bnn);  /* complement of one lane per row (lane-complement scheme described above) */ \
+                NOT64(bnn, b20);   /* row b00 b10 b20 b30 b40 */ \
+                KHI_XO(c0, b00, b10, b20); \
+                KHI_XO(c1, b10, bnn, b30); \
+                KHI_XA(c2, b20, b30, b40); \
+                KHI_XO(c3, b30, b40, b00); \
+                KHI_XA(c4, b40, b00, b10); \
+                MOV64(b00, c0); \
+                MOV64(b10, c1); \
+                MOV64(b20, c2); \
+                MOV64(b30, c3); \
+                MOV64(b40, c4); \
+                NOT64(bnn, b41);   /* row b01 b11 b21 b31 b41 */ \
+                KHI_XO(c0, b01, b11, b21); \
+                KHI_XA(c1, b11, b21, b31); \
+                KHI_XO(c2, b21, b31, bnn); \
+                KHI_XO(c3, b31, b41, b01); \
+                KHI_XA(c4, b41, b01, b11); \
+                MOV64(b01, c0); \
+                MOV64(b11, c1); \
+                MOV64(b21, c2); \
+                MOV64(b31, c3); \
+                MOV64(b41, c4); \
+                NOT64(bnn, b32);   /* row b02 b12 b22 b32 b42 */ \
+                KHI_XO(c0, b02, b12, b22); \
+                KHI_XA(c1, b12, b22, b32); \
+                KHI_XA(c2, b22, bnn, b42); \
+                KHI_XO(c3, bnn, b42, b02); \
+                KHI_XA(c4, b42, b02, b12); \
+                MOV64(b02, c0); \
+                MOV64(b12, c1); \
+                MOV64(b22, c2); \
+                MOV64(b32, c3); \
+                MOV64(b42, c4); \
+                NOT64(bnn, b33);   /* row b03 b13 b23 b33 b43 */ \
+                KHI_XA(c0, b03, b13, b23); \
+                KHI_XO(c1, b13, b23, b33); \
+                KHI_XO(c2, b23, bnn, b43); \
+                KHI_XA(c3, bnn, b43, b03); \
+                KHI_XO(c4, b43, b03, b13); \
+                MOV64(b03, c0); \
+                MOV64(b13, c1); \
+                MOV64(b23, c2); \
+                MOV64(b33, c3); \
+                MOV64(b43, c4); \
+                NOT64(bnn, b14);   /* row b04 b14 b24 b34 b44 */ \
+                KHI_XA(c0, b04, bnn, b24); \
+                KHI_XO(c1, bnn, b24, b34); \
+                KHI_XA(c2, b24, b34, b44); \
+                KHI_XO(c3, b34, b44, b04); \
+                KHI_XA(c4, b44, b04, b14); \
+                MOV64(b04, c0); \
+                MOV64(b14, c1); \
+                MOV64(b24, c2); \
+                MOV64(b34, c3); \
+                MOV64(b44, c4); \
+        } while (0)
+
+#ifdef IOTA   /* drop any earlier definition so IOTA can be redefined below */
+#undef IOTA
+#endif
+#define IOTA(r)   XOR64_IOTA(a00, a00, r)   /* uses the variable a00 from the expansion site; XOR64_IOTA is defined elsewhere */
+
+#ifdef P0   /* Pn = lane-name list that KF_ELT passes to THETA/RHO/KHI; clear every old definition first */
+#undef P0
+#undef P1
+#undef P2
+#undef P3
+#undef P4
+#undef P5
+#undef P6
+#undef P7
+#undef P8
+#undef P9
+#undef P10
+#undef P11
+#undef P12
+#undef P13
+#undef P14
+#undef P15
+#undef P16
+#undef P17
+#undef P18
+#undef P19
+#undef P20
+#undef P21
+#undef P22
+#undef P23
+#endif
+#define P0    a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
+              a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
+#define P1    a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
+              a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
+#define P2    a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
+              a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
+#define P3    a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
+              a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
+#define P4    a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
+              a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
+#define P5    a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
+              a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
+#define P6    a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
+              a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
+#define P7    a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
+              a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
+#define P8    a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
+              a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
+#define P9    a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
+              a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
+#define P10   a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
+              a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
+#define P11   a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
+              a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
+#define P12   a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
+              a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
+#define P13   a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
+              a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
+#define P14   a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
+              a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
+#define P15   a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
+              a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
+#define P16   a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
+              a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
+#define P17   a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
+              a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
+#define P18   a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
+              a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
+#define P19   a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
+              a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
+#define P20   a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
+              a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
+#define P21   a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
+              a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
+#define P22   a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
+              a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
+#define P23   a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
+              a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
+
+#ifdef P8_TO_P0   /* drop any earlier definition so P8_TO_P0 can be redefined below */
+#undef P8_TO_P0
+#endif
+#define P8_TO_P0   do { \
+                DECL64(t);          /* temp used to close each 3-lane cycle */ \
+                MOV64(t, a01);      /* each lane named at index i of P8 is moved to the name at index i of P0 */ \
+                MOV64(a01, a11); \
+                MOV64(a11, a43); \
+                MOV64(a43, t); \
+                MOV64(t, a02); \
+                MOV64(a02, a22); \
+                MOV64(a22, a31); \
+                MOV64(a31, t); \
+                MOV64(t, a03); \
+                MOV64(a03, a33); \
+                MOV64(a33, a24); \
+                MOV64(a24, t); \
+                MOV64(t, a04); \
+                MOV64(a04, a44); \
+                MOV64(a44, a12); \
+                MOV64(a12, t); \
+                MOV64(t, a10); \
+                MOV64(a10, a32); \
+                MOV64(a32, a13); \
+                MOV64(a13, t); \
+                MOV64(t, a14); \
+                MOV64(a14, a21); \
+                MOV64(a21, a20); \
+                MOV64(a20, t); \
+                MOV64(t, a23); \
+                MOV64(a23, a42); \
+                MOV64(a42, a40); \
+                MOV64(a40, t); \
+                MOV64(t, a30); \
+                MOV64(a30, a41); \
+                MOV64(a41, a34); \
+                MOV64(a34, t); \
+        } while (0)
+
+#define KF_ELT(r, s, k)   do {   /* one round; unlike the macros above it has no #ifdef/#undef guard */ \
+                THETA LPAR P ## r RPAR;   /* LPAR/RPAR are defined elsewhere; presumably "(" and ")" so Pr expands first */ \
+                RHO LPAR P ## r RPAR;     /* THETA and RHO use lane ordering Pr */ \
+                KHI LPAR P ## s RPAR;     /* KHI uses lane ordering Ps */ \
+                IOTA(k);                  /* k is forwarded to XOR64_IOTA on lane a00 */ \
+        } while (0)
+
+
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -1,16 +1,578 @@
 #include <string.h>
 #include <immintrin.h>
 #include "luffa-hash-2way.h"
+#include <stdio.h>

 #if defined(__AVX2__)

 #include "simd-utils.h"

+/* initial values of chaining variables */
+static const uint32 IV[40] __attribute((aligned(64))) = {
+    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
+    0xdef610bb,0xee058139,0x90152df4,0x6e292011,
+    0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
+    0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
+    0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
+    0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
+    0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
+    0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
+    0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
+    0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
+};
+
+/* Round Constants */
+static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
+    0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
+    0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
+    0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
+    0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
+    0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
+    0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
+    0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
+    0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
+    0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
+    0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
+    0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
+    0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
+    0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
+    0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
+    0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
+    0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
+    0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
+    0x00000000,0x00000000,0x00000000,0x5090d577,
+    0x00000000,0x00000000,0x00000000,0xac11d7fa,
+    0x00000000,0x00000000,0x00000000,0x2d1925ab,
+    0x00000000,0x00000000,0x00000000,0x1bcb66f2,
+    0x00000000,0x00000000,0x00000000,0xb46496ac,
+    0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
+    0x00000000,0x00000000,0x00000000,0xd1925ab0,
+    0x00000000,0x00000000,0x00000000,0x78602649,
+    0x00000000,0x00000000,0x00000000,0x29131ab6,
+    0x00000000,0x00000000,0x00000000,0x8edae952,
+    0x00000000,0x00000000,0x00000000,0x0fc053c3,
+    0x00000000,0x00000000,0x00000000,0x3b6ba548,
+    0x00000000,0x00000000,0x00000000,0x3f014f0c,
+    0x00000000,0x00000000,0x00000000,0xedae9520,
+    0x00000000,0x00000000,0x00000000,0xfc053c31
+};
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define cns4w(i)  m512_const1_128( ( (__m128i*)CNS_INIT)[i] )  // i-th 128-bit group of CNS_INIT; m512_const1_128 presumably repeats it in all four lanes -- confirm
+
+#define ADD_CONSTANT4W(a,b,c0,c1) /* a ^= c0; b ^= c1. Two bare statements, not wrapped in do/while */ \
+    a = _mm512_xor_si512(a,c0);\
+    b = _mm512_xor_si512(b,c1);
+
+#define MULT24W( a0, a1, mask ) /* updates a0 and a1 in place, independently per 128-bit lane */ \
+do { \
+  __m512i b = _mm512_xor_si512( a0, \
+                   _mm512_shuffle_epi32( _mm512_and_si512(a1,mask), 16 ) ); \
+  a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );  /* b shifted right 4 bytes, filled from a1 */ \
+  a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );  /* a1 shifted right 4 bytes, filled from b */ \
+} while(0)
+
+// One step over eight state vectors x[0..7]; x and t are pointers to __m512i.
+// Author note: pointer arithmetic confirmed ok, but array indexes would be preferable.
+#define STEP_PART4W(x,c0,c1,t)\
+    SUBCRUMB4W(*x,*(x+1),*(x+2),*(x+3),*t);      /* first quad x[0..3] */ \
+    SUBCRUMB4W(*(x+5),*(x+6),*(x+7),*(x+4),*t);  /* second quad in rotated order x[5],x[6],x[7],x[4] */ \
+    MIXWORD4W(*x,*(x+4),*t,*(t+1));              /* mix x[k] with x[k+4]; t[0], t[1] are scratch */ \
+    MIXWORD4W(*(x+1),*(x+5),*t,*(t+1));\
+    MIXWORD4W(*(x+2),*(x+6),*t,*(t+1));\
+    MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
+    ADD_CONSTANT4W(*x, *(x+4), c0, c1);          /* constants go into x[0] and x[4] only */
+
+#define SUBCRUMB4W(a0,a1,a2,a3,t) /* bitwise substitution on a0..a3 in place; t is scratch. Statement order matters */ \
+    t  = _mm512_load_si512(&a0);                 /* copy of the original a0 */ \
+    a0 = _mm512_or_si512(a0,a1);\
+    a2 = _mm512_xor_si512(a2,a3);\
+    a1 = _mm512_andnot_si512(a1, m512_neg1 );    /* (~a1) & m512_neg1; presumably all-ones, i.e. a1 = ~a1 */ \
+    a0 = _mm512_xor_si512(a0,a3);\
+    a3 = _mm512_and_si512(a3,t);\
+    a1 = _mm512_xor_si512(a1,a3);\
+    a3 = _mm512_xor_si512(a3,a2);\
+    a2 = _mm512_and_si512(a2,a0);\
+    a0 = _mm512_andnot_si512(a0, m512_neg1 );    /* presumably a0 = ~a0, as above */ \
+    a2 = _mm512_xor_si512(a2,a1);\
+    a1 = _mm512_or_si512(a1,a3);\
+    t  = _mm512_xor_si512(t,a1);\
+    a3 = _mm512_xor_si512(a3,a2);\
+    a2 = _mm512_and_si512(a2,a1);\
+    a1 = _mm512_xor_si512(a1,a0);\
+    a0 = _mm512_load_si512(&t);                  /* a0 takes the value accumulated in t */
+
+#define MIXWORD4W(a,b,t1,t2) /* mixes a and b in place per 32-bit element; t1, t2 are scratch */ \
+    b  = _mm512_xor_si512(a,b);                  /* b ^= a */ \
+    t1 = _mm512_slli_epi32(a,2);\
+    t2 = _mm512_srli_epi32(a,30);\
+     a = _mm512_or_si512(t1,t2);                 /* a = rotl32(a, 2) */ \
+    a  = _mm512_xor_si512(a,b);                  /* a ^= b */ \
+    t1 = _mm512_slli_epi32(b,14);\
+    t2 = _mm512_srli_epi32(b,18);\
+    b  = _mm512_or_si512(t1,t2);                 /* b = rotl32(b, 14) */ \
+    b  = _mm512_xor_si512(a,b);                  /* b ^= a */ \
+    t1 = _mm512_slli_epi32(a,10);\
+    t2 = _mm512_srli_epi32(a,22);\
+    a  = _mm512_or_si512(t1,t2);                 /* a = rotl32(a, 10) */ \
+    a  = _mm512_xor_si512(a,b);                  /* a ^= b */ \
+    t1 = _mm512_slli_epi32(b,1);\
+    t2 = _mm512_srli_epi32(b,31);\
+    b  = _mm512_or_si512(t1,t2);                 /* b = rotl32(b, 1) */
+
+#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1) /* one step on the pair a0,a1 in place; t0,t1,tmp0,tmp1 are scratch */ \
+    a1 = _mm512_shuffle_epi32(a1,147);           /* regroup the 32-bit words of a0,a1 into t1,t0,a0,a1 */ \
+    t0 = _mm512_load_si512(&a1);\
+    a1 = _mm512_unpacklo_epi32(a1,a0);\
+    t0 = _mm512_unpackhi_epi32(t0,a0);\
+    t1 = _mm512_shuffle_epi32(t0,78);\
+    a0 = _mm512_shuffle_epi32(a1,78);\
+    SUBCRUMB4W(t1,t0,a0,a1,tmp0);                /* substitution on the regrouped words */ \
+    t0 = _mm512_unpacklo_epi32(t0,t1);           /* regroup back into a0,a1 */ \
+    a1 = _mm512_unpacklo_epi32(a1,a0);\
+    a0 = _mm512_load_si512(&a1);\
+    a0 = _mm512_unpackhi_epi64(a0,t0);\
+    a1 = _mm512_unpacklo_epi64(a1,t0);\
+    a1 = _mm512_shuffle_epi32(a1,57);\
+    MIXWORD4W(a0,a1,tmp0,tmp1);\
+    ADD_CONSTANT4W(a0,a1,c0,c1);                 /* a0 ^= c0; a1 ^= c1 */
+
+#define NMLTOM7684W(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3) /* regroups 32-bit words of r0..r2 into s0..s3, and of p0..p2 into q0..q3 */ \
+    s2 = _mm512_load_si512(&r1);                 /* NOTE(review): no use of this macro appears in the part of the file seen here */ \
+    q2 = _mm512_load_si512(&p1);\
+    r2 = _mm512_shuffle_epi32(r2,216);           /* the inputs r0..r2 and p0..p2 are overwritten as scratch */ \
+    p2 = _mm512_shuffle_epi32(p2,216);\
+    r1 = _mm512_unpacklo_epi32(r1,r0);\
+    p1 = _mm512_unpacklo_epi32(p1,p0);\
+    s2 = _mm512_unpackhi_epi32(s2,r0);\
+    q2 = _mm512_unpackhi_epi32(q2,p0);\
+    s0 = _mm512_load_si512(&r2);\
+    q0 = _mm512_load_si512(&p2);\
+    r2 = _mm512_unpacklo_epi64(r2,r1);\
+    p2 = _mm512_unpacklo_epi64(p2,p1);\
+    s1 = _mm512_load_si512(&s0);\
+    q1 = _mm512_load_si512(&q0);\
+    s0 = _mm512_unpackhi_epi64(s0,r1);\
+    q0 = _mm512_unpackhi_epi64(q0,p1);\
+    r2 = _mm512_shuffle_epi32(r2,225);\
+    p2 = _mm512_shuffle_epi32(p2,225);\
+    r0 = _mm512_load_si512(&s1);\
+    p0 = _mm512_load_si512(&q1);\
+    s0 = _mm512_shuffle_epi32(s0,225);\
+    q0 = _mm512_shuffle_epi32(q0,225);\
+    s1 = _mm512_unpacklo_epi64(s1,s2);\
+    q1 = _mm512_unpacklo_epi64(q1,q2);\
+    r0 = _mm512_unpackhi_epi64(r0,s2);\
+    p0 = _mm512_unpackhi_epi64(p0,q2);\
+    s2 = _mm512_load_si512(&r0);\
+    q2 = _mm512_load_si512(&p0);\
+    s3 = _mm512_load_si512(&r2);\
+    q3 = _mm512_load_si512(&p2);
+
+#define MIXTON7684W(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2) /* regroups 32-bit words of r0..r3 into s0..s2, and of p0..p3 into q0..q2 */ \
+    s0 = _mm512_load_si512(&r0);                 /* NOTE(review): no use of this macro appears in the part of the file seen here */ \
+    q0 = _mm512_load_si512(&p0);\
+    s1 = _mm512_load_si512(&r2);\
+    q1 = _mm512_load_si512(&p2);\
+    r0 = _mm512_unpackhi_epi32(r0,r1);           /* r0..r2 and p0..p2 are overwritten as scratch */ \
+    p0 = _mm512_unpackhi_epi32(p0,p1);\
+    r2 = _mm512_unpackhi_epi32(r2,r3);\
+    p2 = _mm512_unpackhi_epi32(p2,p3);\
+    s0 = _mm512_unpacklo_epi32(s0,r1);\
+    q0 = _mm512_unpacklo_epi32(q0,p1);\
+    s1 = _mm512_unpacklo_epi32(s1,r3);\
+    q1 = _mm512_unpacklo_epi32(q1,p3);\
+    r1 = _mm512_load_si512(&r0);\
+    p1 = _mm512_load_si512(&p0);\
+    r0 = _mm512_unpackhi_epi64(r0,r2);\
+    p0 = _mm512_unpackhi_epi64(p0,p2);\
+    s0 = _mm512_unpackhi_epi64(s0,s1);\
+    q0 = _mm512_unpackhi_epi64(q0,q1);\
+    r1 = _mm512_unpacklo_epi64(r1,r2);\
+    p1 = _mm512_unpacklo_epi64(p1,p2);\
+    s2 = _mm512_load_si512(&r0);\
+    q2 = _mm512_load_si512(&p0);\
+    s1 = _mm512_load_si512(&r1);\
+    q1 = _mm512_load_si512(&p1);
+
+#define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3) /* per 128-bit lane: s0 gathers word 3 of r3,r2,r1,r0; s1 word 2; s2 word 1; s3 word 0 (q from p alike) */ \
+    s1 = _mm512_load_si512(&r3);\
+    q1 = _mm512_load_si512(&p3);\
+    s3 = _mm512_load_si512(&r3);\
+    q3 = _mm512_load_si512(&p3);\
+    s1 = _mm512_unpackhi_epi32(s1,r2);           /* high words of r3,r2 interleaved */ \
+    q1 = _mm512_unpackhi_epi32(q1,p2);\
+    s3 = _mm512_unpacklo_epi32(s3,r2);           /* low words of r3,r2 interleaved */ \
+    q3 = _mm512_unpacklo_epi32(q3,p2);\
+    s0 = _mm512_load_si512(&s1);\
+    q0 = _mm512_load_si512(&q1);\
+    s2 = _mm512_load_si512(&s3);\
+    q2 = _mm512_load_si512(&q3);\
+    r3 = _mm512_load_si512(&r1);                 /* from here r1, r3 (and p1, p3) are overwritten as scratch */ \
+    p3 = _mm512_load_si512(&p1);\
+    r1 = _mm512_unpacklo_epi32(r1,r0);\
+    p1 = _mm512_unpacklo_epi32(p1,p0);\
+    r3 = _mm512_unpackhi_epi32(r3,r0);\
+    p3 = _mm512_unpackhi_epi32(p3,p0);\
+    s0 = _mm512_unpackhi_epi64(s0,r3);\
+    q0 = _mm512_unpackhi_epi64(q0,p3);\
+    s1 = _mm512_unpacklo_epi64(s1,r3);\
+    q1 = _mm512_unpacklo_epi64(q1,p3);\
+    s2 = _mm512_unpackhi_epi64(s2,r1);\
+    q2 = _mm512_unpackhi_epi64(q2,p1);\
+    s3 = _mm512_unpacklo_epi64(s3,r1);\
+    q3 = _mm512_unpacklo_epi64(q3,p1);
+
+#define MIXTON10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3) /* same regrouping as NMLTOM10244W, used for the way back */ \
+    NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
+
+void rnd512_4way( luffa_4way_context *state, __m512i *msg )  // one round: mixes msg[0], msg[1] into state->chainv[0..9], then permutes it
+{
+    __m512i t0, t1;
+    __m512i *chainv = state->chainv;
+    __m512i msg0, msg1;
+    __m512i tmp[2];   // scratch for the step macros
+    __m512i x[8];     // regrouped copy of chainv[0..7] used by STEP_PART4W
+    const __m512i MASK = m512_const2_64( 0, 0x00000000ffffffff );
+
+    t0 = chainv[0];   // t0 / t1 = XOR of the five even / odd chainv words
+    t1 = chainv[1];
+
+    t0 = _mm512_xor_si512( t0, chainv[2] );
+    t1 = _mm512_xor_si512( t1, chainv[3] );
+    t0 = _mm512_xor_si512( t0, chainv[4] );
+    t1 = _mm512_xor_si512( t1, chainv[5] );
+    t0 = _mm512_xor_si512( t0, chainv[6] );
+    t1 = _mm512_xor_si512( t1, chainv[7] );
+    t0 = _mm512_xor_si512( t0, chainv[8] );
+    t1 = _mm512_xor_si512( t1, chainv[9] );
+
+    MULT24W( t0, t1, MASK );
+
+    msg0 = _mm512_shuffle_epi32( msg[0], 27 );   // imm 27 reverses the four 32-bit words of each 128-bit lane
+    msg1 = _mm512_shuffle_epi32( msg[1], 27 );
+
+    chainv[0] = _mm512_xor_si512( chainv[0], t0 );   // fold t0 / t1 into every pair
+    chainv[1] = _mm512_xor_si512( chainv[1], t1 );
+    chainv[2] = _mm512_xor_si512( chainv[2], t0 );
+    chainv[3] = _mm512_xor_si512( chainv[3], t1 );
+    chainv[4] = _mm512_xor_si512( chainv[4], t0 );
+    chainv[5] = _mm512_xor_si512( chainv[5], t1 );
+    chainv[6] = _mm512_xor_si512( chainv[6], t0 );
+    chainv[7] = _mm512_xor_si512( chainv[7], t1 );
+    chainv[8] = _mm512_xor_si512( chainv[8], t0 );
+    chainv[9] = _mm512_xor_si512( chainv[9], t1 );
+
+    t0 = chainv[0];   // keep pair 0 for the wrap-around at pair 4
+    t1 = chainv[1];
+
+    MULT24W( chainv[0], chainv[1], MASK );   // forward pass: pair k = MULT24W(pair k) ^ pair k+1
+    chainv[0] = _mm512_xor_si512( chainv[0], chainv[2] );
+    chainv[1] = _mm512_xor_si512( chainv[1], chainv[3] );
+
+    MULT24W( chainv[2], chainv[3], MASK );
+    chainv[2] = _mm512_xor_si512(chainv[2], chainv[4]);
+    chainv[3] = _mm512_xor_si512(chainv[3], chainv[5]);
+
+    MULT24W( chainv[4], chainv[5], MASK );
+    chainv[4] = _mm512_xor_si512(chainv[4], chainv[6]);
+    chainv[5] = _mm512_xor_si512(chainv[5], chainv[7]);
+
+    MULT24W( chainv[6], chainv[7], MASK );
+    chainv[6] = _mm512_xor_si512(chainv[6], chainv[8]);
+    chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]);
+
+    MULT24W( chainv[8], chainv[9], MASK );
+    chainv[8] = _mm512_xor_si512( chainv[8], t0 );   // pair 4 wraps to the saved pair 0
+    chainv[9] = _mm512_xor_si512( chainv[9], t1 );
+
+    t0 = chainv[8];   // keep pair 4 for the wrap-around at pair 0
+    t1 = chainv[9];
+
+    MULT24W( chainv[8], chainv[9], MASK );   // backward pass: pair k = MULT24W(pair k) ^ pair k-1
+    chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] );
+    chainv[9] = _mm512_xor_si512( chainv[9], chainv[7] );
+
+    MULT24W( chainv[6], chainv[7], MASK );
+    chainv[6] = _mm512_xor_si512( chainv[6], chainv[4] );
+    chainv[7] = _mm512_xor_si512( chainv[7], chainv[5] );
+
+    MULT24W( chainv[4], chainv[5], MASK );
+    chainv[4] = _mm512_xor_si512( chainv[4], chainv[2] );
+    chainv[5] = _mm512_xor_si512( chainv[5], chainv[3] );
+
+    MULT24W( chainv[2], chainv[3], MASK );
+    chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] );
+    chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );
+
+    MULT24W( chainv[0], chainv[1], MASK );   // pair 0 wraps to the saved pair 4 and takes the message
+    chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
+    chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
+
+    MULT24W( msg0, msg1, MASK );   // message is stepped by MULT24W before each further pair
+    chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
+    chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
+
+    MULT24W( msg0, msg1, MASK );
+    chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
+    chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
+
+    MULT24W( msg0, msg1, MASK );
+    chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
+    chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
+
+    MULT24W( msg0, msg1, MASK );
+    chainv[8] = _mm512_xor_si512( chainv[8], msg0 );
+    chainv[9] = _mm512_xor_si512( chainv[9], msg1 );
+
+    MULT24W( msg0, msg1, MASK );   // msg0 / msg1 are not read again after this point
+
+    // left-rotate each 32-bit element by 1, 2, 3, 4 (author TODO: "replace with ror")
+    chainv[3] = _mm512_rol_epi32( chainv[3], 1 );
+    chainv[5] = _mm512_rol_epi32( chainv[5], 2 );
+    chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
+    chainv[9] = _mm512_rol_epi32( chainv[9], 4 );
+
+    NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],   // regroup chainv[0..7] into x[0..7]
+                x[0], x[1], x[2], x[3],
+                chainv[1],chainv[3],chainv[5],chainv[7],
+                x[4], x[5], x[6], x[7] );
+
+    STEP_PART4W( &x[0], cns4w( 0), cns4w( 1), &tmp[0] );   // eight steps using constants 0..15
+    STEP_PART4W( &x[0], cns4w( 2), cns4w( 3), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w( 4), cns4w( 5), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w( 6), cns4w( 7), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w( 8), cns4w( 9), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w(10), cns4w(11), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w(12), cns4w(13), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w(14), cns4w(15), &tmp[0] );
+
+    MIXTON10244W( x[0], x[1], x[2], x[3],   // regroup x[0..7] back into chainv[0..7]
+                chainv[0], chainv[2], chainv[4],chainv[6],
+                x[4], x[5], x[6], x[7],
+                chainv[1],chainv[3],chainv[5],chainv[7]);
+
+    /* Process last 256-bit block: eight steps on chainv[8], chainv[9] using constants 16..31 */
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31),
+                tmp[0], tmp[1] );
+}
+
+void finalization512_4way( luffa_4way_context *state, uint32 *b )  // two blank rounds; writes four 512-bit vectors to b
+{
+    uint32_t hash[8*4] __attribute((aligned(128)));   // 128-byte staging area for t[0], t[1]
+    __m512i* chainv = state->chainv;
+    __m512i t[2];
+    __m512i zero[2];   // all-zero message block for the blank rounds
+    zero[0] = zero[1] = m512_zero;
+    const __m512i shuff_bswap32 = m512_const_64(   // byte-reverses every 32-bit word when used with _mm512_shuffle_epi8
+                                  0x3c3d3e3f38393a3b, 0x3435363730313233,
+                                  0x2c2d2e2f28292a2b, 0x2425262720212223,
+                                  0x1c1d1e1f18191a1b, 0x1415161710111213,
+                                  0x0c0d0e0f08090a0b, 0x0405060700010203 );
+
+    /*---- blank round with m=0 ----*/
+    rnd512_4way( state, zero );
+
+    t[0] = chainv[0];   // t[0] / t[1] = XOR of the five even / odd chainv words
+    t[1] = chainv[1];
+
+    t[0] = _mm512_xor_si512( t[0], chainv[2] );
+    t[1] = _mm512_xor_si512( t[1], chainv[3] );
+    t[0] = _mm512_xor_si512( t[0], chainv[4] );
+    t[1] = _mm512_xor_si512( t[1], chainv[5] );
+    t[0] = _mm512_xor_si512( t[0], chainv[6] );
+    t[1] = _mm512_xor_si512( t[1], chainv[7] );
+    t[0] = _mm512_xor_si512( t[0], chainv[8] );
+    t[1] = _mm512_xor_si512( t[1], chainv[9] );
+
+    t[0] = _mm512_shuffle_epi32( t[0], 27 );   // imm 27 reverses the four 32-bit words of each 128-bit lane
+    t[1] = _mm512_shuffle_epi32( t[1], 27 );
+
+    _mm512_store_si512( (__m512i*)&hash[0], t[0] );
+    _mm512_store_si512( (__m512i*)&hash[16], t[1] );
+
+    casti_m512i( b, 0 ) = _mm512_shuffle_epi8(   // first half of the output; assumes casti_m512i(p,i) is ((__m512i*)p)[i]
+                                  casti_m512i( hash, 0 ), shuff_bswap32 );
+    casti_m512i( b, 1 ) = _mm512_shuffle_epi8(
+                                  casti_m512i( hash, 1 ), shuff_bswap32 );
+
+    rnd512_4way( state, zero );   // second blank round
+
+    t[0] = chainv[0];
+    t[1] = chainv[1];
+    t[0] = _mm512_xor_si512( t[0], chainv[2] );
+    t[1] = _mm512_xor_si512( t[1], chainv[3] );
+    t[0] = _mm512_xor_si512( t[0], chainv[4] );
+    t[1] = _mm512_xor_si512( t[1], chainv[5] );
+    t[0] = _mm512_xor_si512( t[0], chainv[6] );
+    t[1] = _mm512_xor_si512( t[1], chainv[7] );
+    t[0] = _mm512_xor_si512( t[0], chainv[8] );
+    t[1] = _mm512_xor_si512( t[1], chainv[9] );
+
+    t[0] = _mm512_shuffle_epi32( t[0], 27 );
+    t[1] = _mm512_shuffle_epi32( t[1], 27 );
+
+    _mm512_store_si512( (__m512i*)&hash[0], t[0] );
+    _mm512_store_si512( (__m512i*)&hash[16], t[1] );
+
+    casti_m512i( b, 2 ) = _mm512_shuffle_epi8(   // second half of the output
+                                  casti_m512i( hash, 0 ), shuff_bswap32 );
+    casti_m512i( b, 3 ) = _mm512_shuffle_epi8(
+                                  casti_m512i( hash, 1 ), shuff_bswap32 );
+}
+
+int luffa_4way_init( luffa_4way_context *state, int hashbitlen )  // resets state for a new hash; always returns 0
+{
+    state->hashbitlen = hashbitlen;
+    const __m128i *iv = (const __m128i*)IV;   // IV is read-only; keep the const qualifier
+
+    state->chainv[0] = m512_const1_128( iv[0] );   // each 128-bit IV group is handed to m512_const1_128
+    state->chainv[1] = m512_const1_128( iv[1] );
+    state->chainv[2] = m512_const1_128( iv[2] );
+    state->chainv[3] = m512_const1_128( iv[3] );
+    state->chainv[4] = m512_const1_128( iv[4] );
+    state->chainv[5] = m512_const1_128( iv[5] );
+    state->chainv[6] = m512_const1_128( iv[6] );
+    state->chainv[7] = m512_const1_128( iv[7] );
+    state->chainv[8] = m512_const1_128( iv[8] );
+    state->chainv[9] = m512_const1_128( iv[9] );
+
+    ((__m512i*)state->buffer)[0] = m512_zero;
+    ((__m512i*)state->buffer)[1] = m512_zero;
+    state->rembytes = 0;   // luffa_4way_close reads this; without it init followed by close reads garbage
+    return 0;
+}
+
+// Do not call luffa_4way_update_close after having called luffa_4way_update.
+// After luffa_4way_update only call luffa_4way_update or luffa_4way_close.
+int luffa_4way_update( luffa_4way_context *state, const void *data,
+                       size_t len )   // len counts bytes per lane: each 32 of them consume two __m512i of data
+{
+    const __m512i *vdata  = (const __m512i*)data;   // data is only read; keep it const
+    __m512i *buffer = (__m512i*)state->buffer;
+    __m512i msg[2];
+    int i;
+    const int blocks = (int)( len >> 5 );   // shift before narrowing, as luffa_4way_update_close does
+    const __m512i shuff_bswap32 = m512_const_64(
+                                   0x3c3d3e3f38393a3b, 0x3435363730313233,
+                                   0x2c2d2e2f28292a2b, 0x2425262720212223,
+                                   0x1c1d1e1f18191a1b, 0x1415161710111213,
+                                   0x0c0d0e0f08090a0b, 0x0405060700010203 );
+
+    state->rembytes = (int)( len & 0x1F );
+
+    // full blocks
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       rnd512_4way( state, msg );
+    }
+
+    // 16 byte partial block exists for 80 byte len
+    // store in buffer for transform in final for midstate to work
+    if ( state->rembytes  )
+    {
+      // remaining data bytes; one whole vector is read whatever rembytes is
+      buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
+      buffer[1] = m512_const2_64( 0, 0x0000000080000000 );   // same constant the pad blocks use
+    }
+    return 0;
+}
+
+int luffa_4way_close( luffa_4way_context *state, void *hashval )  // runs the pad block, then writes the digest to hashval; returns 0
+{
+    __m512i *buffer = (__m512i*)state->buffer;
+    __m512i msg[2];   // only used when there is no buffered partial block
+
+    // transform pad block
+    if ( state->rembytes )
+      // not empty, data is in buffer
+      rnd512_4way( state, buffer );
+    else
+    {     // empty pad block, constant data
+      msg[0] = m512_const2_64( 0, 0x0000000080000000 );
+      msg[1] = m512_zero;
+      rnd512_4way( state, msg );
+    }
+    finalization512_4way( state, (uint32*)hashval );   // writes four __m512i starting at hashval
+
+    if ( state->hashbitlen > 512 )   // NOTE(review): +32 here vs +64 in luffa_4way_update_close; void* arithmetic is a GNU extension -- confirm offset
+        finalization512_4way( state, (uint32*)( hashval+32 ) );
+    return 0;
+}
+
+int luffa_4way_update_close( luffa_4way_context *state,
+                 void *output, const void *data, size_t inlen )   // one-shot: absorb data, pad, write digest to output; returns 0
+{
+// Optimized for lengths that are a multiple of 16 bytes, good for 64 and 80 byte len
+    const __m512i *vdata  = (__m512i*)data;
+    __m512i msg[2];
+    int i;
+    const int blocks = (int)( inlen >> 5 );   // 32 bytes of inlen correspond to two __m512i of data
+    const __m512i shuff_bswap32 = m512_const_64(
+                                   0x3c3d3e3f38393a3b, 0x3435363730313233,
+                                   0x2c2d2e2f28292a2b, 0x2425262720212223,
+                                   0x1c1d1e1f18191a1b, 0x1415161710111213,
+                                   0x0c0d0e0f08090a0b, 0x0405060700010203 );
+
+    state->rembytes = inlen & 0x1F;
+
+    // full blocks
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       rnd512_4way( state, msg );
+    }
+
+    // 16 byte partial block exists for 80 byte len
+    if ( state->rembytes  )
+    {
+       // padding of partial block; one whole vector is read whatever rembytes is
+       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = m512_const2_64( 0, 0x0000000080000000 );
+       rnd512_4way( state, msg );
+    }
+    else
+    {
+       // empty pad block
+       msg[0] = m512_const2_64( 0, 0x0000000080000000 );
+       msg[1] = m512_zero;
+       rnd512_4way( state, msg );
+    }
+
+    finalization512_4way( state, (uint32*)output );   // writes four __m512i starting at output
+
+    if ( state->hashbitlen > 512 )   // NOTE(review): +64 here vs +32 in luffa_4way_close; void* arithmetic is a GNU extension -- confirm offset
+        finalization512_4way( state, (uint32*)( output+64 ) );
+
+    return 0;
+}
+
+#endif // AVX512
+
 #define cns(i)  m256_const1_128( ( (__m128i*)CNS_INIT)[i] )

 #define ADD_CONSTANT(a,b,c0,c1)\
    a = _mm256_xor_si256(a,c0);\
-    b = _mm256_xor_si256(b,c1);\
+    b = _mm256_xor_si256(b,c1);

 #define MULT2( a0, a1, mask ) \
 do { \
@@ -115,7 +677,7 @@ do { \
    s2 = _mm256_load_si256(&r0);\
    q2 = _mm256_load_si256(&p0);\
    s3 = _mm256_load_si256(&r2);\
-    q3 = _mm256_load_si256(&p2);\
+    q3 = _mm256_load_si256(&p2);

 #define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
    s0 = _mm256_load_si256(&r0);\
@@ -174,57 +736,6 @@ do { \
 #define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);

-/* initial values of chaining variables */
-static const uint32 IV[40] __attribute((aligned(32))) = {
-    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
-    0xdef610bb,0xee058139,0x90152df4,0x6e292011,
-    0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
-    0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
-    0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
-    0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
-    0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
-    0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
-    0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
-    0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
-};
-
-/* Round Constants */
-static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
-    0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
-    0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
-    0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
-    0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
-    0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
-    0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
-    0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
-    0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
-    0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
-    0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
-    0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
-    0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
-    0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
-    0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
-    0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
-    0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
-    0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
-    0x00000000,0x00000000,0x00000000,0x5090d577,
-    0x00000000,0x00000000,0x00000000,0xac11d7fa,
-    0x00000000,0x00000000,0x00000000,0x2d1925ab,
-    0x00000000,0x00000000,0x00000000,0x1bcb66f2,
-    0x00000000,0x00000000,0x00000000,0xb46496ac,
-    0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
-    0x00000000,0x00000000,0x00000000,0xd1925ab0,
-    0x00000000,0x00000000,0x00000000,0x78602649,
-    0x00000000,0x00000000,0x00000000,0x29131ab6,
-    0x00000000,0x00000000,0x00000000,0x8edae952,
-    0x00000000,0x00000000,0x00000000,0x0fc053c3,
-    0x00000000,0x00000000,0x00000000,0x3b6ba548,
-    0x00000000,0x00000000,0x00000000,0x3f014f0c,
-    0x00000000,0x00000000,0x00000000,0xedae9520,
-    0x00000000,0x00000000,0x00000000,0xfc053c31
-};
-
-

 /***************************************************/
 /* Round function         */
@@ -331,14 +842,10 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )

    MULT2( msg0, msg1, MASK );

-    chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3],  1 ),
-                                 _mm256_srli_epi32( chainv[3], 31 ) );
-    chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5],  2 ),
-                                 _mm256_srli_epi32( chainv[5], 30 ) );
-    chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7],  3 ),
-                                 _mm256_srli_epi32( chainv[7], 29 ) );
-    chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9],  4 ),
-                                 _mm256_srli_epi32( chainv[9], 28 ) );
+    chainv[3] = mm256_rol_32( chainv[3], 1 );
+    chainv[5] = mm256_rol_32( chainv[5], 2 );
+    chainv[7] = mm256_rol_32( chainv[7], 3 );
+    chainv[9] = mm256_rol_32( chainv[9], 4 );

    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
                x[0], x[1], x[2], x[3],
@@ -385,13 +892,15 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )

 void finalization512_2way( luffa_2way_context *state, uint32 *b )
 {
-    uint32 hash[8] __attribute((aligned(64)));
+    uint32 hash[8*2] __attribute((aligned(64)));
    __m256i* chainv = state->chainv;
    __m256i t[2];
    __m256i zero[2];
    zero[0] = zero[1] = m256_zero;
-    const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
-                                                  0x0405060700010203 );
+    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
+                                                 0x1415161710111213,
+                                                 0x0c0d0e0f08090a0b,
+                                                 0x0405060700010203 );
    /*---- blank round with m=0 ----*/
    rnd512_2way( state, zero );

@@ -475,8 +984,10 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
    __m256i msg[2];
    int i;
    int blocks = (int)len >> 5;
-    const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
-                                                  0x0405060700010203 );
+    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
+                                                 0x1415161710111213,
+                                                 0x0c0d0e0f08090a0b,
+                                                 0x0405060700010203 );
    state-> rembytes = (int)len & 0x1F;

    // full blocks
@@ -528,8 +1039,10 @@ int luffa_2way_update_close( luffa_2way_context *state,
    __m256i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
-    const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
-                                                  0x0405060700010203 );
+    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
+                                                 0x1415161710111213,
+                                                 0x0c0d0e0f08090a0b,
+                                                 0x0405060700010203 );

    state->rembytes = inlen & 0x1F;

@@ -558,6 +1071,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
    }

    finalization512_2way( state, (uint32*)output );
+
    if ( state->hashbitlen > 512 )
        finalization512_2way( state, (uint32*)( output+32 ) );

--- a/algo/luffa/luffa-hash-2way.h
+++ b/algo/luffa/luffa-hash-2way.h
@@ -51,12 +51,30 @@
 #define LIMIT_512 128
 /*********************************/

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 typedef struct {
-    uint32 buffer[8*2] __attribute((aligned(64)));
-    __m256i chainv[10] __attribute((aligned(32)));   /* Chaining values */
+    uint32 buffer[8*4];
+    __m512i chainv[10];   /* Chaining values */
    int hashbitlen;
    int rembytes;
-} luffa_2way_context;
+} luffa_4way_context __attribute((aligned(128)));
+
+int luffa_4way_init( luffa_4way_context *state, int hashbitlen );
+int luffa_4way_update( luffa_4way_context *state, const void *data,
+                       size_t len );
+int luffa_4way_close( luffa_4way_context *state, void *hashval );
+int luffa_4way_update_close( luffa_4way_context *state, void *output,
+                                   const void *data, size_t inlen );
+
+#endif
+
+typedef struct {
+    uint32 buffer[8*2];
+    __m256i chainv[10];   /* Chaining values */
+    int hashbitlen;       /* requested digest size in bits; > 512 adds a second finalization */
+    int rembytes;         /* len & 0x1F: bytes left over after the whole 32 byte blocks */
+} luffa_2way_context __attribute((aligned(128)));

 int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
 int luffa_2way_update( luffa_2way_context *state, const void *data,
--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -542,8 +542,10 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    __m256i* chainv = (__m256i*)state->chainv;
    __m256i  t;
    const __m128i zero = m128_zero;
-    const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
-                                                  0x0405060700010203 );
+    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
+                                                 0x1415161710111213,
+                                                 0x0c0d0e0f08090a0b,
+                                                 0x0405060700010203 );

    rnd512( state, zero, zero );

--- a/algo/lyra2/lyra2-hash-2way.c
+++ b/algo/lyra2/lyra2-hash-2way.c
@@ -0,0 +1,715 @@
+/**
+ * Implementation of the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <mm_malloc.h>
+#include "compat.h"
+#include "lyra2.h"
+#include "sponge.h"
+
+/**
+ * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
+ * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
+ * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
+ * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
+ * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
+ * @param wholeMatrix Caller-supplied memory matrix, nRows rows of nCols blocks (not a parameter of LYRA2RE)
+ * @param K The derived key to be output by the algorithm
+ * @param kLen Desired key length
+ * @param pwd User password
+ * @param pwdlen Password length
+ * @param salt Salt
+ * @param saltlen Salt length
+ * @param timeCost Parameter to determine the processing time (T)
+ * @param nRows Number of rows of the memory matrix (R)
+ * @param nCols Number of columns of the memory matrix (C)
+ *
+ * @return 0 if the key is generated correctly; -1 only from variants that allocate the matrix themselves (LYRA2RE) when that allocation fails
+ */
+
+int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
+               const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+               const uint64_t timeCost, const uint64_t nRows,
+               const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[16];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+//   int64_t i; //auxiliary iteration counter
+   int64_t v64; // 64bit var for memcpy
+   //====================================================================/
+
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //The memory matrix is supplied by the caller (wholeMatrix); nothing is allocated here
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+//   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+   uint64_t *ptrWord = wholeMatrix;
+
+//   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
+   //=== Getting the password + salt + basil padded with 10*1 ==========//
+   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+   //but this ensures that the password copied locally will be overwritten as soon as possible
+
+   //Number of input blocks needed for the password, salt, basil and padding
+   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   byte *ptrByte = (byte*) wholeMatrix;
+
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;
+
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;
+   //Zeroes the remainder of the input blocks (everything after pwd || salt)
+   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+                       - (saltlen + pwdlen) );
+
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = pwdlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = saltlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = timeCost;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nRows;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nCols;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the basil
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+// from here on it's all simd access to state and matrix
+// define vector pointers and adjust sizes and pointer offsets
+
+   //================= Initializing the Sponge State ====================//
+   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+
+//   initState( state );
+
+   //========================= Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+   
+   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+/*
+   for (i = 0; i < nBlocksInput; i++)
+   {
+       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+   }
+*/
+
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+
+   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
+                      nCols);
+
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+
+      //updates the value of row* (deterministically picked during Setup)
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+       //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+       step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+       do
+       {
+           //Selects a pseudorandom index row*
+           //-----------------------------------------------
+           rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+
+           //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //-------------------------------------------
+
+           //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+           reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+           //update prev: it now points to the last row ever computed
+           prev = row;
+
+           //updates row: goes to the next row to be computed
+           //----------------------------------------------------
+           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //----------------------------------------------------
+
+       } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs a block from the row* picked last (rowa)
+   absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+   //Squeezes the key
+   squeeze(state, K, (unsigned int) kLen);
+
+   return 0;
+}
+
+/////////////////////////////////////////////////
+
+// LYRA2REV3_2WAY: unfinished 2 lane variant of Lyra2REv3, data interleaved
+// 2x256. Takes the same arguments as LYRA2REV2 (wholeMatrix is caller
+// supplied, K and kLen are handed to squeeze()) and always returns 0.
+// TODO (original author): drop salt, salt len arguments, hard code some others.
+int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
+      const void *pwd, const uint64_t pwdlen, const void *salt,
+      const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
+      const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[16];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+//   int64_t i; //auxiliary iteration counter
+   int64_t v64; // 64bit var for memcpy
+   uint64_t instance0 = 0; // Separate instance for each lane
+   uint64_t instance1 = 0;
+   //====================================================================/
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
+
+   uint64_t *ptrWord = wholeMatrix;
+
+//  2 way 256 rewrite. Salt always == password, and data is interleaved,
+//  need to build in parallel:
+//  {   password,    (64 or 80 bytes)
+//      salt,        (64 or 80 bytes) =  same as password
+//      Klen,        (u64)  = 32 bytes
+//      pwdlen,      (u64)
+//      saltlen,     (u64)
+//      timecost,    (u64)
+//      nrows,       (u64)
+//      ncols,       (u64)
+//      0x80,        (byte)
+//      { 0 .. 0 },
+//      1            (byte)
+//   }
+   
+//   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
+   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   byte *ptrByte = (byte*) wholeMatrix;
+
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;
+
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;
+   //Zeroes the remainder of the input blocks (everything after pwd || salt)
+   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+                       - (saltlen + pwdlen) );
+
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = pwdlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = saltlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = timeCost;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nRows;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nCols;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the basil
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+// from here on it's all simd access to state and matrix
+// define vector pointers and adjust sizes and pointer offsets
+
+   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+   reducedSqueezeRow0( state, &wholeMatrix[0], nCols );
+
+   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
+                      nCols);
+
+   do
+   {
+
+      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+
+      rowa = (rowa + step) & (window - 1);
+
+      prev = row;
+      row++;
+
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   row = 0;
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+      step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
+      do
+      {
+        // This part is not parallel, rowa will be different for each lane.
+        // state (u64[16]) is interleaved 2x256, need to extract separately.
+        // NOTE(review): the expressions below do not evaluate to the formula as written; confirm the lane mapping.
+        // index = 2 * instance / 4 * 4 + instance % 4
+        uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 )
+                           + ( instance0 & 0x3 );
+        uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 )
+                           + ( instance1 & 0x3 );
+
+        instance0 = state[ index0 ] & 0xf;
+        instance1 = (state+4)[ index1 ] & 0xf;
+        // NOTE(review): (state+4)[ instance1 ] can reach state[19] while state has 16 words.
+        const uint64_t rowa0 = state[ instance0 ] & (unsigned int)(nRows-1);
+        const uint64_t rowa1 = (state+4)[ instance1 ] & (unsigned int)(nRows-1);
+
+        reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                                      &wholeMatrix[rowa0*ROW_LEN_INT64],
+                                      &wholeMatrix[rowa1*ROW_LEN_INT64],
+                                      &wholeMatrix[row*ROW_LEN_INT64], nCols );
+/*
+           instance = state[instance & 0xF];
+           rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
+
+           reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+*/
+        // End of divergence.
+
+        prev = row;
+        row = (row + step) & (unsigned int)(nRows-1); 
+
+       } while ( row != 0 );
+   }
+   // NOTE(review): rowa is still the value left by the setup loop, not rowa0/rowa1.
+   absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] );
+   squeeze( state, K, (unsigned int) kLen );
+
+   return 0;
+}
+
+
+
+// LYRA2Z: Lyra2 over a caller-supplied matrix; rows are picked with % nRows. Always returns 0.
+int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
+            const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+            const uint64_t timeCost, const uint64_t nRows,
+            const uint64_t nCols )
+{
+    //========================== Basic variables ============================//
+    uint64_t _ALIGN(256) state[16];
+    int64_t row = 2; //index of row to be processed
+    int64_t prev = 1; //index of prev (last row ever computed/modified)
+    int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+    int64_t tau; //Time Loop iterator
+    int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+    int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+//    int64_t i; //auxiliary iteration counter
+    //=======================================================================/
+
+    //======= Initializing the Memory Matrix and pointers to it =============//
+    //The memory matrix is supplied by the caller (wholeMatrix); nothing is allocated here
+
+    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+//    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+//    memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
+    //==== Getting the password + salt + basil padded with 10*1 ============//
+    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+    //but this ensures that the password copied locally will be overwritten as soon as possible
+
+    //First, we clean enough blocks for the password, salt, basil and padding
+    uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
+                       sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+    byte *ptrByte = (byte*) wholeMatrix;
+    memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
+
+    //Prepends the password
+    memcpy(ptrByte, pwd, pwdlen);
+    ptrByte += pwdlen;
+
+    //Concatenates the salt
+    memcpy(ptrByte, salt, saltlen);
+    ptrByte += saltlen;
+    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+    memcpy(ptrByte, &kLen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &saltlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &timeCost, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nRows, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nCols, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+
+    //Now comes the padding
+    *ptrByte = 0x80; //first byte of padding: right after the basil
+    ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+    ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+    *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+    //=================== Initializing the Sponge State ====================//
+    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+//        uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
+//        if (state == NULL) {
+//                return -1;
+//        }
+//    initState( state );
+
+    //============================== Setup Phase =============================//
+    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+    uint64_t *ptrWord = wholeMatrix;
+
+    absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
+                           BLOCK_LEN_BLAKE2_SAFE_INT64 );
+/*
+    for ( i = 0; i < nBlocksInput; i++ )
+    {
+      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+      ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
+    }
+*/
+    //Initializes M[0] and M[1]
+        reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
+        reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
+
+        do {
+                //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+                reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
+
+                //updates the value of row* (deterministically picked during Setup)
+                rowa = (rowa + step) & (window - 1);
+                //update prev: it now points to the last row ever computed
+                prev = row;
+                //updates row: goes to the next row to be computed
+                row++;
+
+                //Checks if all rows in the window were visited.
+                if (rowa == 0) {
+                        step = window + gap; //changes the step: approximately doubles its value
+                        window *= 2; //doubles the size of the re-visitation window
+                        gap = -gap; //inverts the modifier to the step
+                }
+
+        } while (row < nRows);
+
+    //======================== Wandering Phase =============================//
+    row = 0; //Resets the visitation to the first row of the memory matrix
+    for ( tau = 1; tau <= timeCost; tau++ )
+    {
+        //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+        step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+        do {
+        //Selects a pseudorandom index row*
+        //----------------------------------------------------------------------
+        //rowa = ((unsigned int)state[0]) & (nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+        rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+        //-----------------------------------------------------------------
+
+        //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+                reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
+
+        //update prev: it now points to the last row ever computed
+        prev = row;
+
+        //updates row: goes to the next row to be computed
+        // NOTE(review): with step == -1 and row == 0 the sum wraps as unsigned before %; yields nRows-1 only for power-of-2 nRows
+        //row = (row + step) & (nRows-1);       //(USE THIS IF nRows IS A POWER OF 2)
+        row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+        //--------------------------------------------------------------------
+
+      } while (row != 0);
+    }
+
+    //========================= Wrap-up Phase ===============================//
+    //Absorbs a block from the row* picked last (rowa)
+    absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+
+    //Squeezes the key
+    squeeze( state, K, kLen );
+
+    return 0;
+}
+
+// LYRA2RE: Lyra2 variant that allocates, zeroes and frees its own memory matrix on every call (Lyra2RE doesn't like the new wholeMatrix implementation). The key is squeezed into K; returns 0 on success, -1 if the matrix cannot be allocated.
+int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
+             const void *salt, const uint64_t saltlen, const uint64_t timeCost,
+             const uint64_t nRows, const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[16]; // sponge state; not initialized in this function (initState is commented out below) -- presumably absorbBlockBlake2Safe sets it up, TODO confirm
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t i; //auxiliary variable; here it only holds the matrix size in bytes
+   int64_t v64; // 64bit var for memcpy
+   //====================================================================//
+
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //Tries to allocate enough space for the whole memory matrix
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; // row length in 64-bit words
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; // row length in bytes
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+   // NOTE(review): the two BLOCK_LEN alternatives are named in different units (INT64 vs BYTES); presumably kept on purpose for v1 compatibility -- confirm before "fixing"
+   i = (int64_t)ROW_LEN_BYTES * nRows; // total matrix size in bytes
+   uint64_t *wholeMatrix = _mm_malloc( i, 64 ); // 64-byte aligned allocation
+   if (wholeMatrix == NULL)
+      return -1;
+
+#if defined(__AVX2__)
+   memset_zero_256( (__m256i*)wholeMatrix, i>>5 ); // i>>5: size expressed in 32-byte units
+#elif defined(__SSE2__)
+   memset_zero_128( (__m128i*)wholeMatrix, i>>4 ); // i>>4: size expressed in 16-byte units
+#else
+   memset( wholeMatrix, 0, i );
+#endif
+
+   uint64_t *ptrWord = wholeMatrix;
+
+   //=== Getting the password + salt + basil padded with 10*1 ==========//
+   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+   //but this ensures that the password copied locally will be overwritten as soon as possible
+
+   //Number of BLAKE2-safe blocks needed for password, salt, basil (6 integers) and padding; no separate cleaning is done because the whole matrix was zeroed above
+   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   byte *ptrByte = (byte*) wholeMatrix;
+
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;
+
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;
+
+//   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+//                       - (saltlen + pwdlen) );
+
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = pwdlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = saltlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = timeCost;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nRows;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nCols;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the basil
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+   //================= Initializing the Sponge State ====================//
+   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+
+//   initState( state );
+
+   //========================= Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+
+   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN ); // one call absorbs all nBlocksInput blocks; it replaces the per-block loop kept below for reference
+/*
+   for (i = 0; i < nBlocksInput; i++)
+   {
+       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+   }
+*/
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+
+   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
+                      nCols);
+
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+
+      //updates the value of row* (deterministically picked during Setup))
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows); // NOTE(review): row starts at 2 and the body runs before this test, so nRows is assumed to be at least 3
+
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+       //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+       step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+       do
+       {
+           //Selects a pseudorandom index row*
+           //-----------------------------------------------
+           rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+
+           //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //-------------------------------------------
+
+           //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+           reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+           //update prev: it now points to the last row ever computed
+           prev = row;
+
+           //updates row: goes to the next row to be computed
+           //----------------------------------------------------
+           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //----------------------------------------------------
+
+       } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+   //Squeezes the key
+   squeeze(state, K, (unsigned int) kLen);
+
+   //================== Freeing the memory =============================//
+   _mm_free(wholeMatrix);
+
+   return 0;
+}
+
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -5,7 +5,6 @@
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 

-
 #if defined (LYRA2REV3_8WAY)

 typedef struct {
@@ -14,7 +13,7 @@ typedef struct {
   bmw256_8way_context       bmw;
 } lyra2v3_8way_ctx_holder;

-static lyra2v3_8way_ctx_holder l2v3_8way_ctx;
+static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;

 bool init_lyra2rev3_8way_ctx()
 {
@@ -38,7 +37,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
   lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );

-   blake256_8way( &ctx.blake, input, 80 );
+   blake256_8way( &ctx.blake, input + (64*8), 16 );
   blake256_8way_close( &ctx.blake, vhash );

   dintrlv_8x32( hash0, hash1, hash2, hash3,
@@ -91,7 +90,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
 {
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<3]);
+   uint32_t *hash7 = &hash[7<<3];
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
@@ -99,12 +98,15 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   const int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;

-   if ( opt_benchmark )
-      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   if ( opt_benchmark )  ( (uint32_t*)ptarget )[7] = 0x0000ff;

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
+
+   blake256_8way_init( &l2v3_8way_ctx.blake );
+   blake256_8way( &l2v3_8way_ctx.blake, vdata, 64 );
+
   do
   {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
@@ -119,8 +121,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
         extr_lane_8x32( lane_hash, hash, lane, 256 );
         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
         {
-              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+             pdata[19] = n + lane;
+             submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
      n += 8;
@@ -133,14 +135,14 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,

 #if defined (LYRA2REV3_4WAY)  

-
 typedef struct {
   blake256_4way_context     blake;
   cubehashParam             cube;
   bmw256_4way_context       bmw;
 } lyra2v3_4way_ctx_holder;

-static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
+//static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
+static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;

 bool init_lyra2rev3_4way_ctx()
 {
@@ -160,7 +162,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
   lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
   memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );

-   blake256_4way( &ctx.blake, input, 80 );
+//   blake256_4way( &ctx.blake, input, 80 );
+   blake256_4way( &ctx.blake, input + (64*4), 16 );
   blake256_4way_close( &ctx.blake, vhash );
   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

@@ -206,6 +209,10 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

   mm128_bswap32_intrlv80_4x32( vdata, pdata );
+
+   blake256_4way_init( &l2v3_4way_ctx.blake );
+   blake256_4way( &l2v3_4way_ctx.blake, vdata, 64 );
+
   do
   {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -0,0 +1,319 @@
+/**
+ * A simple implementation of Blake2b's internal permutation
+ * in the form of a sponge.
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "algo-gate.h"
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+#include <immintrin.h>
+#include "sponge.h"
+#include "lyra2.h"
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// Copies sponge output from State to Out: len is counted in 32-byte units, one reduced round is applied after each full block, then the remaining units are copied.
+inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
+{
+    const int len_m256i = len / 32;
+    const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
+    __m512i* state = (__m512i*)State;
+    __m512i* out   = (__m512i*)Out;
+    // NOTE(review): memcpy_512 is given BLOCK_LEN_M256I*2 below, while absorbBlock_2way treats a block as three 512-bit vectors and the state as four -- confirm the count unit so this cannot read past the state.
+    //Squeezes full blocks
+    for ( int i = 0; i < fullBlocks; i++ )
+    {
+       memcpy_512( out, state, BLOCK_LEN_M256I*2 );
+       LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] ); // was LYRA_ROUND_2WAY_AVX2, which sponge.h does not define
+       out += BLOCK_LEN_M256I*2;
+    }
+    //Squeezes remaining bytes
+    memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
+}
+
+inline void absorbBlock_2way( uint64_t *State, const uint64_t *In ) 
+{
+    // Absorbs one block (three 512-bit vectors read from In) into the sponge:
+    // the block is XORed into state vectors 0..2, 12 rounds are applied, and
+    // all four state vectors are written back to State.
+    register __m512i s0, s1, s2, s3;
+    __m512i *sponge = (__m512i*)State;
+    __m512i *block  = (__m512i*)In;
+
+    // Vector 3 is loaded unmodified: the block only covers vectors 0..2.
+    s0 = _mm512_xor_si512( _mm512_load_si512( sponge     ), block[0] );
+    s1 = _mm512_xor_si512( _mm512_load_si512( sponge + 1 ), block[1] );
+    s2 = _mm512_xor_si512( _mm512_load_si512( sponge + 2 ), block[2] );
+    s3 = _mm512_load_si512( sponge + 3 );
+
+    LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 );
+
+    _mm512_store_si512( sponge,     s0 );
+    _mm512_store_si512( sponge + 1, s1 );
+    _mm512_store_si512( sponge + 2, s2 );
+    _mm512_store_si512( sponge + 3, s3 );
+}
+
+inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
+                      const uint64_t nBlocks, const uint64_t block_len )
+{
+  // Builds a fresh sponge state (vectors 0 and 1 zero, vectors 2 and 3 set to
+  // fixed constants), absorbs nBlocks two-vector blocks from In with 12
+  // rounds after each one, and stores the result to State (State is not read).
+  register __m512i s0 = m512_zero;
+  register __m512i s1 = m512_zero;
+  register __m512i s2 = m512_const4_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
+                                        0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  register __m512i s3 = m512_const4_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
+                                        0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+
+  for ( int blk = 0; blk < nBlocks; blk++ )
+  {
+    // Only the first two state vectors receive input in this absorb.
+    s0 = _mm512_xor_si512( s0, ( (__m512i*)In )[0] );
+    s1 = _mm512_xor_si512( s1, ( (__m512i*)In )[1] );
+    LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 );
+    // Advance by block_len 64-bit words, doubled for the two lanes.
+    In += block_len * 2;
+  }
+
+  _mm512_store_si512( (__m512i*)State,     s0 );
+  _mm512_store_si512( (__m512i*)State + 1, s1 );
+  _mm512_store_si512( (__m512i*)State + 2, s2 );
+  _mm512_store_si512( (__m512i*)State + 3, s3 );
+}
+
+inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
+                                     uint64_t nCols )
+{
+    // Fills a row from the sponge alone, last column first:
+    // M[row][C-1-col] = H.reduced_squeeze(). One reduced round is applied
+    // after every column, and the final state is written back to State.
+    __m512i *sponge = (__m512i*)State;
+    __m512i *col    = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
+    register __m512i s0, s1, s2, s3;
+
+    s0 = _mm512_load_si512( sponge     );
+    s1 = _mm512_load_si512( sponge + 1 );
+    s2 = _mm512_load_si512( sponge + 2 );
+    s3 = _mm512_load_si512( sponge + 3 );
+
+    // Prefetch the addresses just below the starting write position.
+    for ( int back = 0; back < 9; back += 3 )
+    {
+        _mm_prefetch( col - back,     _MM_HINT_T0 );
+        _mm_prefetch( col - back - 2, _MM_HINT_T0 );
+    }
+
+    for ( int c = 0; c < nCols; c++ )
+    {
+       // Keep prefetching a fixed distance below the write position.
+       _mm_prefetch( col -  9, _MM_HINT_T0 );
+       _mm_prefetch( col - 11, _MM_HINT_T0 );
+
+       // Only three of the four state vectors are written out per column.
+       col[0] = s0;
+       col[1] = s1;
+       col[2] = s2;
+       // Step to the previous column, then apply one reduced round.
+       col -= BLOCK_LEN_M256I * 2;
+       LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 );
+    }
+
+    _mm512_store_si512( sponge,     s0 );
+    _mm512_store_si512( sponge + 1, s1 );
+    _mm512_store_si512( sponge + 2, s2 );
+    _mm512_store_si512( sponge + 3, s3 );
+}
+
+// Duplexes rowIn into rowOut with one reduced round per column: absorbs
+// rowIn[col] into the state, then writes rowOut[C-1-col] = rowIn[col] XOR state.
+// It takes a single rowIn pointer, so unlike reducedDuplexRow_2way it gathers nothing.
+inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
+                 uint64_t *rowOut, uint64_t nCols )
+{
+    int i;
+    register __m512i state0, state1, state2, state3;
+    __m512i *in  = (__m512i*)rowIn;
+    __m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
+
+    state0 = _mm512_load_si512( (__m512i*)State     );
+    state1 = _mm512_load_si512( (__m512i*)State + 1 );
+    state2 = _mm512_load_si512( (__m512i*)State + 2 );
+    state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+    for ( i = 0; i < nCols; i++ )
+    {
+         state0 = _mm512_xor_si512( state0, in[0] );
+         state1 = _mm512_xor_si512( state1, in[1] );
+         state2 = _mm512_xor_si512( state2, in[2] );
+
+         LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+         out[0] = _mm512_xor_si512( state0, in[0] );
+         out[1] = _mm512_xor_si512( state1, in[1] );
+         out[2] = _mm512_xor_si512( state2, in[2] );
+
+         //Input: next column (i.e., next block in sequence)
+         // NOTE(review): stride follows the BLOCK_LEN_M256I * 2 convention used by the other row functions in this file -- confirm against the matrix layout
+         in  += BLOCK_LEN_M256I * 2;
+         //Output: goes to previous column
+         out -= BLOCK_LEN_M256I * 2;
+    }
+
+    _mm512_store_si512( (__m512i*)State,     state0 );
+    _mm512_store_si512( (__m512i*)State + 1, state1 );
+    _mm512_store_si512( (__m512i*)State + 2, state2 );
+    _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+// Setup-phase duplex, one reduced round per column: absorbs rowIn[col] + rowInOut[col], writes
+// rowOut[C-1-col] = rowIn[col] XOR state, and XORs the word-rotated state into rowInOut[col]. State is updated in place.
+inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
+                       uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
+{
+    register __m512i state0, state1, state2, state3;
+    __m512i* in    = (__m512i*)rowIn;
+    __m512i* inout = (__m512i*)rowInOut;
+    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
+    __m512i  t0, t1, t2;
+
+    state0 = _mm512_load_si512( (__m512i*)State     );
+    state1 = _mm512_load_si512( (__m512i*)State + 1 );
+    state2 = _mm512_load_si512( (__m512i*)State + 2 );
+    state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+    for ( int i = 0; i < nCols; i++ )
+    {
+       state0 = _mm512_xor_si512( state0,
+                                  _mm512_add_epi64( in[0], inout[0] ) );
+       state1 = _mm512_xor_si512( state1,
+                                  _mm512_add_epi64( in[1], inout[1] ) );
+       state2 = _mm512_xor_si512( state2,
+                                  _mm512_add_epi64( in[2], inout[2] ) );
+
+       LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+       out[0] = _mm512_xor_si512( state0, in[0] );
+       out[1] = _mm512_xor_si512( state1, in[1] );
+       out[2] = _mm512_xor_si512( state2, in[2] );
+
+       //M[row*][col] = M[row*][col] XOR rotW(rand)
+       t0 = _mm512_permutex_epi64( state0, 0x93 );
+       t1 = _mm512_permutex_epi64( state1, 0x93 );
+       t2 = _mm512_permutex_epi64( state2, 0x93 );
+       // The mask comes first in _mm512_mask_blend_epi32; 0x0303 takes the low 64 bits of each 256-bit half from the second source.
+       inout[0] = _mm512_xor_si512( inout[0],
+                                 _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
+       inout[1] = _mm512_xor_si512( inout[1],
+                                 _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
+       inout[2] = _mm512_xor_si512( inout[2],
+                                 _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
+
+       //Inputs: next column (i.e., next block in sequence)
+       in    += BLOCK_LEN_M256I * 2;
+       inout += BLOCK_LEN_M256I * 2;
+       //Output: goes to previous column
+       out   -= BLOCK_LEN_M256I * 2;
+    }
+
+    _mm512_store_si512( (__m512i*)State,     state0 );
+    _mm512_store_si512( (__m512i*)State + 1, state1 );
+    _mm512_store_si512( (__m512i*)State + 2, state2 );
+    _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+// Duplexes one row with one reduced round per column: absorbs (rowIn1:rowIn0 gathered as 256-bit halves) + rowInOut,
+// XORs the state into rowOut and the word-rotated state into rowInOut. State is updated in place.
+// NOTE(review): completed from an unfinished draft (missing loop and declarations) -- verify lane order and the in0/in1 strides before relying on it.
+inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
+                uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
+                uint64_t nCols )
+{
+   register __m512i state0, state1, state2, state3;
+    __m256i *in0 = (__m256i*)rowIn0;
+    __m256i *in1 = (__m256i*)rowIn1;
+    __m512i* inout = (__m512i*)rowInOut;
+    __m512i* out   = (__m512i*)rowOut;
+    __m512i  t0, t1, t2;
+
+    _mm_prefetch( in0,     _MM_HINT_T0 );
+    _mm_prefetch( in1,     _MM_HINT_T0 );
+    _mm_prefetch( in0 + 2, _MM_HINT_T0 );
+    _mm_prefetch( in1 + 2, _MM_HINT_T0 );
+    _mm_prefetch( in0 + 4, _MM_HINT_T0 );
+    _mm_prefetch( in1 + 4, _MM_HINT_T0 );
+    _mm_prefetch( in0 + 6, _MM_HINT_T0 );
+    _mm_prefetch( in1 + 6, _MM_HINT_T0 );
+   
+   state0 = _mm512_load_si512( (__m512i*)State     );
+   state1 = _mm512_load_si512( (__m512i*)State + 1 );
+   state2 = _mm512_load_si512( (__m512i*)State + 2 );
+   state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+   for ( int i = 0; i < nCols; i++ )
+   {
+      //Absorbing "M[prev] [+] M[row*]": the two 256-bit inputs come from separate rows and are joined here (in1 first, in0 second)
+
+      t0 = mm512_concat_256( in1[0], in0[0] );
+      t1 = mm512_concat_256( in1[1], in0[1] );
+      t2 = mm512_concat_256( in1[2], in0[2] );
+      
+      state0 = _mm512_xor_si512( state0,
+                                     _mm512_add_epi64( t0, inout[0] ) );
+      state1 = _mm512_xor_si512( state1,
+                                     _mm512_add_epi64( t1, inout[1] ) );
+      state2 = _mm512_xor_si512( state2,
+                                     _mm512_add_epi64( t2, inout[2] ) );
+
+      //Applies the reduced-round transformation f to the sponge's state
+      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+      //M[rowOut][col] = M[rowOut][col] XOR rand
+      out[0] = _mm512_xor_si512( out[0], state0 );
+      out[1] = _mm512_xor_si512( out[1], state1 );
+      out[2] = _mm512_xor_si512( out[2], state2 );
+
+      //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand); the blend mask comes first, 0x0303 = low 64 bits of each 256-bit half
+      t0 = _mm512_permutex_epi64( state0, 0x93 );
+      t1 = _mm512_permutex_epi64( state1, 0x93 );
+      t2 = _mm512_permutex_epi64( state2, 0x93 );
+
+      inout[0] = _mm512_xor_si512( inout[0],
+                                   _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
+      inout[1] = _mm512_xor_si512( inout[1],
+                                   _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
+      inout[2] = _mm512_xor_si512( inout[2],
+                                   _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
+
+       //Goes to next block; NOTE(review): in0/in1 step by BLOCK_LEN_M256I as in the draft's leftover increments -- confirm
+       in0   += BLOCK_LEN_M256I;
+       in1   += BLOCK_LEN_M256I;
+       out   += BLOCK_LEN_M256I * 2;
+       inout += BLOCK_LEN_M256I * 2;
+   }
+
+   _mm512_store_si512( (__m512i*)State,     state0 );
+   _mm512_store_si512( (__m512i*)State + 1, state1 );
+   _mm512_store_si512( (__m512i*)State + 2, state2 );
+   _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+#endif // AVX512
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -52,8 +52,46 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // However, 2 way parallel looks trivial to code for AVX512 except for
 // a data dependency with rowa.

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define G2W_4X64(a,b,c,d) /* mixing step on four 512-bit vectors: 64-bit adds, xors and mm512_ror_64 by 32, 24, 16, 63; all four arguments are updated in place */ \
+   a = _mm512_add_epi64( a, b ); \
+   d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
+   c = _mm512_add_epi64( c, d ); \
+   b = mm512_ror_64( _mm512_xor_si512( b, c ), 24 ); \
+   a = _mm512_add_epi64( a, b ); \
+   d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
+   c = _mm512_add_epi64( c, d ); \
+   b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 ); /* the expansion is several statements ending in ';' and names each argument more than once: pass plain variables only */
+
+#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) /* one reduced round: mix, rotate s1..s3 (presumably within each 256-bit half, per the helper names), mix again, rotate back; updates all arguments in place */ \
+   G2W_4X64( s0, s1, s2, s3 ); \
+   s1 = mm512_ror1x64_256( s1 ); \
+   s2 = mm512_swap128_256( s2 ); \
+   s3 = mm512_rol1x64_256( s3 ); \
+   G2W_4X64( s0, s1, s2, s3 ); \
+   s1 = mm512_rol1x64_256( s1 ); \
+   s2 = mm512_swap128_256( s2 ); \
+   s3 = mm512_ror1x64_256( s3 );
+
+#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) /* twelve back-to-back rounds; no separators are written because LYRA_ROUND_2WAY_AVX512 already ends in ';' */ \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 )
+
+
+#endif  // AVX512
+
 #if defined __AVX2__
-// only available with avx2

 // process 4 columns in parallel
 // returns void, updates all args
@@ -89,9 +127,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
-   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 )

-#elif defined(__SSE2__)
+#endif
+
+#if defined(__SSE2__)

 // process 2 columns in parallel
 // returns void, all args updated
@@ -129,7 +169,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
-   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7)


 #endif // AVX2 else SSE2
@@ -161,6 +201,30 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    G(r,7,v[ 3],v[ 4],v[ 9],v[14]);


+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+//---- Housekeeping (2-way AVX512 sponge; implementations are in sponge-2way.c)
+void initState_2way( uint64_t state[/*16*/] ); // NOTE(review): sponge-2way.c defines no initState_2way, and the other 2-way functions access state as four 512-bit vectors -- confirm this prototype and the 16 in its signature
+
+//---- Squeezes: copy sponge output to out / to a matrix row
+void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len );
+void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );
+
+//---- Absorbs: XOR input blocks into the state and apply 12 rounds
+void absorbBlock_2way( uint64_t *state, const uint64_t *in );
+void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in,
+                            const uint64_t nBlocks, const uint64_t block_len );
+
+//---- Duplexes: absorb from input rows and write output rows, one reduced round per column
+void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
+                             uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
+                    uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
+void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
+
+#endif
+
+
 //---- Housekeeping
 void initState(uint64_t state[/*16*/]);

@@ -178,20 +242,4 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint6
 void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
 void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);

-//---- Misc
-void printArray(unsigned char *array, unsigned int size, char *name);
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-////TESTS////
-//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
-//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-/////////////
-
-
 #endif /* SPONGE_H_ */
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -3,22 +3,129 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
-#if defined(NIST5_4WAY)
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"

-void nist5hash_4way( void *out, const void *input )
+#if defined(NIST5_8WAY)
+
+// nist5hash_8way: runs the five NIST5 stages -- Blake-512, Groestl-512,
+// JH-512, Keccak-512, Skein-512 -- over 8 inputs held in the 8x64
+// interleaved layout. 80 is passed as the Blake input length and 64 as the
+// length for each later stage; the Skein result is written to out.
+void nist5hash_8way( void *out, const void *input )
+{
+     uint64_t vhash[8*16] __attribute__ ((aligned (128)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     // The same per-lane buffers, addressable by lane index.
+     uint64_t *lane_buf[8] = { hash0, hash1, hash2, hash3,
+                               hash4, hash5, hash6, hash7 };
+
+     // One context per stage; the Groestl context is re-initialized per lane.
+     blake512_8way_context  ctx_blake;
+     hashState_groestl      ctx_groestl;
+     jh512_8way_context     ctx_jh;
+     skein512_8way_context  ctx_skein;
+     keccak512_8way_context ctx_keccak;
+
+     // Stage 1: Blake-512 over the interleaved input.
+     blake512_8way_init( &ctx_blake );
+     blake512_8way_update( &ctx_blake, input, 80 );
+     blake512_8way_close( &ctx_blake, vhash );
+
+     // Groestl is run with the single-lane implementation, so the Blake
+     // output is de-interleaved into one buffer per lane first.
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash, 512 );
+
+     // Stage 2: Groestl-512, lane by lane in order 0..7; each buffer is both
+     // the input and the output of its call.
+     for ( int lane = 0; lane < 8; lane++ )
+     {
+        init_groestl( &ctx_groestl, 64 );
+        update_and_final_groestl( &ctx_groestl, (char*)lane_buf[lane],
+                                  (const char*)lane_buf[lane], 512 );
+     }
+
+     // Back to the interleaved layout for the remaining 8-way stages.
+     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                         hash7, 512 );
+
+     // Stage 3: JH-512, in place on vhash.
+     jh512_8way_init( &ctx_jh );
+     jh512_8way_update( &ctx_jh, vhash, 64 );
+     jh512_8way_close( &ctx_jh, vhash );
+
+     // Stage 4: Keccak-512, in place on vhash.
+     keccak512_8way_init( &ctx_keccak );
+     keccak512_8way_update( &ctx_keccak, vhash, 64 );
+     keccak512_8way_close( &ctx_keccak, vhash );
+
+     // Stage 5: Skein-512; the final result goes straight to the caller's
+     // buffer rather than back into vhash.
+     skein512_8way_init( &ctx_skein );
+     skein512_8way_update( &ctx_skein, vhash, 64 );
+     skein512_8way_close( &ctx_skein, out );
+}
+
+// Scans nonces 8 at a time starting at work->data[19]: a lane whose checked hash word is below the target word gets the full test and, outside benchmark mode, is submitted. Always returns 0; *hashes_done receives the number of nonces covered.
+int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{
+     uint32_t hash[16*8] __attribute__ ((aligned (128)));
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+     uint32_t *hash7 = &(hash[49]); // presumably lane 0's hash word 7; the other lanes are read at a stride of 2 below
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     const uint32_t first_nonce = pdata[19];
+     uint32_t n = first_nonce;
+     const uint32_t Htarg = ptarget[7];
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+     int thr_id = mythr->id;  
+
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     do
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                 n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+        nist5hash_8way( hash, vdata );
+        for ( int lane = 0; lane < 8; lane++ )
+        {
+           if ( hash7[ lane<<1 ] >= Htarg ) continue;   // cheap reject
+           extr_lane_8x64( lane_hash, hash, lane, 256 );
+           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+           {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+           }
+        }
+        n += 8;
+     } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
+     return 0;
+}
+
+#elif defined(NIST5_4WAY)
+
+void nist5hash_4way( void *out, const void *input )
+{
+     uint64_t vhash[8*4] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
-     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     blake512_4way_context  ctx_blake;
     hashState_groestl      ctx_groestl;
     jh512_4way_context     ctx_jh;
@@ -62,62 +169,39 @@ void nist5hash_4way( void *out, const void *input )
 int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
+     uint32_t vdata[4*24] __attribute__ ((aligned (128)));
     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t *hash7 = &(hash[25]);
     uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     const uint32_t Htarg = ptarget[7];
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-     int thr_id = mythr->id;  // thr_id arg is deprecated
-
-     uint64_t htmax[] = {          0,
-                                 0xF,
-                                0xFF,
-                               0xFFF,
-                              0xFFFF,
-                          0x10000000 };
-
-     uint32_t masks[] = { 0xFFFFFFFF,
-                          0xFFFFFFF0,
-                          0xFFFFFF00,
-                          0xFFFFF000,
-                          0xFFFF0000,
-                                   0 };
+     int thr_id = mythr->id;  

     mm256_bswap32_intrlv80_4x64( vdata, pdata );

-     for ( int m=0; m < 6; m++ )
-     {
-        if (Htarg <= htmax[m])
+     do {
+        *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+               _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+
+        nist5hash_4way( hash, vdata );
+
+        for ( int lane = 0; lane < 4; lane++ )
+        if ( hash7[ lane<<1 ] < Htarg )
        {
-           uint32_t mask = masks[m];
-
-           do {
-              *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-
-              nist5hash_4way( hash, vdata );
-
-              for ( int lane = 0; lane < 4; lane++ )
-              if ( ( hash7[ lane ] & mask ) == 0 )
-              {
-                 extr_lane_4x64( lane_hash, hash, lane, 256 );
-                 if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-                 {
-                    pdata[19] = n + lane;
-                    submit_lane_solution( work, lane_hash, mythr, lane );
-                 }
-              }
-              n += 4;
-           } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
-           break;
+           extr_lane_4x64( lane_hash, hash, lane, 256 );
+           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+           {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+           }
        }
-     }
-     *hashes_done = n - first_nonce + 1;
+        n += 4;
+     } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
     return 0;
 }

--- a/algo/nist5/nist5-gate.c
+++ b/algo/nist5/nist5-gate.c
@@ -2,8 +2,11 @@

 bool register_nist5_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-#if defined (NIST5_4WAY)
+    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+#if defined (NIST5_8WAY)
+    gate->scanhash = (void*)&scanhash_nist5_8way;
+    gate->hash     = (void*)&nist5hash_8way;
+#elif defined (NIST5_4WAY)
    gate->scanhash = (void*)&scanhash_nist5_4way;
    gate->hash     = (void*)&nist5hash_4way;
 #else
--- a/algo/nist5/nist5-gate.h
+++ b/algo/nist5/nist5-gate.h
@@ -1,14 +1,23 @@
 #ifndef __NIST5_GATE_H__
-#define __NIST5_GATE_H__
+#define __NIST5_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define NIST5_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define NIST5_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define NIST5_4WAY 1
 #endif

-#if defined(NIST5_4WAY)
+#if defined(NIST5_8WAY)
+
+void nist5hash_8way( void *state, const void *input );
+
+int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(NIST5_4WAY)

 void nist5hash_4way( void *state, const void *input );

--- a/algo/quark/hmq1725-gate.h
+++ b/algo/quark/hmq1725-gate.h
@@ -5,7 +5,7 @@
 #include <stdint.h>

 #if defined(__AVX2__) && defined(__AES__)
-//  #define HMQ1725_4WAY
+//  #define HMQ1725_4WAY 1
 #endif

 bool register_hmq1725_algo( algo_gate_t* gate );
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -1,12 +1,8 @@
 #include "cpuminer-config.h"
 #include "quark-gate.h"
-
-#if defined (QUARK_4WAY)
-
 #include <stdio.h>
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
@@ -14,6 +10,258 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"

+#if defined (QUARK_8WAY)
+
+// One context per hash function used by quark_8way_hash. All members are
+// 8 way contexts except groestl, which quark_8way_hash runs one lane at a
+// time.
+typedef struct {
+    blake512_8way_context  blake;
+    bmw512_8way_context    bmw;
+    hashState_groestl      groestl;
+    jh512_8way_context     jh;
+    skein512_8way_context  skein;
+    keccak512_8way_context keccak;
+} quark_8way_ctx_holder;
+
+// Initialised template, filled in by init_quark_8way_ctx and copied into a
+// local context at the start of every quark_8way_hash call.
+quark_8way_ctx_holder quark_8way_ctx __attribute__ ((aligned (128)));
+
+// Initialise every member of the shared template quark_8way_ctx.
+// Takes no arguments and returns nothing; declared with (void) so the
+// definition is a real prototype rather than an unspecified parameter list.
+void init_quark_8way_ctx( void )
+{
+     blake512_8way_init( &quark_8way_ctx.blake );
+     bmw512_8way_init( &quark_8way_ctx.bmw );
+     init_groestl( &quark_8way_ctx.groestl, 64 );
+     skein512_8way_init( &quark_8way_ctx.skein );
+     jh512_8way_init( &quark_8way_ctx.jh );
+     keccak512_8way_init( &quark_8way_ctx.keccak );
+}
+
+// Compute the Quark hash chain for eight lanes in parallel.
+//
+// input : 80 bytes per lane, passed straight to blake512_8way_update.
+// state : receives four 512 bit rows (see the final blend), i.e. the first
+//         four 64 bit words of every lane, still interleaved.
+//
+// Three times in the chain, bit 3 (value 8) of each lane's first 64 bit word
+// selects which of two hash functions that lane uses next. Both candidates
+// are computed and the per lane result is picked with a blend on vh_mask.
+void quark_8way_hash( void *state, const void *input )
+{
+    uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+    uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
+    uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
+    uint64_t hash0[8] __attribute__ ((aligned (64)));
+    uint64_t hash1[8] __attribute__ ((aligned (64)));
+    uint64_t hash2[8] __attribute__ ((aligned (64)));
+    uint64_t hash3[8] __attribute__ ((aligned (64)));
+    uint64_t hash4[8] __attribute__ ((aligned (64)));
+    uint64_t hash5[8] __attribute__ ((aligned (64)));
+    uint64_t hash6[8] __attribute__ ((aligned (64)));
+    uint64_t hash7[8] __attribute__ ((aligned (64)));
+    __m512i* vh  = (__m512i*)vhash;
+    __m512i* vhA = (__m512i*)vhashA;
+    __m512i* vhB = (__m512i*)vhashB;
+    __mmask8 vh_mask;
+    quark_8way_ctx_holder ctx;
+    const uint32_t mask = 8;
+    const __m512i bit3_mask = m512_const1_64( mask );
+    const __m512i zero = _mm512_setzero_si512();
+
+    // Start from the pre-initialised template contexts.
+    memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
+
+    blake512_8way_update( &ctx.blake, input, 80 );
+    blake512_8way_close( &ctx.blake, vhash );
+
+    bmw512_8way_update( &ctx.bmw, vhash, 64 );
+    bmw512_8way_close( &ctx.bmw, vhash );
+
+// AVX512 cmpeq returns a bit mask (__mmask8) instead of a vector mask.
+// That should simplify things, but the author reported that the logic did
+// not seem to work. The problem appeared to be the test that skips a hash
+// when no lane needs it: the hash selection blend worked whenever the hash
+// was produced, but the hash was not being produced when it should have
+// been. Both decisions use the same __mmask8; it works as a blend mask but
+// not in a logical comparison. The type may be the problem, and a cast to
+// int or a movm may be needed.
+// The reported workaround was to skip the test for all 8 way hashes. That
+// was considered moot for performance because, hashing 8 ways in parallel,
+// a hash can only be skipped 1 in 256 iterations, so the impact should be
+// negligible. It was left as a problem for another day.
+// NOTE(review): the conditionals below do test vh_mask, so this remark may
+// be out of date -- confirm.
+
+    // vh_mask bit i is set when bit 3 of lane i's first word is clear.
+    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
+                                       zero );
+
+    dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                  vhash, 512 );
+
+    // Lanes with bit 3 set take groestl, one lane at a time. The context is
+    // reinitialised before every lane after the first.
+    if ( hash0[0] & mask )
+    {
+       update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                               (char*)hash0, 512 );
+    }
+    if ( hash1[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                               (char*)hash1, 512 );
+    }
+    if ( hash2[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                               (char*)hash2, 512 );
+    }
+    if ( hash3[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                               (char*)hash3, 512 );
+    }
+    if ( hash4[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash4,
+                                               (char*)hash4, 512 );
+    }
+    if ( hash5[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash5,
+                                               (char*)hash5, 512 );
+    }
+    if ( hash6[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash6,
+                                               (char*)hash6, 512 );
+    }
+    if ( hash7[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash7,
+                                               (char*)hash7, 512 );
+    }
+
+    // vhashA: groestl results (untouched data for lanes that skipped it).
+    intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                         hash7, 512 );
+
+    // At least one lane has bit 3 clear: compute skein for all lanes.
+    if ( vh_mask & 0xff )
+    {
+       skein512_8way_update( &ctx.skein, vhash, 64 );
+       skein512_8way_close( &ctx.skein, vhashB );
+    }
+
+    // Per lane choice between vhashA and vhashB, written back to vhash.
+    mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
+
+    dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                  vhash, 512 );
+
+    // Unconditional groestl round on every lane.
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+    intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 512 );
+
+    jh512_8way_update( &ctx.jh, vhash, 64 );
+    jh512_8way_close( &ctx.jh, vhash );
+
+    // Second selection: blake into vhashA, bmw into vhashB.
+    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
+                                       zero );
+
+    // Not every lane has bit 3 clear, so at least one lane needs blake.
+    if ( ( vh_mask & 0xff ) != 0xff )
+    {
+       blake512_8way_init( &ctx.blake );
+       blake512_8way_update( &ctx.blake, vhash, 64 );
+       blake512_8way_close( &ctx.blake, vhashA );
+    }
+
+    // At least one lane has bit 3 clear and needs bmw.
+    if ( vh_mask & 0xff )
+    {
+       bmw512_8way_init( &ctx.bmw );
+       bmw512_8way_update( &ctx.bmw, vhash, 64 );
+       bmw512_8way_close( &ctx.bmw, vhashB );
+    }
+
+    mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
+
+    keccak512_8way_update( &ctx.keccak, vhash, 64 );
+    keccak512_8way_close( &ctx.keccak, vhash );
+
+    // The skein context may have been used above, so start it afresh.
+    skein512_8way_init( &ctx.skein );
+    skein512_8way_update( &ctx.skein, vhash, 64 );
+    skein512_8way_close( &ctx.skein, vhash );
+
+    // Third selection: keccak into vhashA, jh into vhashB.
+    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
+                                       zero );
+
+    if ( ( vh_mask & 0xff ) != 0xff )
+    {
+       keccak512_8way_init( &ctx.keccak );
+       keccak512_8way_update( &ctx.keccak, vhash, 64 );
+       keccak512_8way_close( &ctx.keccak, vhashA );
+    }
+
+    if ( vh_mask & 0xff )
+    {
+       jh512_8way_init( &ctx.jh );
+       jh512_8way_update( &ctx.jh, vhash, 64 );
+       jh512_8way_close( &ctx.jh, vhashB );
+    }
+
+    // Final blend, directly to state. Only the first four rows are written,
+    // which is 32 bytes of hash per lane. Lanes whose mask bit is set take
+    // vhashB, the others take vhashA.
+    casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] );
+    casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] );
+    casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] );
+    casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] );
+}
+
+// Scan nonces eight at a time for a Quark hash that meets work->target.
+//
+// work        : block header in work->data (nonce in word 19) and the
+//               target in work->target.
+// max_nonce   : upper limit of the nonce range; the loop ends once
+//               n >= max_nonce-8.
+// hashes_done : receives the number of nonces tried by this call.
+// mythr       : calling thread; its id selects the work_restart flag that
+//               is polled after every batch of eight.
+//
+// Always returns 0; solutions are reported through submit_lane_solution().
+int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{
+    uint32_t hash[8*8] __attribute__ ((aligned (128)));
+    uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+    // First candidate word checked for lane 0; lanes are two 32 bit words
+    // apart. Assuming the 8x64 interleave read by extr_lane_8x64 this is
+    // 32 bit word 7 of each lane's hash.
+    uint32_t *hash7 = &(hash[49]);
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint32_t n = pdata[19];
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t Htarg = ptarget[7];
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+    int thr_id = mythr->id;
+
+    mm512_bswap32_intrlv80_8x64( vdata, pdata );
+    do
+    {
+       // Write nonces n..n+7, byte swapped, into the nonce row of vdata.
+       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+
+       quark_8way_hash( hash, vdata );
+       pdata[19] = n;
+
+       for ( int i = 0; i < 8; i++ )
+       // Pre-filter against the real target. A fixed 0xFFFFFF00 mask would
+       // drop valid shares whenever target[7] is larger than 0xFF. fulltest
+       // remains the authoritative check.
+       if ( hash7[ i<<1 ] <= Htarg )
+       {
+          extr_lane_8x64( lane_hash, hash, i, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark  )
+          {
+            pdata[19] = n+i;
+            submit_lane_solution( work, lane_hash, mythr, i );
+          }
+       }
+       n += 8;
+    } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
+
+    *hashes_done = n - first_nonce;
+    return 0;
+}
+
+
+#elif defined (QUARK_4WAY)
+
 typedef struct {
    blake512_4way_context  blake;
    bmw512_4way_context    bmw;
@@ -91,7 +339,7 @@ void quark_4way_hash( void *state, const void *input )

    intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

-    if ( mm256_anybits0( vh_mask ) )   
+    if ( mm256_anybits1( vh_mask ) )   
    {
       skein512_4way( &ctx.skein, vhash, 64 );
       skein512_4way_close( &ctx.skein, vhashB );
@@ -117,14 +365,14 @@ void quark_4way_hash( void *state, const void *input )

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

-    if ( mm256_anybits1( vh_mask ) )
+    if ( mm256_anybits0( vh_mask ) )   
    {
       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
       blake512_4way_close( &ctx.blake, vhashA );
    }

-    if ( mm256_anybits0( vh_mask ) )
+    if ( mm256_anybits1( vh_mask ) )
    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way( &ctx.bmw, vhash, 64 );
@@ -142,14 +390,14 @@ void quark_4way_hash( void *state, const void *input )

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

-    if ( mm256_anybits1( vh_mask ) )
+    if ( mm256_anybits0( vh_mask ) )    
    {
       keccak512_4way_init( &ctx.keccak );
       keccak512_4way( &ctx.keccak, vhash, 64 );
       keccak512_4way_close( &ctx.keccak, vhashA );
    }

-    if ( mm256_anybits0( vh_mask ) )
+    if ( mm256_anybits1( vh_mask ) )
    {
       jh512_4way_init( &ctx.jh );
       jh512_4way( &ctx.jh, vhash, 64 );
--- a/algo/quark/quark-gate.c
+++ b/algo/quark/quark-gate.c
@@ -2,7 +2,11 @@

 bool register_quark_algo( algo_gate_t* gate )
 {
-#if defined (QUARK_4WAY)
+#if defined (QUARK_8WAY)
+  init_quark_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_quark_8way;
+  gate->hash      = (void*)&quark_8way_hash;
+#elif defined (QUARK_4WAY)
  init_quark_4way_ctx();
  gate->scanhash  = (void*)&scanhash_quark_4way;
  gate->hash      = (void*)&quark_4way_hash;
@@ -11,7 +15,7 @@ bool register_quark_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_quark;
  gate->hash      = (void*)&quark_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/quark/quark-gate.h
+++ b/algo/quark/quark-gate.h
@@ -4,13 +4,22 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define QUARK_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define QUARK_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define QUARK_4WAY 1
 #endif

 bool register_quark_algo( algo_gate_t* gate );

-#if defined(QUARK_4WAY)
+#if defined(QUARK_8WAY)
+
+void quark_8way_hash( void *state, const void *input );
+int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_quark_8way_ctx();
+
+#elif defined(QUARK_4WAY)

 void quark_4way_hash( void *state, const void *input );
 int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -1,17 +1,135 @@
 #include "qubit-gate.h"
-
-#if defined(QUBIT_2WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/cubehash_sse2.h" 
+#include "algo/cubehash/cube-hash-2way.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined(QUBIT_4WAY)
+
+// Hashing contexts used by the 4 way Qubit implementation. luffa, cube and
+// simd are 4 way contexts; shavite and echo are single lane contexts that
+// qubit_4way_hash applies to each lane in turn.
+typedef struct
+{
+    luffa_4way_context      luffa;
+    cube_4way_context       cube;
+    sph_shavite512_context  shavite;
+    simd_4way_context       simd;
+    // Initialised by init_qubit_4way_ctx but not referenced by
+    // qubit_4way_hash.
+    simd_2way_context       simd2;
+    hashState_echo          echo;
+} qubit_4way_ctx_holder;
+
+// Shared template copied at the start of every qubit_4way_hash call. The
+// luffa member is set up by scanhash_qubit_4way rather than by
+// init_qubit_4way_ctx.
+qubit_4way_ctx_holder qubit_4way_ctx;
+
+// Initialise the cube, shavite, simd and echo members of the shared template
+// qubit_4way_ctx. The luffa member is not touched here; scanhash_qubit_4way
+// initialises it for each work item.
+// Declared with (void) so the definition is a real prototype, and without
+// the stray ';' that followed the closing brace.
+void init_qubit_4way_ctx( void )
+{
+    cube_4way_init( &qubit_4way_ctx.cube, 512, 16, 32 );
+    sph_shavite512_init(&qubit_4way_ctx.shavite);
+    simd_4way_init( &qubit_4way_ctx.simd, 512 );
+    simd_2way_init( &qubit_4way_ctx.simd2, 512 );
+    init_echo(&qubit_4way_ctx.echo, 512);
+}
+
+// Compute the Qubit hash chain (luffa, cubehash, shavite, simd, echo) for
+// four lanes at once.
+//
+// input  : interleaved block data. Only 16 bytes per lane, starting at byte
+//          offset 64<<2, are hashed here; the luffa context copied from
+//          qubit_4way_ctx is expected to hold the state for the preceding
+//          64 bytes per lane (scanhash_qubit_4way sets that up).
+// output : receives four consecutive 32 byte hashes, lane 0 first.
+void qubit_4way_hash( void *output, const void *input )
+{
+     uint32_t vhash[16*4] __attribute__ ((aligned (128)));
+     uint32_t hash0[16] __attribute__ ((aligned (64)));
+     uint32_t hash1[16] __attribute__ ((aligned (64)));
+     uint32_t hash2[16] __attribute__ ((aligned (64)));
+     uint32_t hash3[16] __attribute__ ((aligned (64)));
+     qubit_4way_ctx_holder ctx;
+     // Byte typed views: arithmetic on void pointers is a GNU extension,
+     // not ISO C.
+     const uint8_t *in = (const uint8_t*)input;
+     uint8_t *out = (uint8_t*)output;
+
+     memcpy( &ctx, &qubit_4way_ctx, sizeof(qubit_4way_ctx) );
+
+     luffa_4way_update( &ctx.luffa, in + (64<<2), 16 );
+     luffa_4way_close( &ctx.luffa, vhash );
+
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+
+     // shavite is single lane: restore the template context between lanes.
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+
+     // echo is single lane as well: same restore-from-template pattern.
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+
+     // Only the first 32 bytes of each lane's result are returned.
+     memcpy( out,      hash0, 32 );
+     memcpy( out + 32, hash1, 32 );
+     memcpy( out + 64, hash2, 32 );
+     memcpy( out + 96, hash3, 32 );
+}
+
+// Scan nonces four at a time for a Qubit hash that meets work->target.
+//
+// work        : block header in work->data (nonce in word 19) and the
+//               target in work->target.
+// max_nonce   : upper limit of the nonce range; the loop ends once
+//               n >= max_nonce-4.
+// hashes_done : receives the number of nonces tried by this call.
+// mythr       : calling thread; its id selects the work_restart flag that
+//               is polled after every batch of four.
+//
+// Always returns 0; solutions are reported through submit_lane_solution().
+int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (128)));
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     // Nonce word of lane 0; the other lanes follow at a stride of four
+     // 32 bit words (see the be32enc calls below).
+     uint32_t *noncep = vdata + 64+3;   // 4*16 + 3
+     int thr_id = mythr->id;
+     const uint32_t Htarg = ptarget[7];
+
+     mm512_bswap32_intrlv80_4x128( vdata, pdata );
+     // Absorb the first 64 bytes of every lane once per work item;
+     // qubit_4way_hash continues from this state for each nonce.
+     // NOTE(review): qubit_4way_ctx is a file scope object, so these writes
+     // are shared by every caller -- confirm this is safe when several
+     // threads scan concurrently.
+     luffa_4way_init( &qubit_4way_ctx.luffa, 512 );
+     luffa_4way_update( &qubit_4way_ctx.luffa, vdata, 64 );
+
+     do
+     {
+        be32enc( noncep,    n   );
+        be32enc( noncep+ 4, n+1 );
+        be32enc( noncep+ 8, n+2 );
+        be32enc( noncep+12, n+3 );
+
+        qubit_4way_hash( hash, vdata );
+        pdata[19] = n;
+
+        for ( int lane = 0; lane < 4; lane++ )
+        // Cheap pre-filter. Must be <=, not <: a hash whose top word equals
+        // the target's top word can still pass fulltest on the lower words.
+        if ( ( hash+(lane<<3) )[7] <= Htarg )
+        if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
+        {
+           pdata[19] = n + lane;
+           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
+        }
+        n += 4;
+     } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
+     return 0;
+}
+
+#elif defined(QUBIT_2WAY)
+
 typedef struct
 {
        luffa_2way_context      luffa;
--- a/algo/qubit/qubit-gate.c
+++ b/algo/qubit/qubit-gate.c
@@ -2,7 +2,12 @@

 bool register_qubit_algo( algo_gate_t* gate )
 {
-#if defined (QUBIT_2WAY)
+   
+#if defined (QUBIT_4WAY)
+  init_qubit_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_qubit_4way;
+  gate->hash      = (void*)&qubit_4way_hash;
+#elif defined (QUBIT_2WAY)
  init_qubit_2way_ctx();
  gate->scanhash  = (void*)&scanhash_qubit_2way;
  gate->hash      = (void*)&qubit_2way_hash;
@@ -11,7 +16,7 @@ bool register_qubit_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_qubit;
  gate->hash      = (void*)&qubit_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/qubit/qubit-gate.h
+++ b/algo/qubit/qubit-gate.h
@@ -4,13 +4,23 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define QUBIT_2WAY
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define QUBIT_4WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define QUBIT_2WAY 1
 #endif

 bool register_qubit_algo( algo_gate_t* gate );

-#if defined(QUBIT_2WAY)
+#if defined(QUBIT_4WAY)
+
+void qubit_4way_hash( void *state, const void *input );
+int scanhash_qubit_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_qubit_4way_ctx();
+
+#elif defined(QUBIT_2WAY)

 void qubit_2way_hash( void *state, const void *input );
 int scanhash_qubit_2way( struct work *work, uint32_t max_nonce,
--- a/algo/sha/sha256_hash_11way.c
+++ b/algo/sha/sha256_hash_11way.c
@@ -1,538 +0,0 @@
-#if 0
-
-#include <stddef.h>
-#include <string.h>
-
-#include "sha2-hash-4way.h"
-
-#if defined(__AVX2__)
-
-// naming convention for variables and macros
-// VARx: AVX2 8 way 32 bit
-// VARy: MMX 2 way 32 bit
-// VARz: scalar integer 32 bit
-
-
-static const uint32_t H256[8] =
-{
-        0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-        0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-};
-
-static const uint32_t K256[64] = 
-{
-        0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
-        0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
-        0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
-        0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
-        0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
-        0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
-        0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
-        0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
-        0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
-        0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
-        0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
-        0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
-        0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
-        0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
-        0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
-        0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
-};
-
-#define CHx(X, Y, Z) \
-   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 
-
-#define CHy(X, Y, Z) \
-   _mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z )
-
-#define CHz(X, Y, Z) ((( (Y) ^ (Z) ) & (X) ) ^ (Z) )
-
-
-#define MAJx(X, Y, Z) \
-   _mm256_or_si256( _mm256_and_si256( X, Y ), \
-                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
-
-#define MAJy(X, Y, Z) \
-   _mm_or_si64( _mm_and_si64( X, Y ), \
-                    _mm_and_si64( _mm_or_si64( X, Y ), Z ) )
-
-#define MAJz(X, Y, Z)  ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) )
-
-#define BSG2_0x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-       mm256_ror_32(x,2), mm256_ror_32(x,13) ), _mm256_srli_epi32(x,22) )
-
-#define BSG2_0y(x) \
-   _mm_xor_si64( _mm_xor_si64( \
-       mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) )
-
-#define BSG2_0z(x)  ( u32_ror_32(x,2) ^ u32_ror_32(x,13)  ^ ((x)>>22) )
-
-#define BSG2_1x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-       mm256_ror_32(x,6), mm256_ror_32(x,11) ), _mm256_srli_epi32(x,25) )
-
-#define BSG2_1y(x) \
-   _mm_xor_si64( _mm_xor_si64( \
-       mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) )
-
-#define BSG2_1z(x)   ( u32_ror_32(x,6) ^ u32_ror_32(x,11) ^ ((x)>>25) )
-
-#define SSG2_0x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-       mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) ) 
-
-#define SSG2_0y(x) \
-   _mm_xor_si64( _mm_xor_si64( \
-       mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )
-
-#define SSG2_0z(x)  (( u32_ror_32(x,7) ^ u32_ror_32(x,18) ) ^ ((x)>>3) )
-
-#define SSG2_1x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-       mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) )
-
-#define SSG2_1y(x) \
-   _mm_xor_si64( _mm_xor_si64( \
-       mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )
-
-#define SSG2_1z(x)   ( u32_ror_32(x,17) ^ u32_ror_32(x,19)  ^ ((x)>>10) )
-
-#define SHA2x_MEXP( a, b, c, d ) \
-     _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
-                 SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] )
-
-#define SHA2y_MEXP( a, b, c, d ) \
-     _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
-                 SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] )
-
-#define SHA2z_MEXP( a, b, c, d ) \
-               ( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] )
-
-
-#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
-	                  Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
-		          Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
-do { \
-  __m256i T1x, T2x; \
-  __m64 T1y, T2y; \
-  uint32_t T1z, T2z; \
-  T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
-        _mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
-                          _mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \
-  T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
-        _mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
-                          _mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \
-  T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \
-  T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
-  T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
-  T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
-  Dx  = _mm256_add_epi32( Dx,  T1x ); \
-  Dy  = _mm_add_pi32( Dy, T1y ); \
-  Dz  = Dz + T1z; \
-  Hx  = _mm256_add_epi32( T1x, T2x ); \
-  Hy  = _mm_add_pi32( T1y, T2y ); \
-  Hz  = T1z + T2z; \
-} while (0)
-	
-void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
-                         uint32_t *inz, uint32_t rz[8] )
-{
-   __m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
-   __m256i Wx[16];
-   __m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy;
-   __m64 Wy[16];
-   uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz;
-   uint32_t Wz[16];
-
-   Wx[ 0] = mm256_bswap_32( inx[ 0] );
-   Wy[ 0] =  mm64_bswap_32( iny[ 0] );
-   Wz[ 0] =       bswap_32( inz[ 0] );
-
-   Wx[ 1] = mm256_bswap_32( inx[ 1] );
-   Wy[ 1] =  mm64_bswap_32( iny[ 1] );
-   Wz[ 1] =       bswap_32( inz[ 1] );
-
-   Wx[ 2] = mm256_bswap_32( inx[ 2] );
-   Wy[ 2] =  mm64_bswap_32( iny[ 2] );
-   Wz[ 2] =       bswap_32( inz[ 2] );
-
-   Wx[ 3] = mm256_bswap_32( inx[ 3] );
-   Wy[ 3] =  mm64_bswap_32( iny[ 3] );
-   Wz[ 3] =       bswap_32( inz[ 3] );
-
-   Wx[ 4] = mm256_bswap_32( inx[ 4] );
-   Wy[ 4] =  mm64_bswap_32( iny[ 4] );
-   Wz[ 4] =       bswap_32( inz[ 4] );
-
-   Wx[ 5] = mm256_bswap_32( inx[ 5] );
-   Wy[ 5] =  mm64_bswap_32( iny[ 5] );
-   Wz[ 5] =       bswap_32( inz[ 5] );
-
-   Wx[ 6] = mm256_bswap_32( inx[ 6] );
-   Wy[ 6] =  mm64_bswap_32( iny[ 6] );
-   Wz[ 6] =       bswap_32( inz[ 6] );
-
-   Wx[ 7] = mm256_bswap_32( inx[ 7] );
-   Wy[ 7] =  mm64_bswap_32( iny[ 7] );
-   Wz[ 7] =       bswap_32( inz[ 7] );
-
-   Wx[ 8] = mm256_bswap_32( inx[ 8] );
-   Wy[ 8] =  mm64_bswap_32( iny[ 8] );
-   Wz[ 8] =       bswap_32( inz[ 8] );
-
-   Wx[ 9] = mm256_bswap_32( inx[ 9] );
-   Wy[ 9] =  mm64_bswap_32( iny[ 9] );
-   Wz[ 9] =       bswap_32( inz[ 9] );
-
-   Wx[10] = mm256_bswap_32( inx[10] );
-   Wy[10] =  mm64_bswap_32( iny[10] );
-   Wz[10] =       bswap_32( inz[10] );
-
-   Wx[11] = mm256_bswap_32( inx[11] );
-   Wy[11] =  mm64_bswap_32( iny[11] );
-   Wz[11] =       bswap_32( inz[11] );
-
-   Wx[12] = mm256_bswap_32( inx[12] );
-   Wy[12] =  mm64_bswap_32( iny[12] );
-   Wz[12] =       bswap_32( inz[12] );
-
-   Wx[13] = mm256_bswap_32( inx[13] );
-   Wy[13] =  mm64_bswap_32( iny[13] );
-   Wz[13] =       bswap_32( inz[13] );
-
-   Wx[14] = mm256_bswap_32( inx[14] );
-   Wy[14] =  mm64_bswap_32( iny[14] );
-   Wz[14] =       bswap_32( inz[14] );
-
-   Wx[15] = mm256_bswap_32( inx[15] );
-   Wy[15] =  mm64_bswap_32( iny[15] );
-   Wz[15] =       bswap_32( inz[15] );
-
-   Ax = rx[0];     Ay = ry[0];     Az = rz[0];
-   Bx = rx[1];     By = ry[1];     Bz = rz[1];
-   Cx = rx[2];     Cy = ry[2];     Cz = rz[2];
-   Dx = rx[3];     Dy = ry[3];     Dz = rz[3];
-   Ex = rx[4];     Ey = ry[4];     Ez = rz[4];
-   Fx = rx[5];     Fy = ry[5];     Fz = rz[5];
-   Gx = rx[6];     Gy = ry[6];     Gz = rz[6];
-   Hx = rx[7];     Hy = ry[7];     Hz = rz[7];
-
-   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
-                     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
-                     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  0, 0 );
-   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
-		     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
-		     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  1, 0 );
-   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
-		     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
-		     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz,  2, 0 );
-   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
-		     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
-		     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez,  3, 0 );
-   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
-		     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
-		     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz,  4, 0 );
-   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
-		     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
-		     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz,  5, 0 );
-   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
-		     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
-		     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz,  6, 0 );
-   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
-		     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
-		     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az,  7, 0 );
-   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
-		     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
-		     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  8, 0 );
-   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
-		     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
-		     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  9, 0 );
-   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
-		     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
-		     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, 0 );
-   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
-		     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
-		     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, 0 );
-   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
-		     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
-		     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, 0 );
-   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
-		     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
-		     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, 0 );
-   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
-		     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
-		     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, 0 );
-   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
-		     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
-		     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, 0 );
-
-   for ( int j = 16; j < 64; j += 16 )
-   {
-      Wx[ 0] = SHA2x_MEXP( 14,  9,  1,  0 );
-      Wy[ 0] = SHA2y_MEXP( 14,  9,  1,  0 );
-      Wz[ 0] = SHA2z_MEXP( 14,  9,  1,  0 );
-
-      Wx[ 1] = SHA2x_MEXP( 15, 10,  2,  1 );
-      Wy[ 1] = SHA2y_MEXP( 15, 10,  2,  1 );
-      Wz[ 1] = SHA2z_MEXP( 15, 10,  2,  1 );
-
-      Wx[ 2] = SHA2x_MEXP(  0, 11,  3,  2 );
-      Wy[ 2] = SHA2y_MEXP(  0, 11,  3,  2 );
-      Wz[ 2] = SHA2z_MEXP(  0, 11,  3,  2 );
-
-      Wx[ 3] = SHA2x_MEXP(  1, 12,  4,  3 );
-      Wy[ 3] = SHA2y_MEXP(  1, 12,  4,  3 );
-      Wz[ 3] = SHA2z_MEXP(  1, 12,  4,  3 );
-
-      Wx[ 4] = SHA2x_MEXP(  2, 13,  5,  4 );
-      Wy[ 4] = SHA2y_MEXP(  2, 13,  5,  4 );
-      Wz[ 4] = SHA2z_MEXP(  2, 13,  5,  4 );
-
-      Wx[ 5] = SHA2x_MEXP(  3, 14,  6,  5 );
-      Wy[ 5] = SHA2y_MEXP(  3, 14,  6,  5 );
-      Wz[ 5] = SHA2z_MEXP(  3, 14,  6,  5 );
-
-      Wx[ 6] = SHA2x_MEXP(  4, 15,  7,  6 );
-      Wy[ 6] = SHA2y_MEXP(  4, 15,  7,  6 );
-      Wz[ 6] = SHA2z_MEXP(  4, 15,  7,  6 );
-
-      Wx[ 7] = SHA2x_MEXP(  5,  0,  8,  7);
-      Wy[ 7] = SHA2y_MEXP(  5,  0,  8,  7);
-      Wz[ 7] = SHA2z_MEXP(  5,  0,  8,  7);
-
-      Wx[ 8] = SHA2x_MEXP(  6,  1,  9,  8);
-      Wy[ 8] = SHA2y_MEXP(  6,  1,  9,  8);
-      Wz[ 8] = SHA2z_MEXP(  6,  1,  9,  8);
-
-      Wx[ 9] = SHA2x_MEXP(  7,  2, 10,  9 );
-      Wy[ 9] = SHA2y_MEXP(  7,  2, 10,  9);
-      Wz[ 9] = SHA2z_MEXP(  7,  2, 10,  9);
-
-      Wx[10] = SHA2x_MEXP(  8,  3, 11, 10 );
-      Wy[10] = SHA2y_MEXP(  8,  3, 11, 10);
-      Wz[10] = SHA2z_MEXP(  8,  3, 11, 10);
-
-      Wx[11] = SHA2x_MEXP(  9,  4, 12, 11);
-      Wy[11] = SHA2y_MEXP(  9,  4, 12, 11);
-      Wz[11] = SHA2z_MEXP(  9,  4, 12, 11 );
-
-      Wx[12] = SHA2x_MEXP( 10,  5, 13, 12 );
-      Wy[12] = SHA2y_MEXP( 10,  5, 13, 12 );
-      Wz[12] = SHA2z_MEXP( 10,  5, 13, 12 );
-
-      Wx[13] = SHA2x_MEXP( 11,  6, 14, 13 );
-      Wy[13] = SHA2y_MEXP( 11,  6, 14, 13 );
-      Wz[13] = SHA2z_MEXP( 11,  6, 14, 13 );
-
-      Wx[14] = SHA2x_MEXP( 12,  7, 15, 14 );
-      Wy[14] = SHA2y_MEXP( 12,  7, 15, 14 );
-      Wz[14] = SHA2z_MEXP( 12,  7, 15, 14 );
-
-      Wx[15] = SHA2x_MEXP( 13,  8,  0, 15 );
-      Wy[15] = SHA2y_MEXP( 13,  8,  0, 15 );
-      Wz[15] = SHA2z_MEXP( 13,  8,  0, 15 );
-
-
-      SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
-                        Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
-			Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,	 0, j );
-      SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
-		        Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
-		       	Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  1, j );
-      SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
-		        Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
-		       	Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz,  2, j );
-      SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
-		        Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
-		       	Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez,  3, j );
-      SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
-		        Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
-		       	Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz,  4, j );
-      SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
-		        Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
-		       	Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz,  5, j );
-      SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
-		        Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
-		       	Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz,  6, j );
-      SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
-		        By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
-		       	Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az,  7, j );
-      SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
-                        Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
-                        Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  8, j );
-      SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, 
-                        Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, 
-                        Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  9, j );
-      SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, 
-                        Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, 
-                        Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j );
-      SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, 
-                        Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, 
-                        Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j );
-      SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, 
-                        Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, 
-                        Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j );
-      SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, 
-                        Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, 
-                        Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j );
-      SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, 
-                        Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, 
-                        Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j );
-      SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, 
-                        By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, 
-                        Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j );
-   }
-
-   rx[0] = _mm256_add_epi32( rx[0], Ax );
-   ry[0] =     _mm_add_pi32( ry[0], Ay );
-   rz[0] =                   rz[0]+ Az;
-   rx[1] = _mm256_add_epi32( rx[1], Bx );
-   ry[1] =     _mm_add_pi32( ry[1], By );
-   rz[1] =                   rz[1]+ Bz;
-   rx[2] = _mm256_add_epi32( rx[2], Cx );
-   ry[2] =     _mm_add_pi32( ry[2], Cy );
-   rz[3] =                   rz[3]+ Dz;
-   rx[4] = _mm256_add_epi32( rx[4], Ex );
-   ry[4] =     _mm_add_pi32( ry[4], Ey );
-   rz[4] =                   rz[4]+ Ez;
-   rx[5] = _mm256_add_epi32( rx[5], Fx );
-   ry[5] =     _mm_add_pi32( ry[5], Fy );
-   rz[5] =                   rz[5]+ Fz;
-   rx[6] = _mm256_add_epi32( rx[6], Gx );
-   ry[6] =     _mm_add_pi32( ry[6], Gy );
-   rz[6] =                   rz[6]+ Gz;
-   rx[7] = _mm256_add_epi32( rx[7], Hx );
-   ry[7] =     _mm_add_pi32( ry[7], Hy );
-   rz[7] =                   rz[7]+ Hz;
-
-}
-
-void sha256_11way_init( sha256_11way_context *ctx )
-{
-   ctx->count_high = ctx->count_low = 0;
-   ctx->valx[0] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[0] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[1] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[1] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[2] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[2] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[3] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[3] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[4] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[4] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[5] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[5] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[6] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[6] =     _mm_set1_pi32( H256[0] );
-   ctx->valx[7] = _mm256_set1_epi32( H256[0] );
-   ctx->valy[7] =     _mm_set1_pi32( H256[0] );
-   memcpy( ctx->valz, H256, 32 );
-}
-
-
-void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
-	                  const void *datay, const void *dataz, size_t len )
-{
-   __m256i  *vdatax = (__m256i*) datax;
-    __m64   *vdatay = (__m64*)   datay;
-   uint32_t *idataz = (uint32_t*)dataz;
-   size_t ptr;
-   const int buf_size = 64;
-
-   ptr = (unsigned)ctx->count_low & (buf_size - 1U);
-   while ( len > 0 )
-   {
-      size_t clen;
-      uint32_t clow, clow2;
-
-      clen = buf_size - ptr;
-      if ( clen > len )
-         clen = len;
-      memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 );
-      memcpy_m64( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
-      memcpy    ( ctx->bufz +  ptr,     idataz +  ptr,     clen    );
-      ptr += clen;
-      len -= clen;
-      if ( ptr == buf_size )
-      {
-         sha256_11way_round( ctx->bufx, ctx->valx,
-			     ctx->bufy, ctx->valy,
-			     ctx->bufz, ctx->valz );
-         ptr = 0;
-      }
-      clow = ctx->count_low;
-      clow2 = clow + clen;
-      ctx->count_low = clow2;
-      if ( clow2 < clow )
-         ctx->count_high++;
-   }
-}
-
-
-void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
-	                                            void *dstz)
-{
-    unsigned ptr, u;
-    uint32_t low, high;
-    const int buf_size = 64;
-    const int pad = buf_size - 8;
-
-    ptr = (unsigned)ctx->count_low & (buf_size - 1U);
-    ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
-    ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 );
-    ctx->bufz[ ptr>>2 ] = 0x80;
-    ptr += 4;
-
-    if ( ptr > pad )
-    {
-         memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
-         memset_zero_m64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
-         memset(      ctx->bufz + (ptr>>2), 0,  (buf_size - ptr) >> 2 );
-         sha256_11way_round( ctx->bufx, ctx->valx,
-			     ctx->bufy, ctx->valy,
-			     ctx->bufz, ctx->valz );
-         memset_zero_256( ctx->bufx, pad >> 2 );
-         memset_zero_m64(  ctx->bufy, pad >> 2 );
-         memset(      ctx->bufz, 0,  pad >> 2 );
-    }
-    else
-    {
-        memset_zero_256( ctx->bufx + (ptr>>2),    (pad - ptr) >> 2 );
-        memset_zero_m64(  ctx->bufy + (ptr>>2),    (pad - ptr) >> 2 );
-        memset(          ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 );
-    }
-
-    low = ctx->count_low;
-    high = (ctx->count_high << 3) | (low >> 29);
-    low = low << 3;
-
-    ctx->bufx[ pad >> 2 ] =
-                 mm256_bswap_32( _mm256_set1_epi32( high ) );
-    ctx->bufy[ pad >> 2 ] =
-                 mm64_bswap_32( _mm_set1_pi32( high ) );
-    ctx->bufz[ pad >> 2 ] =
-                 bswap_32( high );
-
-
-    ctx->bufx[ ( pad+4 ) >> 2 ] =
-                 mm256_bswap_32( _mm256_set1_epi32( low ) );
-    ctx->bufy[ ( pad+4 ) >> 2 ] =
-                 mm64_bswap_32( _mm_set1_pi32( low ) );
-    ctx->bufz[ ( pad+4 ) >> 2 ] =
-                 bswap_32( low );
-
-    sha256_11way_round( ctx->bufx, ctx->valx,
-		       ctx->bufy, ctx->valy,
-		       ctx->bufz, ctx->valz  );
-
-    for ( u = 0; u < 8; u ++ )
-    {
-       casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] );
-       casti_m64  ( dsty, u ) =  mm64_bswap_32( ctx->valy[u] );
-       ((uint32_t*)dstz)[u] = bswap_32( ctx->valz[u] );
-   }
-}
-
-#endif
-#endif   // 0
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -5,137 +5,6 @@
 #include <stdio.h>
 #include "sha-hash-4way.h"

-#if defined(SHA256T_11WAY)
-
-static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64)));
-
-void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
-	                 const void *inpy, const void*inpz )
-{
-   uint32_t hashx[8*8] __attribute__ ((aligned (64)));
-   uint32_t hashy[8*2] __attribute__ ((aligned (64)));
-   uint32_t hashz[8]   __attribute__ ((aligned (64)));
-   sha256_11way_context ctx;
-   const void *inpx64 = inpx+(64<<3);
-   const void *inpy64 = inpy+(64<<1);
-   const void *inpz64 = inpz+ 64;
-
-   memcpy( &ctx, &sha256_ctx11, sizeof ctx );
-   sha256_11way_update( &ctx, inpx64, inpy64, inpz64,  16 );
-   sha256_11way_close( &ctx, hashx, hashy, hashz );
-
-   sha256_11way_init( &ctx );
-   sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
-   sha256_11way_close( &ctx, hashx, hashy, hashz );
-
-   sha256_11way_init( &ctx );
-   sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
-   sha256_11way_close( &ctx, outx, outy, outz );
-}
-
-int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
-	                    uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t datax[20*8]  __attribute__ ((aligned (64)));
-   uint32_t datay[20*2]  __attribute__ ((aligned (32)));
-   uint32_t dataz[20]    __attribute__ ((aligned (32)));
-   uint32_t hashx[8*8]   __attribute__ ((aligned (32)));
-   uint32_t hashy[8*2]   __attribute__ ((aligned (32)));
-   uint32_t hashz[8]     __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash7;
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   uint32_t n = first_nonce;
-   __m256i  *noncex = (__m256i*) datax + 19;
-   __m64    *noncey = (__m64*)   datay + 19;
-   uint32_t *noncez = (uint32_t*)dataz + 19;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-   int i;
-   const uint64_t htmax[] = {           0,
-                                      0xF,
-                                     0xFF,
-                                    0xFFF,
-                                   0xFFFF,
-                               0x10000000 };
-   const uint32_t masks[] = {  0xFFFFFFFF,
-                               0xFFFFFFF0,
-                               0xFFFFFF00,
-                               0xFFFFF000,
-                               0xFFFF0000,
-                                        0 };
-
-   // Use dataz (scalar) to stage bswapped data for the vectors.
-   casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
-   casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-   casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-
-   intrlv_8x32( datax, dataz, dataz, dataz, dataz,
-                                 dataz, dataz, dataz, dataz, 640 );
-   mm64_interleave_2x32( datay, dataz, dataz, 640 );
-
-   sha256_11way_init( &sha256_ctx11 );
-   sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 );
-
-   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
-   {
-      uint32_t mask = masks[m];
-      do
-      {
-        *noncex = mm256_bswap_32(
-         _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
-        *noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
-        *noncez = bswap_32( n+10 );
-
-        pdata[19] = n;
-
-        sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );
-
-        if ( opt_benchmark ) { n += 11; continue; }
-
-        hash7 = &(hashx[7<<3]); 
-        for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
-        { 
-            // deinterleave hash for lane
-            extr_lane_8x32( lane_hash, hashx, i, 256 );
-            if ( fulltest( lane_hash, ptarget ) )
-            {
-	            pdata[19] = n + i;
-               submit_lane_solution( work, lane_hash, mythr, i );
-            }
-        }
-
-        hash7 = &(hashy[7<<1]);
-        for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
- 
-        {
-            mm64_extr_lane_2x32( lane_hash, hashy, i, 256 );
-           if ( fulltest( lane_hash, ptarget ) )
-           {
-               pdata[19] = n + 8 + i;
-               submit_lane_solution( work, lane_hash, mythr, i+8 );
-           }
-	     }
-
-        if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
-        {
-            pdata[19] = n+10;
-            submit_lane_solution( work, hashz, mythr, 10 );
-        }
-        n += 11;
-
-      } while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
-      break;
-   }
-    
-   *hashes_done = n - first_nonce + 1;
-   return 0;
-}
-
-#endif
-
 #if defined(SHA256T_8WAY)

 static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
--- a/algo/sha/sha512-hash-4way.c
+++ b/algo/sha/sha512-hash-4way.c
@@ -285,8 +285,10 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
    unsigned ptr;
    const int buf_size = 128;
    const int pad = buf_size - 16;
-    const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f,
-                                                  0x0001020304050607 );
+    const __m256i shuff_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f,
+                                                 0x1011121314151617,
+                                                 0x08090a0b0c0d0e0f,
+                                                 0x0001020304050607 );

    ptr = (unsigned)sc->count & (buf_size - 1U);
    sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
--- a/algo/simd/nist.c
+++ b/algo/simd/nist.c
@@ -83,13 +83,14 @@ HashReturn init_sd(hashState_sd *state, int hashbitlen) {
  char *init;

 #ifndef NO_PRECOMPUTED_IV
-  if (hashbitlen == 224)
-    r=InitIV(state, hashbitlen, IV_224);
-  else if (hashbitlen == 256)
-    r=InitIV(state, hashbitlen, IV_256);
-  else if (hashbitlen == 384)
-    r=InitIV(state, hashbitlen, IV_384);
-  else if (hashbitlen == 512)
+//  if (hashbitlen == 224)
+//    r=InitIV(state, hashbitlen, IV_224);
+//  else if (hashbitlen == 256)
+//    r=InitIV(state, hashbitlen, IV_256);
+//  else if (hashbitlen == 384)
+//    r=InitIV(state, hashbitlen, IV_384);
+//  else
+  if (hashbitlen == 512)
    r=InitIV(state, hashbitlen, IV_512);
  else
 #endif
--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
--- a/algo/simd/simd-hash-2way.h
+++ b/algo/simd/simd-hash-2way.h
@@ -7,15 +7,37 @@

 #include "simd-utils.h"

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 typedef struct {
-  uint32_t A[ 32*2 ] __attribute__((aligned(64)));
-  uint8_t buffer[ 128*2 ] __attribute__((aligned(64)));
+  uint32_t A[ 32*4 ];
+  uint8_t buffer[ 128*4 ];
+  uint64_t count;
+  unsigned int hashbitlen;
+  unsigned int blocksize;
+  unsigned int n_feistels;
+
+} simd_4way_context __attribute__((aligned(128)));
+
+int simd_4way_init( simd_4way_context *state, int hashbitlen );
+int simd_4way_update( simd_4way_context *state, const void *data,
+                      int databitlen );
+int simd_4way_close( simd_4way_context *state, void *hashval );
+int simd_4way_update_close( simd_4way_context *state, void *hashval,
+                            const void *data, int databitlen );
+
+#endif
+
+typedef struct {
+  uint32_t A[ 32*2 ];
+  uint8_t buffer[ 128*2 ];
  uint64_t count;
  unsigned int hashbitlen;
  unsigned int blocksize;
  unsigned int n_feistels;
  
-} simd_2way_context;
+} simd_2way_context __attribute__((aligned(128)));

 int simd_2way_init( simd_2way_context *state, int hashbitlen );
 int simd_2way_update( simd_2way_context *state, const void *data,
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -2,17 +2,140 @@
 #include <string.h>
 #include <stdint.h>
 #include "skein-hash-4way.h"
+
+// 8 way is faster than SHA on Icelake
+// SHA is faster than 4 way on Ryzen
+//
 #if defined(__SHA__)
  #include <openssl/sha.h>
-#else
-  #include "algo/sha/sha-hash-4way.h"
 #endif
+#include "algo/sha/sha-hash-4way.h"

-#if defined (SKEIN_4WAY)
+#if defined (SKEIN_8WAY)
+
+void skeinhash_8way( void *state, const void *input )   // runs skein512_8way over input, then sha256_8way over that result; digest is written to state
+{
+     uint64_t vhash64[8*8] __attribute__ ((aligned (128)));   // receives the skein512_8way_close output
+     skein512_8way_context ctx_skein;
+
+//#if defined(__SHA__)
+//     uint32_t hash0[16] __attribute__ ((aligned (64)));
+//     uint32_t hash1[16] __attribute__ ((aligned (64)));
+//     uint32_t hash2[16] __attribute__ ((aligned (64)));
+//     uint32_t hash3[16] __attribute__ ((aligned (64)));
+//     uint32_t hash4[16] __attribute__ ((aligned (64)));
+//     uint32_t hash5[16] __attribute__ ((aligned (64)));
+//     uint32_t hash6[16] __attribute__ ((aligned (64)));
+//     uint32_t hash7[16] __attribute__ ((aligned (64)));
+//     SHA256_CTX           ctx_sha256;
+//#else
+     uint32_t vhash32[16*8] __attribute__ ((aligned (128)));   // vhash64 repacked as input for sha256_8way
+     sha256_8way_context ctx_sha256;
+//#endif
+
+     skein512_8way_init( &ctx_skein );
+     skein512_8way_update( &ctx_skein, input, 80 );   // length argument 80 (presumably bytes per lane -- confirm)
+     skein512_8way_close( &ctx_skein, vhash64 );
+/* Disabled alternative kept for reference: de-interleave, hash each lane with OpenSSL SHA256, re-interleave.
+#if defined(__SHA__)      
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash64, 512 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
+     SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
+     SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
+     SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
+     SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash4, 64 );
+     SHA256_Final( (unsigned char*)hash4, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash5, 64 );
+     SHA256_Final( (unsigned char*)hash5, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash6, 64 );
+     SHA256_Final( (unsigned char*)hash6, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash7, 64 );
+     SHA256_Final( (unsigned char*)hash7, &ctx_sha256 );
+     
+     intrlv_8x32( state, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7, 256 );
+#else
+*/
+
+     rintrlv_8x64_8x32( vhash32, vhash64, 512 );   // repack vhash64 into vhash32 (presumably 8x64 -> 8x32 interleave, going by the name -- confirm)
+//     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+//                   vhash64, 512 );
+//     intrlv_8x32( vhash32, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+//                   hash7, 512 );
+
+     sha256_8way_init( &ctx_sha256 );
+     sha256_8way( &ctx_sha256, vhash32, 64 );   // length argument 64 (presumably bytes of skein output per lane -- confirm)
+     sha256_8way_close( &ctx_sha256, state );   // final digest goes straight to the caller's buffer
+//#endif
+}
+
+int scanhash_skein_8way( struct work *work, uint32_t max_nonce,   // tries nonces 8 at a time starting from work->data[19]; always returns 0
+                         uint64_t *hashes_done, struct thr_info *mythr )   // *hashes_done receives the number of nonces stepped through
+{
+    uint32_t vdata[20*8] __attribute__ ((aligned (128)));
+    uint32_t hash[16*8] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));   // one lane's hash, filled by extr_lane_8x32
+    uint32_t *hash7 = &(hash[7<<3]);   // hash[56..63]; presumably word 7 of each of the 8 lanes -- confirm layout
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+    int thr_id = mythr->id; 
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );   // presumably byte-swaps and interleaves the 80-byte header for 8 lanes -- confirm
+   do
+   {
+       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,   // arguments run from element 15 down to element 0
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );   // so n+lane sits in the upper 32 bits of 64-bit element 'lane'
+
+       skeinhash_8way( hash, vdata );
+
+       for ( int lane = 0; lane < 8; lane++ )
+       if (  hash7[ lane ] <= Htarg )   // cheap pre-filter against target word 7 before the full test
+       {
+          extr_lane_8x32( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) )
+          {
+             pdata[19] = n + lane;   // store the nonce that produced this lane's hash
+             submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+       }
+       n += 8;
+    } while ( (n < max_nonce-8) && !work_restart[thr_id].restart );   // NOTE(review): max_nonce-8 is unsigned and wraps when max_nonce < 8 -- confirm callers never pass that
+
+    *hashes_done = n - first_nonce;
+    return 0;
+}
+
+#elif defined (SKEIN_4WAY)

 void skeinhash_4way( void *state, const void *input )
 {
-     uint64_t vhash64[16*4] __attribute__ ((aligned (64)));
+     uint64_t vhash64[8*4] __attribute__ ((aligned (128)));
     skein512_4way_context ctx_skein;
 #if defined(__SHA__)
     uint32_t hash0[16] __attribute__ ((aligned (64)));
@@ -26,7 +149,7 @@ void skeinhash_4way( void *state, const void *input )
 #endif

     skein512_4way_init( &ctx_skein );
-     skein512_4way( &ctx_skein, input, 80 );
+     skein512_4way_update( &ctx_skein, input, 80 );
     skein512_4way_close( &ctx_skein, vhash64 );

 #if defined(__SHA__)      
@@ -71,7 +194,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int thr_id = mythr->id;  // thr_id arg is deprecated
+    int thr_id = mythr->id; 

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do
@@ -92,9 +215,9 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
          }
       }
       n += 4;
-    } while ( (n < max_nonce) && !work_restart[thr_id].restart );
+    } while ( (n < max_nonce-4) && !work_restart[thr_id].restart );

-    *hashes_done = n - first_nonce + 1;
+    *hashes_done = n - first_nonce;
    return 0;
 }

--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -4,8 +4,11 @@

 bool register_skein_algo( algo_gate_t* gate )
 {
-    gate->optimizations = AVX2_OPT | SHA_OPT;
-#if defined (SKEIN_4WAY)
+    gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
+#if defined (SKEIN_8WAY)
+    gate->scanhash  = (void*)&scanhash_skein_8way;
+    gate->hash      = (void*)&skeinhash_8way;
+#elif defined (SKEIN_4WAY)
    gate->scanhash  = (void*)&scanhash_skein_4way;
    gate->hash      = (void*)&skeinhash_4way;
 #else
@@ -15,3 +18,20 @@ bool register_skein_algo( algo_gate_t* gate )
    return true;
 };

+bool register_skein2_algo( algo_gate_t* gate )   // installs the skein2 scanhash/hash entry points into gate; always returns true
+{
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
+#if defined (SKEIN_8WAY)
+  gate->scanhash  = (void*)&scanhash_skein2_8way;
+  gate->hash      = (void*)&skein2hash_8way;
+#elif defined (SKEIN_4WAY)
+  gate->scanhash  = (void*)&scanhash_skein2_4way;
+  gate->hash      = (void*)&skein2hash_4way;
+#else   // neither SKEIN_8WAY nor SKEIN_4WAY is defined
+  gate->scanhash  = (void*)&scanhash_skein2;
+  gate->hash      = (void*)&skein2hash;
+#endif
+  return true;
+}
+
+
--- a/algo/skein/skein-gate.h
+++ b/algo/skein/skein-gate.h
@@ -1,23 +1,44 @@
 #ifndef __SKEIN_GATE_H__
-#define __SKEIN_GATE_H__
+#define __SKEIN_GATE_H__ 1
 #include <stdint.h>
 #include "algo-gate-api.h"

-#if defined(__AVX2__)
-  #define SKEIN_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define SKEIN_8WAY 1
+#elif defined(__AVX2__)
+  #define SKEIN_4WAY 1
 #endif

-#if defined(SKEIN_4WAY)
+#if defined(SKEIN_8WAY)
+
+void skeinhash_8way( void *output, const void *input );
+int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+void skein2hash_8way( void *output, const void *input );
+int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t* hashes_done, struct thr_info *mythr );
+
+#elif defined(SKEIN_4WAY)

 void skeinhash_4way( void *output, const void *input );
-
 int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-#endif
+
+void skein2hash_4way( void *output, const void *input );
+int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t* hashes_done, struct thr_info *mythr );
+
+#else

 void skeinhash( void *output, const void *input );
-
 int scanhash_skein( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );

+void skein2hash( void *output, const void *input );
+int scanhash_skein2( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
 #endif
--- a/algo/skein/skein-hash-4way.c
+++ b/algo/skein/skein-hash-4way.c
@@ -36,7 +36,6 @@
 #include <string.h>
 #include "skein-hash-4way.h"

-
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -45,6 +44,22 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif

+/*
+static const sph_u64 IV256[] = {
+   SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
+   SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
+   SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
+   SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
+};
+
+static const sph_u64 IV512[] = {
+   SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
+   SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
+   SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
+   SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
+};
+*/
+   
 /*
 * M9_ ## s ## _ ## i  evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
 */
@@ -270,8 +285,151 @@ extern "C"{
 #define SKBI(k, s, i)   XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
 #define SKBT(t, s, v)   XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))

+#define READ_STATE_BIG(sc)   do { /* load context state into the caller's locals h0..h7 and bcount */ \
+      h0 = (sc)->h0; \
+      h1 = (sc)->h1; \
+      h2 = (sc)->h2; \
+      h3 = (sc)->h3; \
+      h4 = (sc)->h4; \
+      h5 = (sc)->h5; \
+      h6 = (sc)->h6; \
+      h7 = (sc)->h7; \
+      bcount = (sc)->bcount; /* argument parenthesized like the h0..h7 lines */ \
+   } while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* store the caller's locals h0..h7 and bcount back into the context */ \
+      (sc)->h0 = h0; \
+      (sc)->h1 = h1; \
+      (sc)->h2 = h2; \
+      (sc)->h3 = h3; \
+      (sc)->h4 = h4; \
+      (sc)->h5 = h5; \
+      (sc)->h6 = h6; \
+      (sc)->h7 = h7; \
+      (sc)->bcount = bcount; /* argument parenthesized like the h0..h7 lines */ \
+   } while (0)
+   
 // AVX2 all scalar vars are now vectors representing 4 nonces in parallel

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
+do { /* derive the extra key word k8 and the extra tweak word t2 */ \
+  k8 = _mm512_xor_si512( _mm512_xor_si512( \
+                            _mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \
+                                              _mm512_xor_si512( k2, k3 ) ), \
+                            _mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \
+                                              _mm512_xor_si512( k6, k7 ) ) ), \
+                         m512_const1_64( 0x1BD11BDAA9FC1A22) ); /* k8 = k0^k1^...^k7^constant */ \
+  t2 = t0 ^ t1; /* plain ^: t0..t2 are scalars (sph_u64 in UBI_BIG_8WAY) */ \
+} while (0)
+   
+#define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
+do { /* add subkey s to w0..w7; SKBI/SKBT paste k/t with an index to pick the key/tweak variable */ \
+  w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \
+  w1 = _mm512_add_epi64( w1, SKBI(k,s,1) ); \
+  w2 = _mm512_add_epi64( w2, SKBI(k,s,2) ); \
+  w3 = _mm512_add_epi64( w3, SKBI(k,s,3) ); \
+  w4 = _mm512_add_epi64( w4, SKBI(k,s,4) ); \
+  w5 = _mm512_add_epi64( w5, _mm512_add_epi64( SKBI(k,s,5), \
+                                         m512_const1_64( SKBT(t,s,0) ) ) ); /* key word + tweak word */ \
+  w6 = _mm512_add_epi64( w6, _mm512_add_epi64( SKBI(k,s,6), \
+                                         m512_const1_64( SKBT(t,s,1) ) ) ); /* key word + tweak word */ \
+  w7 = _mm512_add_epi64( w7, _mm512_add_epi64( SKBI(k,s,7), \
+                                         m512_const1_64( s ) ) ); /* key word + subkey index s */ \
+} while (0)
+
+
+#define TFBIG_MIX_8WAY(x0, x1, rc) \
+do { /* one mix step on a pair of __m512i words */ \
+     x0 = _mm512_add_epi64( x0, x1 ); /* x0 += x1 per 64-bit element */ \
+     x1 = _mm512_xor_si512( mm512_rol_64( x1, rc ), x0 ); /* x1 = rol(x1, rc) ^ x0; mm512_rol_64 presumably rotates each 64-bit element left - confirm */ \
+} while (0)
+
+#define TFBIG_MIX8_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3)  do { /* one round: four independent mix steps */ \
+      TFBIG_MIX_8WAY(w0, w1, rc0); /* pair (w0,w1) with rotation rc0 */ \
+      TFBIG_MIX_8WAY(w2, w3, rc1); /* pair (w2,w3) with rotation rc1 */ \
+      TFBIG_MIX_8WAY(w4, w5, rc2); /* pair (w4,w5) with rotation rc2 */ \
+      TFBIG_MIX_8WAY(w6, w7, rc3); /* pair (w6,w7) with rotation rc3 */ \
+   } while (0)
+
+#define TFBIG_8WAY_4e(s)   do { /* subkey s, then four rounds; used for even s in UBI_BIG_8WAY */ \
+      TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); /* keys h0..h8, tweaks t0..t2 from the enclosing scope */ \
+      TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); /* word permutation is expressed by reordering the arguments */ \
+      TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
+      TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
+      TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
+   } while (0)
+
+#define TFBIG_8WAY_4o(s)   do { /* subkey s, then four rounds; used for odd s in UBI_BIG_8WAY */ \
+      TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); /* keys h0..h8, tweaks t0..t2 from the enclosing scope */ \
+      TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); /* same argument orderings as the 4e variant, different rotations */ \
+      TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
+      TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
+      TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
+   } while (0)
+
+#define UBI_BIG_8WAY(etype, extra) \
+do { /* process the block held in buf[0..7]; expects buf, bcount and h0..h7 in the enclosing scope */ \
+  sph_u64 t0, t1, t2; \
+  __m512i h8; \
+  __m512i m0 =  buf[0]; /* m0..m7 keep the message words for the final XOR */ \
+  __m512i m1 =  buf[1]; \
+  __m512i m2 =  buf[2]; \
+  __m512i m3 =  buf[3]; \
+  __m512i m4 =  buf[4]; \
+  __m512i m5 =  buf[5]; \
+  __m512i m6 =  buf[6]; \
+  __m512i m7 =  buf[7]; \
+\
+  __m512i p0 = m0; /* p0..p7 are the working state, seeded with the message */ \
+  __m512i p1 = m1; \
+  __m512i p2 = m2; \
+  __m512i p3 = m3; \
+  __m512i p4 = m4; \
+  __m512i p5 = m5; \
+  __m512i p6 = m6; \
+  __m512i p7 = m7; \
+  t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); /* tweak word 0 from bcount and extra */ \
+  t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); /* tweak word 1 from bcount and etype */ \
+  TFBIG_KINIT_8WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
+  TFBIG_8WAY_4e(0); /* subkeys 0..17, alternating the 4e and 4o rotation sets */ \
+  TFBIG_8WAY_4o(1); \
+  TFBIG_8WAY_4e(2); \
+  TFBIG_8WAY_4o(3); \
+  TFBIG_8WAY_4e(4); \
+  TFBIG_8WAY_4o(5); \
+  TFBIG_8WAY_4e(6); \
+  TFBIG_8WAY_4o(7); \
+  TFBIG_8WAY_4e(8); \
+  TFBIG_8WAY_4o(9); \
+  TFBIG_8WAY_4e(10); \
+  TFBIG_8WAY_4o(11); \
+  TFBIG_8WAY_4e(12); \
+  TFBIG_8WAY_4o(13); \
+  TFBIG_8WAY_4e(14); \
+  TFBIG_8WAY_4o(15); \
+  TFBIG_8WAY_4e(16); \
+  TFBIG_8WAY_4o(17); \
+  TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); /* final subkey, index 18 */ \
+  h0 = _mm512_xor_si512( m0, p0 );/* new chaining value = message ^ state */\
+  h1 = _mm512_xor_si512( m1, p1 );\
+  h2 = _mm512_xor_si512( m2, p2 );\
+  h3 = _mm512_xor_si512( m3, p3 );\
+  h4 = _mm512_xor_si512( m4, p4 );\
+  h5 = _mm512_xor_si512( m5, p5 );\
+  h6 = _mm512_xor_si512( m6, p6 );\
+  h7 = _mm512_xor_si512( m7, p7 );\
+} while (0)
+
+#define DECL_STATE_BIG_8WAY \
+  __m512i h0, h1, h2, h3, h4, h5, h6, h7; /* locals filled by READ_STATE_BIG and updated by UBI_BIG_8WAY */ \
+  sph_u64 bcount; /* local copy of the context's bcount */
+
+
+#endif // AVX512
+
 #define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
 do { \
  k8 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -298,39 +456,34 @@ do { \
                                         m256_const1_64( s ) ) ); \
 } while (0)

-
 #define TFBIG_MIX_4WAY(x0, x1, rc) \
 do { \
     x0 = _mm256_add_epi64( x0, x1 ); \
     x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 ); \
 } while (0)
- 

-// typeless
-#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3)  do { \
-		TFBIG_MIX_4WAY(w0, w1, rc0); \
-		TFBIG_MIX_4WAY(w2, w3, rc1); \
-		TFBIG_MIX_4WAY(w4, w5, rc2); \
-		TFBIG_MIX_4WAY(w6, w7, rc3); \
-	} while (0)
+#define TFBIG_MIX8_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3)  do { /* one round: four independent mix steps */ \
+      TFBIG_MIX_4WAY(w0, w1, rc0); /* pair (w0,w1) with rotation rc0 */ \
+      TFBIG_MIX_4WAY(w2, w3, rc1); /* pair (w2,w3) with rotation rc1 */ \
+      TFBIG_MIX_4WAY(w4, w5, rc2); /* pair (w4,w5) with rotation rc2 */ \
+      TFBIG_MIX_4WAY(w6, w7, rc3); /* pair (w6,w7) with rotation rc3 */ \
+   } while (0)

+#define TFBIG_4WAY_4e(s)   do { /* subkey s, then four rounds; used for even s in UBI_BIG_4WAY */ \
+      TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); /* key/tweak variables come from the enclosing scope */ \
+      TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); /* word permutation is expressed by reordering the arguments */ \
+      TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
+      TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
+      TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
+   } while (0)

-#define TFBIG_4e(s)   do { \
-		TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
-		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
-		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
-		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
-		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
-	} while (0)
-
-#define TFBIG_4o(s)   do { \
-		TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
-		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
-		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
-		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
-		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
-	} while (0)
-
+#define TFBIG_4WAY_4o(s)   do { /* subkey s, then four rounds; used for odd s in UBI_BIG_4WAY */ \
+      TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); /* key/tweak variables come from the enclosing scope */ \
+      TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); /* same argument orderings as the 4e variant, different rotations */ \
+      TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
+      TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
+      TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
+   } while (0)

 // scale buf offset by 4
 #define UBI_BIG_4WAY(etype, extra) \
@@ -357,24 +510,24 @@ do { \
  t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
  t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
  TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
-  TFBIG_4e(0); \
-  TFBIG_4o(1); \
-  TFBIG_4e(2); \
-  TFBIG_4o(3); \
-  TFBIG_4e(4); \
-  TFBIG_4o(5); \
-  TFBIG_4e(6); \
-  TFBIG_4o(7); \
-  TFBIG_4e(8); \
-  TFBIG_4o(9); \
-  TFBIG_4e(10); \
-  TFBIG_4o(11); \
-  TFBIG_4e(12); \
-  TFBIG_4o(13); \
-  TFBIG_4e(14); \
-  TFBIG_4o(15); \
-  TFBIG_4e(16); \
-  TFBIG_4o(17); \
+  TFBIG_4WAY_4e(0); \
+  TFBIG_4WAY_4o(1); \
+  TFBIG_4WAY_4e(2); \
+  TFBIG_4WAY_4o(3); \
+  TFBIG_4WAY_4e(4); \
+  TFBIG_4WAY_4o(5); \
+  TFBIG_4WAY_4e(6); \
+  TFBIG_4WAY_4o(7); \
+  TFBIG_4WAY_4e(8); \
+  TFBIG_4WAY_4o(9); \
+  TFBIG_4WAY_4e(10); \
+  TFBIG_4WAY_4o(11); \
+  TFBIG_4WAY_4e(12); \
+  TFBIG_4WAY_4o(13); \
+  TFBIG_4WAY_4e(14); \
+  TFBIG_4WAY_4o(15); \
+  TFBIG_4WAY_4e(16); \
+  TFBIG_4WAY_4o(17); \
  TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
  h0 = _mm256_xor_si256( m0, p0 );\
  h1 = _mm256_xor_si256( m1, p1 );\
@@ -391,45 +544,142 @@ do { \
  __m256i h0, h1, h2, h3, h4, h5, h6, h7; \
  sph_u64 bcount;

-#define READ_STATE_BIG(sc)   do { \
-		h0 = (sc)->h0; \
-		h1 = (sc)->h1; \
-		h2 = (sc)->h2; \
-		h3 = (sc)->h3; \
-		h4 = (sc)->h4; \
-		h5 = (sc)->h5; \
-		h6 = (sc)->h6; \
-		h7 = (sc)->h7; \
-		bcount = sc->bcount; \
-	} while (0)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-#define WRITE_STATE_BIG(sc)   do { \
-		(sc)->h0 = h0; \
-		(sc)->h1 = h1; \
-		(sc)->h2 = h2; \
-		(sc)->h3 = h3; \
-		(sc)->h4 = h4; \
-		(sc)->h5 = h5; \
-		(sc)->h6 = h6; \
-		(sc)->h7 = h7; \
-		sc->bcount = bcount; \
-	} while (0)
+void skein256_8way_init( skein256_8way_context *sc )
+{   // Reset *sc for a new Skein-256 hash; constants match the commented-out IV256 table.
+        sc->h0 = m512_const1_64( 0xCCD044A12FDB3E13 );
+        sc->h1 = m512_const1_64( 0xE83590301A79A9EB );
+        sc->h2 = m512_const1_64( 0x55AEA0614F816E6F );
+        sc->h3 = m512_const1_64( 0x2A2767A4AE9B94DB );
+        sc->h4 = m512_const1_64( 0xEC06025E74DD7683 );
+        sc->h5 = m512_const1_64( 0xE7A436CDC4746251 );
+        sc->h6 = m512_const1_64( 0xC36FBAF9393AD185 );
+        sc->h7 = m512_const1_64( 0x3EEDBA1833EDFC13 );
+        sc->bcount = 0;   // no blocks processed yet
+        sc->ptr = 0;      // buffer is empty
+}

-/*
-static const sph_u64 IV256[] = {
-   SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
-   SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
-   SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
-   SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
-};
+void skein512_8way_init( skein512_8way_context *sc )
+{   // Reset *sc for a new Skein-512 hash; constants match the commented-out IV512 table.
+        sc->h0 = m512_const1_64( 0x4903ADFF749C51CE );
+        sc->h1 = m512_const1_64( 0x0D95DE399746DF03 );
+        sc->h2 = m512_const1_64( 0x8FD1934127C79BCE );
+        sc->h3 = m512_const1_64( 0x9A255629FF352CB1 );
+        sc->h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
+        sc->h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
+        sc->h6 = m512_const1_64( 0x991112C71A75B523 );
+        sc->h7 = m512_const1_64( 0xAE18A40B660FCC33 );
+        sc->bcount = 0;   // no blocks processed yet
+        sc->ptr = 0;      // buffer is empty
+}
+
+static void
+skein_big_core_8way( skein512_8way_context *sc, const void *data,
+                     size_t len )
+{   // Absorb len bytes of input; a full buffer is only compressed once more input arrives.
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf;
+   size_t ptr;
+   unsigned first;
+   DECL_STATE_BIG_8WAY
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+   const int buf_size = 64;   // block size in bytes as counted by ptr/len; 8 bytes of count = one __m512i of buf
+
+   if ( len <= buf_size - ptr )   // everything fits: just buffer it, no compression
+   {
+       memcpy_512( buf + (ptr>>3), vdata, len>>3 );   // byte counts >>3 give __m512i counts
+       sc->ptr = ptr + len;
+       return;
+   }
+
+   READ_STATE_BIG( sc );
+   first = ( bcount == 0 ) << 7;   // 128 is added to the type value for the first block only
+   do {
+       size_t clen;
+
+       if ( ptr == buf_size )   // buffer full and more input pending: compress it
+       {
+            bcount ++;
+            UBI_BIG_8WAY( 96 + first, 0 );
+            first = 0;
+            ptr = 0;
+       }
+       clen = buf_size - ptr;   // space left in the buffer
+       if ( clen > len )
+            clen = len;
+       memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
+       ptr += clen;
+       vdata += (clen>>3);
+       len -= clen;
+   } while ( len > 0 );
+   WRITE_STATE_BIG( sc );
+   sc->ptr = ptr;
+}
+
+static void
+skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n,
+                      void *dst, size_t out_len )
+{   // Finish the hash and write out_len>>3 __m512i words to dst; ub and n are not used in this body.
+   __m512i *buf;
+   size_t ptr;
+   unsigned et;
+   DECL_STATE_BIG_8WAY
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+        const int buf_size = 64;
+
+   READ_STATE_BIG(sc);
+
+   memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );   // zero-pad the last block to full size
+   et = 352 + ((bcount == 0) << 7);   // 128 is added when no block has been processed yet
+   UBI_BIG_8WAY( et, ptr );
+
+   memset_zero_512( buf, buf_size >> 3 );   // output stage: one more pass over an all-zero block
+   bcount = 0;
+   UBI_BIG_8WAY( 510, 8 );
+
+   buf[0] = h0;   // stage the result in buf; the context's h0..h7 are not written back
+   buf[1] = h1;
+   buf[2] = h2;
+   buf[3] = h3;
+   buf[4] = h4;
+   buf[5] = h5;
+   buf[6] = h6;
+   buf[7] = h7;
+
+   memcpy_512( dst, buf, out_len >> 3 );
+}
+
+void
+skein256_8way_update(void *cc, const void *data, size_t len)
+{   // Skein-256 update: forwards unchanged to the shared core routine.
+   skein_big_core_8way(cc, data, len);
+}
+
+void
+skein256_8way_close(void *cc, void *dst)
+{   // Skein-256 close: shared close routine with out_len 32 (4 __m512i words copied to dst).
+        skein_big_close_8way(cc, 0, 0, dst, 32);
+}
+
+void
+skein512_8way_update(void *cc, const void *data, size_t len)
+{   // Skein-512 update: forwards unchanged to the shared core routine.
+   skein_big_core_8way(cc, data, len);
+}
+
+void
+skein512_8way_close(void *cc, void *dst)
+{   // Skein-512 close: shared close routine with out_len 64 (8 __m512i words copied to dst).
+        skein_big_close_8way(cc, 0, 0, dst, 64);
+}
+
+#endif // AVX512

-static const sph_u64 IV512[] = {
-   SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
-   SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
-   SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
-   SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
-};
-*/

 void skein256_4way_init( skein256_4way_context *sc )
 {
@@ -517,66 +767,30 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
 	ptr = sc->ptr;
        const int buf_size = 64;

-	/*
-	 * At that point, if ptr == 0, then the message was empty;
-	 * otherwise, there is between 1 and 64 bytes (inclusive) which
-	 * are yet to be processed. Either way, we complete the buffer
-	 * to a full block with zeros (the Skein specification mandates
-	 * that an empty message is padded so that there is at least
-	 * one block to process).
-	 *
-	 * Once this block has been processed, we do it again, with
-	 * a block full of zeros, for the output (that block contains
-	 * the encoding of "0", over 8 bytes, then padded with zeros).
-	 */
-
 	READ_STATE_BIG(sc);

-        memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+   memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
 	et = 352 + ((bcount == 0) << 7);
-        UBI_BIG_4WAY( et, ptr );
+   UBI_BIG_4WAY( et, ptr );

-        memset_zero_256( buf, buf_size >> 3 );
-        bcount = 0;
-        UBI_BIG_4WAY( 510, 8 );
+   memset_zero_256( buf, buf_size >> 3 );
+   bcount = 0;
+   UBI_BIG_4WAY( 510, 8 );

-        buf[0] = h0;
-        buf[1] = h1;
-        buf[2] = h2;
-        buf[3] = h3;
-        buf[4] = h4;
-        buf[5] = h5;
-        buf[6] = h6;
-        buf[7] = h7;
+   buf[0] = h0;
+   buf[1] = h1;
+   buf[2] = h2;
+   buf[3] = h3;
+   buf[4] = h4;
+   buf[5] = h5;
+   buf[6] = h6;
+   buf[7] = h7;

-        memcpy_256( dst, buf, out_len >> 3 );
+   memcpy_256( dst, buf, out_len >> 3 );
 }

-/*
-static const sph_u64 IV256[] = {
-	SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
-	SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
-	SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
-	SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
-};
-
-static const sph_u64 IV512[] = {
-	SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
-	SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
-	SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
-	SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
-};
-*/
-/*
 void
-skein256_4way_init(void *cc)
-{
-	skein_big_init_4way(cc, IV256);
-}
-*/
-
-void
-skein256_4way(void *cc, const void *data, size_t len)
+skein256_4way_update(void *cc, const void *data, size_t len)
 {
 	skein_big_core_4way(cc, data, len);
 }
@@ -587,16 +801,8 @@ skein256_4way_close(void *cc, void *dst)
        skein_big_close_4way(cc, 0, 0, dst, 32);
 }

-/*
 void
-skein512_4way_init(void *cc)
-{
-	skein_big_init_4way(cc, IV512);
-}
-*/
-
-void
-skein512_4way(void *cc, const void *data, size_t len)
+skein512_4way_update(void *cc, const void *data, size_t len)
 {
 	skein_big_core_4way(cc, data, len);
 }
--- a/algo/skein/skein-hash-4way.h
+++ b/algo/skein/skein-hash-4way.h
@@ -55,29 +55,50 @@ extern "C"{
 #define SPH_SIZE_skein256   256
 #define SPH_SIZE_skein512   512

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 typedef struct
 {
-   __m256i buf[8] __attribute__ ((aligned (64)));
+   __m512i buf[8];                           // one input block, one __m512i per 64-bit word
+   __m512i h0, h1, h2, h3, h4, h5, h6, h7;   // chaining state
+   size_t ptr;                               // bytes currently buffered (0..64)
+   sph_u64 bcount;                           // number of blocks compressed so far
+} sph_skein_8way_big_context __attribute__ ((aligned (128)));
+
+typedef sph_skein_8way_big_context skein512_8way_context;
+typedef sph_skein_8way_big_context skein256_8way_context;
+
+void skein512_8way_init( skein512_8way_context *sc );
+void skein512_8way_update( void *cc, const void *data, size_t len );
+void skein512_8way_close( void *cc, void *dst );
+
+void skein256_8way_init( skein256_8way_context *sc );
+void skein256_8way_update( void *cc, const void *data, size_t len );
+void skein256_8way_close( void *cc, void *dst );
+
+#endif // AVX512
+   
+typedef struct
+{
+   __m256i buf[8];
   __m256i h0, h1, h2, h3, h4, h5, h6, h7;
   size_t ptr;
 	sph_u64 bcount;
-} sph_skein_4way_big_context;
+} sph_skein_4way_big_context __attribute__ ((aligned (128)));

 typedef sph_skein_4way_big_context skein512_4way_context;
 typedef sph_skein_4way_big_context skein256_4way_context;

 void skein512_4way_init( skein512_4way_context *sc );
-void skein512_4way( void *cc, const void *data, size_t len );
+void skein512_4way_update( void *cc, const void *data, size_t len );
 void skein512_4way_close( void *cc, void *dst );
-//void sph_skein512_addbits_and_close(
-//        void *cc, unsigned ub, unsigned n, void *dst);
+#define skein512_4way skein512_4way_update

 void skein256_4way_init( skein256_4way_context *sc );
-void skein256_4way( void *cc, const void *data, size_t len );
+void skein256_4way_update( void *cc, const void *data, size_t len );
 void skein256_4way_close( void *cc, void *dst );
-//void sph_skein256_addbits_and_close(
-//	void *cc, unsigned ub, unsigned n, void *dst);
-
+#define skein256_4way skein256_4way_update

 #ifdef __cplusplus
 }
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -1,9 +1,66 @@
-#include "skein2-gate.h"
+#include "skein-gate.h"
 #include <string.h>
 #include <stdint.h>
 #include "skein-hash-4way.h"

-#if defined(SKEIN2_4WAY)
+#if defined(SKEIN_8WAY)
+
+void skein2hash_8way( void *output, const void *input )
+{   // Skein-512 applied twice: first over 80 input bytes, then over the 64-byte digest.
+   skein512_8way_context ctx;
+   uint64_t hash[16*8] __attribute__ ((aligned (128)));
+
+   skein512_8way_init( &ctx );
+   skein512_8way_update( &ctx, input, 80 );
+   skein512_8way_close( &ctx, hash );
+
+   skein512_8way_init( &ctx );   // fresh state for the second pass
+   skein512_8way_update( &ctx, hash, 64 );
+   skein512_8way_close( &ctx, output );
+}
+
+int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{   // Scan nonces 8 at a time from work->data[19]; submits solutions, always returns 0.
+    uint32_t hash[16*8] __attribute__ ((aligned (128)));
+    uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+    uint32_t *hash7 = &(hash[49]);   // presumably word 7 of lane 0 in the 8x64 layout; lane i at hash7[i<<1]
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+    int thr_id = mythr->id; 
+
+    mm512_bswap32_intrlv80_8x64( vdata, pdata );
+    do
+    {
+       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+
+       skein2hash_8way( hash, vdata );
+
+       for ( int lane = 0; lane < 8; lane++ )
+       if ( hash7[ lane<<1 ] <= Htarg )   // cheap pre-filter before the full target test
+       {
+          extr_lane_8x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          {
+             pdata[19] = n + lane;
+             submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+       }
+       n += 8;
+    } while ( (n < max_nonce-8) && !work_restart[thr_id].restart );
+
+    *hashes_done = n - first_nonce;   // n advances by exactly 8 per hashed batch, so no +1
+    return 0;
+}
+
+#elif defined(SKEIN_4WAY)

 void skein2hash_4way( void *output, const void *input )
 {
--- a/algo/skein/skein2-gate.c
+++ b/algo/skein/skein2-gate.c
@@ -1,17 +0,0 @@
-#include "skein2-gate.h"
-#include <stdint.h>
-#include "sph_skein.h"
-
-bool register_skein2_algo( algo_gate_t* gate )
-{
-  gate->optimizations = AVX2_OPT;
-#if defined (SKEIN2_4WAY)
-  gate->scanhash  = (void*)&scanhash_skein2_4way;
-  gate->hash      = (void*)&skein2hash_4way;
-#else
-  gate->scanhash  = (void*)&scanhash_skein2;
-  gate->hash      = (void*)&skein2hash;
-#endif
-  return true;
-};
-
--- a/algo/skein/skein2-gate.h
+++ b/algo/skein/skein2-gate.h
@@ -1,20 +0,0 @@
-#ifndef __SKEIN2GATE_H__
-#define __SKEIN2_GATE_H__
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-#if defined(__AVX2__)
-  #define SKEIN2_4WAY
-#endif
-
-#if defined(SKEIN2_4WAY)
-void skein2hash_4way( void *output, const void *input );
-int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
-                          uint64_t* hashes_done, struct thr_info *mythr );
-#endif
-
-void skein2hash( void *output, const void *input );
-int scanhash_skein2( struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done, struct thr_info *mythr );
-#endif
-
--- a/algo/skein/skein2.c
+++ b/algo/skein/skein2.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "skein-gate.h"
 #include <string.h>
 #include <stdint.h>

--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -1,11 +1,7 @@
 #include "cpuminer-config.h"
 #include "c11-gate.h"
-
-#if defined (C11_4WAY)
-
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -13,11 +9,237 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined (C11_8WAY)
+
+typedef struct {   // one context per C11 stage
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;
+    cube_4way_context       cube;
+    sph_shavite512_context  shavite;
+    simd_4way_context       simd;
+    hashState_echo          echo;
+} c11_8way_ctx_holder;
+
+c11_8way_ctx_holder c11_8way_ctx;   // filled by init_c11_8way_ctx(), copied by c11_8way_hash()
+
+void init_c11_8way_ctx()
+{    // Initialize every stage context in the global c11_8way_ctx template.
+     blake512_8way_init( &c11_8way_ctx.blake );
+     bmw512_8way_init( &c11_8way_ctx.bmw );
+     init_groestl( &c11_8way_ctx.groestl, 64 );
+     skein512_8way_init( &c11_8way_ctx.skein );
+     jh512_8way_init( &c11_8way_ctx.jh );
+     keccak512_8way_init( &c11_8way_ctx.keccak );
+     luffa_4way_init( &c11_8way_ctx.luffa, 512 );
+     cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 );
+     sph_shavite512_init( &c11_8way_ctx.shavite );
+     simd_4way_init( &c11_8way_ctx.simd, 512 );
+     init_echo( &c11_8way_ctx.echo, 512 );
+}
+
+void c11_8way_hash( void *state, const void *input )
+{    // C11 chain for 8 lanes; writes 32 bytes per lane to state, lane i at byte offset 32*i.
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     c11_8way_ctx_holder ctx;
+     memcpy( &ctx, &c11_8way_ctx, sizeof(c11_8way_ctx) );   // start from the pre-initialized template
+
+     // 1 Blake 8way
+     blake512_8way_update( &ctx.blake, input, 80 );
+     blake512_8way_close( &ctx.blake, vhash );
+
+     // 2 Bmw
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serial
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     // 3 Groestl
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );   // restore initial state for the next lane
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // 8way: re-interleave the eight lanes
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     // 4 JH
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     // 5 Keccak
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     // 6 Skein
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     // Serial
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     // 7 Luffa + 8 cube, four lanes at a time (lanes 0-3, then lanes 4-7)
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     luffa_4way_init( &ctx.luffa, 512 );   // re-initialized rather than copied from the template
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     // 9 Shavite
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     // 10 Simd, four lanes at a time
+     intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
+     intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 512 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 512 );
+
+     // 11 Echo
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     memcpy( state,     hash0, 32 );   // only the first 32 bytes of each lane's result are kept
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{    // Scan nonces 8 at a time from work->data[19]; submits solutions, always returns 0.
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     int thr_id = mythr->id;   
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+     const uint32_t Htarg = ptarget[7];
+
+     max_nonce -= 8;
+
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     do
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+        _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                          n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+        c11_8way_hash( hash, vdata );
+        pdata[19] = n;
+
+        for ( int i = 0; i < 8; i++ )   // lane i's 8-word hash starts at hash + 8*i
+        if ( ( ( hash+(i<<3) )[7] <= Htarg )   // <=: an equal top word may still pass fulltest
+             && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n+i;
+           submit_lane_solution( work, hash+(i<<3), mythr, i );
+        }
+        n += 8;
+     } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
+     return 0;
+}
+     
+#elif defined (C11_4WAY)
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
--- a/algo/x11/c11-gate.c
+++ b/algo/x11/c11-gate.c
@@ -2,7 +2,11 @@

 bool register_c11_algo( algo_gate_t* gate )
 {
-#if defined (C11_4WAY)
+#if defined (C11_8WAY)
+  init_c11_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_c11_8way;
+  gate->hash      = (void*)&c11_8way_hash;
+#elif defined (C11_4WAY)
  init_c11_4way_ctx();
  gate->scanhash  = (void*)&scanhash_c11_4way;
  gate->hash      = (void*)&c11_4way_hash;
@@ -11,7 +15,7 @@ bool register_c11_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_c11;
  gate->hash      = (void*)&c11_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x11/c11-gate.h
+++ b/algo/x11/c11-gate.h
@@ -4,29 +4,36 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define C11_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define C11_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define C11_4WAY 1
 #endif

+
 bool register_c11_algo( algo_gate_t* gate );
+#if defined(C11_8WAY)

-#if defined(C11_4WAY)
+void c11_8way_hash( void *state, const void *input );
+int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_c11_8way_ctx();
+
+#elif defined(C11_4WAY)

 void c11_4way_hash( void *state, const void *input );
-
 int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_c11_4way_ctx();

-#endif
+#else

 void c11_hash( void *state, const void *input );
-
 int scanhash_c11( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_c11_ctx();

 #endif

+#endif
+
--- a/algo/x11/timetravel-4way.c
+++ b/algo/x11/timetravel-4way.c
@@ -45,12 +45,12 @@ void init_tt8_4way_ctx()

 void timetravel_4way_hash(void *output, const void *input)
 {
-   uint64_t hash0[8] __attribute__ ((aligned (64)));
-   uint64_t hash1[8] __attribute__ ((aligned (64)));
-   uint64_t hash2[8] __attribute__ ((aligned (64)));
-   uint64_t hash3[8] __attribute__ ((aligned (64)));
-   uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
-   uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash0[10] __attribute__ ((aligned (64)));
+   uint64_t hash1[10] __attribute__ ((aligned (64)));
+   uint64_t hash2[10] __attribute__ ((aligned (64)));
+   uint64_t hash3[10] __attribute__ ((aligned (64)));
+   uint64_t vhashX[10*4] __attribute__ ((aligned (64)));
+   uint64_t vhashY[10*4] __attribute__ ((aligned (64)));
   uint64_t *vhashA, *vhashB;
   tt8_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   uint32_t dataLen = 64;
--- a/algo/x11/timetravel10-4way.c
+++ b/algo/x11/timetravel10-4way.c
@@ -51,12 +51,12 @@ void init_tt10_4way_ctx()

 void timetravel10_4way_hash(void *output, const void *input)
 {
-   uint64_t hash0[8] __attribute__ ((aligned (64)));
-   uint64_t hash1[8] __attribute__ ((aligned (64)));
-   uint64_t hash2[8] __attribute__ ((aligned (64)));
-   uint64_t hash3[8] __attribute__ ((aligned (64)));
-   uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
-   uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash0[10] __attribute__ ((aligned (64)));
+   uint64_t hash1[10] __attribute__ ((aligned (64)));
+   uint64_t hash2[10] __attribute__ ((aligned (64)));
+   uint64_t hash3[10] __attribute__ ((aligned (64)));
+   uint64_t vhashX[10*4] __attribute__ ((aligned (64)));
+   uint64_t vhashY[10*4] __attribute__ ((aligned (64)));
   uint64_t *vhashA, *vhashB;
   tt10_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   uint32_t dataLen = 64;
--- a/algo/x11/tribus-4way.c
+++ b/algo/x11/tribus-4way.c
@@ -3,22 +3,121 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
-#if defined(TRIBUS_4WAY)
-
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/echo/aes_ni/hash_api.h"

-//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64)));
-static __thread jh512_4way_context ctx_mid;
-/*
-void init_tribus_4way_ctx()
+#if defined(TRIBUS_8WAY)
+
+static __thread jh512_8way_context ctx_mid;
+
+void tribus_hash_8way( void *state, const void *input )
 {
-     init_echo( &tribus_4way_ctx, 512 );
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     jh512_8way_context     ctx_jh;
+     keccak512_8way_context ctx_keccak;
+     hashState_echo         ctx_echo;
+
+     memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );
+     jh512_8way_update( &ctx_jh, input + (64<<3), 16 );
+     jh512_8way_close( &ctx_jh, vhash );
+
+     keccak512_8way_init( &ctx_keccak );
+     keccak512_8way_update( &ctx_keccak, vhash, 64 );
+     keccak512_8way_close( &ctx_keccak, vhash );
+
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash, 512 );
+
+     // hash echo serially
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash0,
+                        (const BitSequence *) hash0, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash1,
+                        (const BitSequence *) hash1, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash2,
+                        (const BitSequence *) hash2, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash3,
+                        (const BitSequence *) hash3, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash4,
+                        (const BitSequence *) hash4, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash5,
+                        (const BitSequence *) hash5, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash6,
+                        (const BitSequence *) hash6, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash7,
+                        (const BitSequence *) hash7, 512 );
+
+     memcpy( state,       hash0, 32 );
+     memcpy( state+32,    hash1, 32 );
+     memcpy( state+64,    hash2, 32 );
+     memcpy( state+96,    hash3, 32 );
+     memcpy( state+128,   hash4, 32 );
+     memcpy( state+160,   hash5, 32 );
+     memcpy( state+192,   hash6, 32 );
+     memcpy( state+224,   hash7, 32 );
 }
-*/
-void tribus_hash_4way(void *state, const void *input)
+
+int scanhash_tribus_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t n = pdata[19];
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   int thr_id = mythr->id;  
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+   jh512_8way_init( &ctx_mid );
+   jh512_8way_update( &ctx_mid, vdata, 64 );
+
+   do {
+     *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+
+     tribus_hash_8way( hash, vdata );
+     pdata[19] = n;
+
+     for ( int i = 0; i < 8; i++ )
+     if ( (hash+(i<<3))[7] < Htarg )
+     if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+     {
+          pdata[19] = n+i;
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
+     }
+     n += 8;
+   } while ( ( n < max_nonce-8 )  && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined(TRIBUS_4WAY)
+
+static __thread jh512_4way_context ctx_mid;
+
+void tribus_hash_4way( void *state, const void *input )
 {
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
@@ -30,11 +129,11 @@ void tribus_hash_4way(void *state, const void *input)
     hashState_echo         ctx_echo;

     memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );
-     jh512_4way( &ctx_jh, input + (64<<2), 16 );
+     jh512_4way_update( &ctx_jh, input + (64<<2), 16 );
     jh512_4way_close( &ctx_jh, vhash );

     keccak512_4way_init( &ctx_keccak );
-     keccak512_4way( &ctx_keccak, vhash, 64 );
+     keccak512_4way_update( &ctx_keccak, vhash, 64 );
     keccak512_4way_close( &ctx_keccak, vhash );

     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -60,7 +159,7 @@ void tribus_hash_4way(void *state, const void *input)
 }

 int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
-            uint64_t *hashes_done, struct thr_info *mythr)
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -70,57 +169,32 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   uint32_t n = pdata[19];
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-   uint64_t htmax[] = {          0,
-                               0xF,
-                              0xFF,
-                             0xFFF,
-                            0xFFFF,
-                        0x10000000 };
-
-   uint32_t masks[] = {	0xFFFFFFFF,
-                        0xFFFFFFF0,
-                        0xFFFFFF00,
-                        0xFFFFF000,
-                        0xFFFF0000,
-                                 0 };
+   int thr_id = mythr->id;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );

-   // precalc midstate
-   // doing it one way then then interleaving would be faster but too
-   // complicated tto interleave context.
   jh512_4way_init( &ctx_mid );
-   jh512_4way( &ctx_mid, vdata, 64 );
+   jh512_4way_update( &ctx_mid, vdata, 64 );

-   for ( int m = 0; m < 6; m++ )
-   {
-      if ( Htarg <= htmax[m] )
-      {
-         uint32_t mask = masks[m];
-         do {
-           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+   do {
+     *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

-            tribus_hash_4way( hash, vdata );
+     tribus_hash_4way( hash, vdata );

-            pdata[19] = n;
+     pdata[19] = n;

-            for ( int i = 0; i < 4; i++ )
-            if ( ( !( (hash+(i<<3))[7] & mask ) )
-                 && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-            {
-               pdata[19] = n+i;
-               submit_lane_solution( work, hash+(i<<3), mythr, i );
-            }
-            n += 4;
-         } while ( ( n < max_nonce )  && !work_restart[thr_id].restart);
-         break;
-      }
-   }
+     for ( int i = 0; i < 4; i++ )
+     if ( (hash+(i<<3))[7] < Htarg )
+     if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+     {
+          pdata[19] = n+i;
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
+     }
+     n += 4;
+   } while ( ( n < max_nonce-4 )  && !work_restart[thr_id].restart);

-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/algo/x11/tribus-gate.c
+++ b/algo/x11/tribus-gate.c
@@ -2,9 +2,11 @@

 bool register_tribus_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-#if defined (TRIBUS_4WAY)
-//  init_tribus_4way_ctx();
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+#if defined (TRIBUS_8WAY)
+  gate->scanhash      = (void*)&scanhash_tribus_8way;
+  gate->hash          = (void*)&tribus_hash_8way;
+#elif defined (TRIBUS_4WAY)
  gate->scanhash      = (void*)&scanhash_tribus_4way;
  gate->hash          = (void*)&tribus_hash_4way;
 #else
--- a/algo/x11/tribus-gate.h
+++ b/algo/x11/tribus-gate.h
@@ -1,16 +1,23 @@
 #ifndef TRIBUS_GATE_H__
-#define TRIBUS_GATE_H__
+#define TRIBUS_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define TRIBUS_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define TRIBUS_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define TRIBUS_4WAY 1
 #endif

-#if defined(TRIBUS_4WAY)
+#if defined(TRIBUS_8WAY)

-//void init_tribus_4way_ctx();
+void tribus_hash_8way( void *state, const void *input );
+
+int scanhash_tribus_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(TRIBUS_4WAY)

 void tribus_hash_4way( void *state, const void *input );

--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -1,8 +1,5 @@
 #include "cpuminer-config.h"
 #include "x11-gate.h"
-
-#if defined (X11_4WAY)
-
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/blake-hash-4way.h"
@@ -12,11 +9,235 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined (X11_8WAY)
+
+typedef struct {
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;
+    cube_4way_context       cube;
+    sph_shavite512_context  shavite;
+    simd_4way_context       simd;
+    hashState_echo          echo;
+} x11_8way_ctx_holder;
+
+x11_8way_ctx_holder x11_8way_ctx;
+
+void init_x11_8way_ctx()
+{
+     blake512_8way_init( &x11_8way_ctx.blake );
+     bmw512_8way_init( &x11_8way_ctx.bmw );
+     init_groestl( &x11_8way_ctx.groestl, 64 );
+     skein512_8way_init( &x11_8way_ctx.skein );
+     jh512_8way_init( &x11_8way_ctx.jh );
+     keccak512_8way_init( &x11_8way_ctx.keccak );
+     luffa_4way_init( &x11_8way_ctx.luffa, 512 );
+     cube_4way_init( &x11_8way_ctx.cube, 512, 16, 32 );
+     sph_shavite512_init( &x11_8way_ctx.shavite );
+     simd_4way_init( &x11_8way_ctx.simd, 512 );
+     init_echo( &x11_8way_ctx.echo, 512 );
+}
+
+void x11_8way_hash( void *state, const void *input )
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+
+     x11_8way_ctx_holder ctx;
+     memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) );
+     blake512_8way_update( &ctx.blake, input, 80 );
+     blake512_8way_close( &ctx.blake, vhash );
+
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serial
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // 8way: re-interleave all eight lanes (8x64) for skein, jh and keccak
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     // Luffa + Cube
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     luffa_4way_init( &ctx.luffa, 512 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     memcpy( state,     hash0, 32 );
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     int thr_id = mythr->id;
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+     const uint32_t Htarg = ptarget[7];
+
+     const uint32_t last_nonce = max_nonce -8;
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     do
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+         _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                           n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+         x11_8way_hash( hash, vdata );
+         pdata[19] = n;
+
+         for ( int i = 0; i < 8; i++ )
+         if ( ( hash+(i<<3) )[7] < Htarg
+              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+         {
+             pdata[19] = n+i;
+             submit_lane_solution( work, hash+(i<<3), mythr, i );
+         }
+         n += 8;
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
+     return 0;
+}
+
+
+#elif defined (X11_4WAY)
+
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
--- a/algo/x11/x11-gate.c
+++ b/algo/x11/x11-gate.c
@@ -1,8 +1,12 @@
 #include "x11-gate.h"

-bool register_x11_algo( algo_gate_t* gate )
+bool register_x11_algo( algo_gate_t *gate )
 {
-#if defined (X11_4WAY)
+#if defined (X11_8WAY)
+  init_x11_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_x11_8way;
+  gate->hash      = (void*)&x11_8way_hash;
+#elif defined (X11_4WAY)
  init_x11_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x11_4way;
  gate->hash      = (void*)&x11_4way_hash;
@@ -11,7 +15,7 @@ bool register_x11_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11;
  gate->hash      = (void*)&x11_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x11/x11-gate.h
+++ b/algo/x11/x11-gate.h
@@ -4,29 +4,35 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X11_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X11_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X11_4WAY 1
 #endif

 bool register_x11_algo( algo_gate_t* gate );
+#if defined(X11_8WAY)

-#if defined(X11_4WAY)
+void x11_8way_hash( void *state, const void *input );
+int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_x11_8way_ctx();
+
+#elif defined(X11_4WAY)

 void x11_4way_hash( void *state, const void *input );
-
 int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11_4way_ctx();

-#endif
+#else

 void x11_hash( void *state, const void *input );
-
 int scanhash_x11( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11_ctx();

 #endif

+#endif
+
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -1,11 +1,7 @@
 #include "cpuminer-config.h"
 #include "x11gost-gate.h"
-
-#if defined (X11GOST_4WAY)
-
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -14,18 +10,269 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined (X11GOST_8WAY)
+
+typedef struct {
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;    
+    keccak512_8way_context  keccak;    
+    sph_gost512_context     gost;
+    luffa_4way_context      luffa;
+    cube_4way_context       cube;
+    sph_shavite512_context  shavite;
+    simd_4way_context       simd;
+    hashState_echo          echo;
+} x11gost_8way_ctx_holder;
+
+x11gost_8way_ctx_holder x11gost_8way_ctx;
+
+void init_x11gost_8way_ctx()
+{
+     blake512_8way_init( &x11gost_8way_ctx.blake );
+     bmw512_8way_init( &x11gost_8way_ctx.bmw );
+     init_groestl( &x11gost_8way_ctx.groestl, 64 );
+     skein512_8way_init( &x11gost_8way_ctx.skein );
+     jh512_8way_init( &x11gost_8way_ctx.jh );
+     keccak512_8way_init( &x11gost_8way_ctx.keccak );
+     sph_gost512_init( &x11gost_8way_ctx.gost );
+     luffa_4way_init( &x11gost_8way_ctx.luffa, 512 );
+     cube_4way_init( &x11gost_8way_ctx.cube, 512, 16, 32 );
+     sph_shavite512_init( &x11gost_8way_ctx.shavite );
+     simd_4way_init( &x11gost_8way_ctx.simd, 512 );
+     init_echo( &x11gost_8way_ctx.echo, 512 );
+}
+
+void x11gost_8way_hash( void *state, const void *input )
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+
+     x11gost_8way_ctx_holder ctx;
+     memcpy( &ctx, &x11gost_8way_ctx, sizeof(x11gost_8way_ctx) );
+
+     blake512_8way_update( &ctx.blake, input, 80 );
+     blake512_8way_close( &ctx.blake, vhash );
+
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serial
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl, 
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl, 
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // 8way: re-interleave all eight lanes (8x64) for skein, jh and keccak
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     // Serial
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     sph_gost512( &ctx.gost, hash0, 64 );
+     sph_gost512_close( &ctx.gost, hash0 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash1, 64 );
+     sph_gost512_close( &ctx.gost, hash1 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash2, 64 );
+     sph_gost512_close( &ctx.gost, hash2 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash3, 64 );
+     sph_gost512_close( &ctx.gost, hash3 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash4, 64 );
+     sph_gost512_close( &ctx.gost, hash4 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash5, 64 );
+     sph_gost512_close( &ctx.gost, hash5 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash6, 64 );
+     sph_gost512_close( &ctx.gost, hash6 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash7, 64 );
+     sph_gost512_close( &ctx.gost, hash7 );
+
+
+     // Luffa + Cube
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     luffa_4way_init( &ctx.luffa, 512 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     memcpy( state,     hash0, 32 );
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+// Scan nonces 8 per pass from work->data[19]; candidates that pass fulltest()
+// go to submit_lane_solution(). Sets *hashes_done; always returns 0.
+int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     int thr_id = mythr->id;
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+     const uint32_t Htarg = ptarget[7];
+
+     max_nonce -= 8;   // each pass consumes nonces n .. n+7
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+     do {
+        // Build byte-swapped nonces n .. n+7 and blend them into *noncev.
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+         _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                           n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+         x11gost_8way_hash( hash, vdata );
+         pdata[19] = n;
+         // <= so a top word equal to the target's still reaches fulltest().
+         for ( int i = 0; i < 8; i++ )
+         if ( ( hash+(i<<3) )[7] <= Htarg
+              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+         {
+             pdata[19] = n+i;
+             submit_lane_solution( work, hash+(i<<3), mythr, i );
+         }
+         n += 8;
+     } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
+     return 0;
+}
+
+#elif defined (X11GOST_4WAY)
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
-    jh512_4way_context      jh;    
-    keccak512_4way_context  keccak;    
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
    sph_gost512_context     gost;
    luffa_2way_context      luffa;
    cubehashParam           cube;
@@ -76,10 +323,10 @@ void x11gost_4way_hash( void *state, const void *input )
     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, 
+     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, 
+     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

@@ -175,7 +422,7 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
+     int thr_id = mythr->id;
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
@@ -185,7 +432,7 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,

     mm256_bswap32_intrlv80_4x64( vdata, pdata );

-     for (int m=0; m < 6; m++) 
+     for (int m=0; m < 6; m++)
       if (Htarg <= htmax[m])
       {
         uint32_t mask = masks[m];
--- a/algo/x11/x11gost-gate.c
+++ b/algo/x11/x11gost-gate.c
@@ -2,7 +2,11 @@

 bool register_x11gost_algo( algo_gate_t* gate )
 {
-#if defined (X11GOST_4WAY)
+#if defined (X11GOST_8WAY)
+  init_x11gost_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_x11gost_8way;
+  gate->hash      = (void*)&x11gost_8way_hash;
+#elif defined (X11GOST_4WAY)
  init_x11gost_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x11gost_4way;
  gate->hash      = (void*)&x11gost_4way_hash;
@@ -11,7 +15,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11gost;
  gate->hash      = (void*)&x11gost_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x11/x11gost-gate.h
+++ b/algo/x11/x11gost-gate.h
@@ -4,29 +4,36 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X11GOST_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X11GOST_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X11GOST_4WAY 1
 #endif

 bool register_x11gost_algo( algo_gate_t* gate );

-#if defined(X11GOST_4WAY)
+#if defined(X11GOST_8WAY)
+
+void x11gost_8way_hash( void *state, const void *input );
+int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_x11gost_8way_ctx();
+
+#elif defined(X11GOST_4WAY)

 void x11gost_4way_hash( void *state, const void *input );
-
 int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11gost_4way_ctx();

-#endif
+#else

 void x11gost_hash( void *state, const void *input );
-
 int scanhash_x11gost( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11gost_ctx();

 #endif

+#endif
+
--- a/algo/x12/x12-4way.c
+++ b/algo/x12/x12-4way.c
@@ -108,7 +108,7 @@ void x12_4way_hash( void *state, const void *input )
     intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     intrlv_2x128( hash2, hash3, vhash, 512 );
+     dintrlv_2x128( hash2, hash3, vhash, 512 );

     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
--- a/algo/x13/phi1612-4way.c
+++ b/algo/x13/phi1612-4way.c
@@ -1,7 +1,4 @@
 #include "phi1612-gate.h"
-
-#if defined(PHI1612_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,10 +6,193 @@
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined(PHI1612_8WAY)
+
+typedef struct {      // One context per stage of the 8-way PHI1612 chain.
+    skein512_8way_context   skein;   // used once for all 8 lanes
+    jh512_8way_context      jh;      // used once for all 8 lanes
+    cube_4way_context       cube;    // 4 lanes per pass, so used twice
+    sph_fugue512_context    fugue;   // used once per lane
+    sph_gost512_context     gost;    // used once per lane
+    hashState_echo          echo;    // used once per lane
+} phi1612_8way_ctx_holder;
+
+phi1612_8way_ctx_holder phi1612_8way_ctx __attribute__ ((aligned (64)));
+
+void init_phi1612_8way_ctx()   // Fill the global template context.
+{    // phi1612_8way_hash() copies this template into a local ctx on each call.
+     skein512_8way_init( &phi1612_8way_ctx.skein );
+     jh512_8way_init( &phi1612_8way_ctx.jh );
+     cube_4way_init( &phi1612_8way_ctx.cube, 512, 16, 32 );
+     sph_fugue512_init( &phi1612_8way_ctx.fugue );
+     sph_gost512_init( &phi1612_8way_ctx.gost );
+     init_echo( &phi1612_8way_ctx.echo, 512 );
+};
+
+void phi1612_8way_hash( void *state, const void *input )
+{    // PHI1612 for 8 lanes: Skein, JH, CubeHash, Fugue, GOST, Echo in sequence.
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));  // lane-interleaved
+     uint64_t hash0[8] __attribute__ ((aligned (64)));     // one buffer per lane
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     phi1612_8way_ctx_holder ctx;
+     memcpy( &ctx, &phi1612_8way_ctx, sizeof(phi1612_8way_ctx) );  // from template
+
+     // Skein parallel 8way
+     skein512_8way_update( &ctx.skein, input, 80 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     // JH, also on all 8 lanes at once
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );   // split vhash into the per-lane buffers
+
+     // Cubehash: 4 lanes per pass, two passes, context reset before the second
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     // Fugue: one lane at a time, context re-initialised between lanes
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+
+     // Gost: one lane at a time, context re-initialised between lanes
+     sph_gost512( &ctx.gost, hash0, 64 );
+     sph_gost512_close( &ctx.gost, hash0 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash1, 64 );
+     sph_gost512_close( &ctx.gost, hash1 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash2, 64 );
+     sph_gost512_close( &ctx.gost, hash2 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash3, 64 );
+     sph_gost512_close( &ctx.gost, hash3 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash4, 64 );
+     sph_gost512_close( &ctx.gost, hash4 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash5, 64 );
+     sph_gost512_close( &ctx.gost, hash5 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash6, 64 );
+     sph_gost512_close( &ctx.gost, hash6 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash7, 64 );
+     sph_gost512_close( &ctx.gost, hash7 );
+
+     // Echo: one lane at a time, context re-initialised between lanes
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+     // Copy the first 32 bytes of each lane's buffer, lane i at state + 32*i
+     memcpy( state,     hash0, 32 );
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_phi1612_8way( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr )
+{    // Scan nonces 8 per pass with phi1612_8way_hash(); always returns 0.
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));   // 8 lanes x 8 words
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     const uint32_t first_nonce = pdata[19];
+     uint32_t n = first_nonce;
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+     int thr_id = mythr->id;  // index into work_restart[]
+     const uint32_t Htarg = ptarget[7];  // NOTE(review): read before the override below
+
+     if ( opt_benchmark )
+          ( (uint32_t*)ptarget )[7] = 0x0cff;   // benchmark: overwrite target word 7
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     do {
+           *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                 n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+        // nonces n .. n+7 are now blended into *noncev
+        phi1612_8way_hash( hash, vdata );
+        pdata[19] = n;
+        // lane i's result is hash[8*i .. 8*i+7]; word 7 is the quick filter
+        for ( int i = 0; i < 8; i++ )
+        if ( (hash+(i<<3))[7] <= Htarg )
+        if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n+i;   // nonce of the lane being submitted
+           submit_lane_solution( work, hash+(i<<3), mythr, i );
+        }
+        n += 8;
+     } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );  // NOTE(review): max_nonce-8 wraps if max_nonce < 8
+     *hashes_done = n - first_nonce;   // nonces tried in this call
+     return 0;
+}
+
+#elif defined(PHI1612_4WAY)
+
+
 typedef struct {
    skein512_4way_context   skein;
    jh512_4way_context      jh;
--- a/algo/x13/phi1612-gate.c
+++ b/algo/x13/phi1612-gate.c
@@ -2,7 +2,11 @@

 bool register_phi1612_algo( algo_gate_t* gate )
 {
-#if defined(PHI1612_4WAY)
+#if defined(PHI1612_8WAY)
+  init_phi1612_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_phi1612_8way;
+  gate->hash      = (void*)&phi1612_8way_hash;
+#elif defined(PHI1612_4WAY)
  init_phi1612_4way_ctx();
  gate->scanhash  = (void*)&scanhash_phi1612_4way;
  gate->hash      = (void*)&phi1612_4way_hash;
@@ -11,7 +15,7 @@ bool register_phi1612_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_phi1612;
  gate->hash      = (void*)&phi1612_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x13/phi1612-gate.h
+++ b/algo/x13/phi1612-gate.h
@@ -4,29 +4,35 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define PHI1612_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define PHI1612_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define PHI1612_4WAY 1
 #endif

 bool register_phi1612_algo( algo_gate_t* gate );

-#if defined(PHI1612_4WAY)
+#if defined(PHI1612_8WAY)
+
+void phi1612_8way_hash( void *state, const void *input );
+int scanhash_phi1612_8way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );
+void init_phi1612_8way_ctx();
+
+#elif defined(PHI1612_4WAY)

 void phi1612_4way_hash( void *state, const void *input );
-
 int scanhash_phi1612_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_phi1612_4way_ctx();

-#endif
+#else

 void phi1612_hash( void *state, const void *input );
-
 int scanhash_phi1612( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_phi1612_ctx();

 #endif
+#endif

--- a/algo/x13/skunk-4way.c
+++ b/algo/x13/skunk-4way.c
@@ -1,7 +1,4 @@
 #include "skunk-gate.h"
-
-#if defined(SKUNK_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -10,6 +7,146 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
+
+#if defined(SKUNK_8WAY)
+
+typedef struct {     // One context per stage of the 8-way Skunk chain.
+    skein512_8way_context skein;   // used once for all 8 lanes
+    cube_4way_context     cube;    // 4 lanes per pass, so used twice
+    sph_fugue512_context  fugue;   // used once per lane
+    sph_gost512_context   gost;    // used once per lane
+} skunk_8way_ctx_holder;
+
+static __thread skunk_8way_ctx_holder skunk_8way_ctx;
+
+// 8-way Skunk hash: Skein-512, CubeHash-512, Fugue-512, then GOST-512.
+//   input  - 8 interleaved block headers; 80 is the length given to Skein.
+//   output - receives 8 results of 32 bytes each, lane i at output + 32*i.
+// Skein handles all 8 lanes per call and CubeHash 4 lanes per call; Fugue and
+// GOST are called once per lane.
+void skunk_8way_hash( void *output, const void *input )
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     uint64_t *const lane[8] = { hash0, hash1, hash2, hash3,
+                                 hash4, hash5, hash6, hash7 };
+
+     // Work on a private copy of the pre-initialised per-thread contexts.
+     skunk_8way_ctx_holder ctx __attribute__ ((aligned (64)));
+     memcpy( &ctx, &skunk_8way_ctx, sizeof(skunk_8way_ctx) );
+
+     skein512_8way_update( &ctx.skein, input, 80 );
+     skein512_8way_close( &ctx.skein, vhash );
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                        hash7, vhash, 512 );
+
+     // CubeHash, lanes 0-3.
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     // CubeHash, lanes 4-7: the context was used above, so reset it first.
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     // Fugue, one lane at a time, re-initialising the context between lanes.
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+
+     // GOST, one lane at a time. Each 512-bit digest is closed into the lane's
+     // own 64-byte buffer and only 32 bytes are copied out; closing directly
+     // at output + 32*i would write 32 bytes past the end for the last lane.
+     for ( int i = 0; i < 8; i++ )
+     {
+        if ( i > 0 )
+           sph_gost512_init( &ctx.gost );
+        sph_gost512( &ctx.gost, lane[i], 64 );
+        sph_gost512_close( &ctx.gost, lane[i] );
+        memcpy( (char*)output + 32*i, lane[i], 32 );
+     }
+}
+
+int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scan nonces 8 per pass with skunk_8way_hash(); always returns 0.
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));   // 8 lanes x 8 words
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   const uint32_t Htarg = ptarget[7];  // NOTE(review): read before the override below
+   int thr_id = mythr->id;  // index into work_restart[]
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   if ( opt_benchmark )
+      ((uint32_t*)ptarget)[7] = 0x0cff;   // benchmark: overwrite target word 7
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+      // nonces n .. n+7 are now blended into *noncev
+      skunk_8way_hash( hash, vdata );
+      pdata[19] = n;
+      // lane i's result is hash[8*i .. 8*i+7]; word 7 is the quick filter
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
+      if ( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
+      {
+         pdata[19] = n+i;   // nonce of the lane being submitted
+         submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n +=8;
+   } while ( likely( ( n < max_nonce-8 ) && !(*restart) ) );  // NOTE(review): max_nonce-8 wraps if max_nonce < 8
+
+   *hashes_done = n - first_nonce;   // nonces tried in this call
+   return 0;
+}
+
+bool skunk_8way_thread_init()   // Initialise this thread's template contexts.
+{  // skunk_8way_ctx is __thread storage; skunk_8way_hash() copies it per call.
+   skein512_8way_init( &skunk_8way_ctx.skein );
+   cube_4way_init( &skunk_8way_ctx.cube, 512, 16, 32 );
+   sph_fugue512_init( &skunk_8way_ctx.fugue );
+   sph_gost512_init( &skunk_8way_ctx.gost );
+   return true;   // unconditional
+}
+
+#elif defined(SKUNK_4WAY)

 typedef struct {
    skein512_4way_context skein;
--- a/algo/x13/skunk-gate.c
+++ b/algo/x13/skunk-gate.c
@@ -2,12 +2,15 @@

 bool register_skunk_algo( algo_gate_t* gate )
 {
-   gate->optimizations = SSE2_OPT | AVX2_OPT;
-#if defined (SKUNK_4WAY)
+   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+#if defined (SKUNK_8WAY)
+   gate->miner_thread_init = (void*)&skunk_8way_thread_init;
+   gate->scanhash = (void*)&scanhash_skunk_8way;
+   gate->hash     = (void*)&skunk_8way_hash;
+#elif defined (SKUNK_4WAY)
   gate->miner_thread_init = (void*)&skunk_4way_thread_init;
   gate->scanhash = (void*)&scanhash_skunk_4way;
   gate->hash     = (void*)&skunk_4way_hash;
-//   init_skunk_4way_ctx();
 #else
   gate->miner_thread_init = (void*)&skunk_thread_init;
   gate->scanhash = (void*)&scanhash_skunk;
--- a/algo/x13/skunk-gate.h
+++ b/algo/x13/skunk-gate.h
@@ -4,29 +4,33 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
-  #define SKUNK_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define SKUNK_8WAY 1
+#elif defined(__AVX2__)
+  #define SKUNK_4WAY 1
 #endif

 bool register_skunk_algo( algo_gate_t* gate );

-#if defined(SKUNK_4WAY)
+#if defined(SKUNK_8WAY)
+
+void skunk_8way_hash( void *state, const void *input );
+int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+bool skunk_8way_thread_init();
+
+#elif defined(SKUNK_4WAY)

 void skunk_4way_hash( void *state, const void *input );
-
 int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
-
 bool skunk_4way_thread_init();
-//void init_skunk_4way_ctx();

 #endif

 void skunkhash( void *state, const void *input );
-
 int scanhash_skunk( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
-
 bool skunk_thread_init();

 #endif
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -127,6 +127,7 @@ void x17_4way_hash( void *state, const void *input )
     dintrlv_2x128_512( hash0, hash1, vhashA );
     dintrlv_2x128_512( hash2, hash3, vhashB );

+
     // 11 Echo serial
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
--- a/algo/yespower/yespower-blake2b.c
+++ b/algo/yespower/yespower-blake2b.c
@@ -49,6 +49,7 @@
 * no slowdown from the prefixes is generally observed on AMD CPUs supporting
 * XOP, some slowdown is sometimes observed on Intel CPUs with AVX.
 */
+/*
 #ifdef __XOP__
 #warning "Note: XOP is enabled.  That's great."
 #elif defined(__AVX__)
@@ -60,6 +61,7 @@
 #else
 #warning "Note: building generic code for non-x86.  That's OK."
 #endif
+*/

 /*
 * The SSE4 code version has fewer instructions than the generic SSE2 version,
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 # This script is not intended for users, it is only used for compile testing
-# during develpment. Howver the information contained my provide cimpilation
+# during development. However the information contained may provide compilation
 # tips to users.

 make distclean || echo clean
@@ -16,7 +16,8 @@ mv cpuminer cpuminer-avx512

 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
+# GCC 9 doesn't include AES with core-avx2
+CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-avx2.exe
@@ -25,10 +26,10 @@ mv cpuminer cpuminer-avx2

 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl
+CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
-mv cpuminer.exe cpuminer-aes-avx.exe
+mv cpuminer.exe cpuminer-avx.exe
 strip -s cpuminer
 mv cpuminer cpuminer-aes-avx

--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.11.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.2.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.9.11'
-PACKAGE_STRING='cpuminer-opt 3.9.11'
+PACKAGE_VERSION='3.10.2'
+PACKAGE_STRING='cpuminer-opt 3.10.2'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.9.11 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.10.2 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.9.11:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.10.2:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.9.11
+cpuminer-opt configure 3.10.2
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.9.11, which was
+It was created by cpuminer-opt $as_me 3.10.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.9.11'
+ VERSION='3.10.2'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.9.11, which was
+This file was extended by cpuminer-opt $as_me 3.10.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.9.11
+cpuminer-opt config.status 3.10.2
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.9.11])
+AC_INIT([cpuminer-opt], [3.10.2])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -53,6 +53,8 @@
 #if HAVE_SYS_PARAM_H
 #include <sys/param.h>
 #endif
+
+// Note: building with GCC 9 emits a warning that <sys/sysctl.h> is deprecated.
 #include <sys/sysctl.h>
 #endif
 #endif
@@ -3325,7 +3327,7 @@ static void show_credits()
 {
   printf("\n         **********  "PACKAGE_NAME" "PACKAGE_VERSION"  *********** \n");
   printf("     A CPU miner with multi algo support and optimized for CPUs\n");
-   printf("     with AES_NI and AVX2 and SHA extensions.\n");
+   printf("     with AES_NI, AVX2, AVX512 and SHA extensions.\n");
   printf("     BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
 }

@@ -3339,12 +3341,15 @@ bool check_cpu_capability ()
     bool cpu_has_avx2   = has_avx2();
     bool cpu_has_sha    = has_sha();
     bool cpu_has_avx512 = has_avx512();
+     bool cpu_has_vaes   = has_vaes();
     bool sw_has_aes    = false;
+     bool sw_has_sse2   = false;
     bool sw_has_sse42  = false;
     bool sw_has_avx    = false;
     bool sw_has_avx2   = false;
     bool sw_has_avx512 = false;
     bool sw_has_sha    = false;
+     bool sw_has_vaes   = false;
     set_t algo_features = algo_gate.optimizations;
     bool algo_has_sse2   = set_incl( SSE2_OPT,    algo_features );
     bool algo_has_aes    = set_incl( AES_OPT,     algo_features );
@@ -3352,17 +3357,22 @@ bool check_cpu_capability ()
     bool algo_has_avx2   = set_incl( AVX2_OPT,    algo_features );
     bool algo_has_avx512 = set_incl( AVX512_OPT,  algo_features );
     bool algo_has_sha    = set_incl( SHA_OPT,     algo_features );
+     bool algo_has_vaes   = set_incl( VAES_OPT,    algo_features );
     bool use_aes;
     bool use_sse2;
     bool use_sse42;
     bool use_avx2;
     bool use_avx512;
     bool use_sha;
+     bool use_vaes;
     bool use_none;

     #ifdef __AES__
       sw_has_aes = true;
     #endif
+     #ifdef __SSE2__
+         sw_has_sse2 = true;
+     #endif
     #ifdef __SSE4_2__
         sw_has_sse42 = true;
     #endif
@@ -3372,12 +3382,16 @@ bool check_cpu_capability ()
     #ifdef __AVX2__
         sw_has_avx2 = true;
     #endif
-     #if (defined(__AVX512F__) && defined(__AVX51DQF__) && defined(__AVX51BW__) && defined(__AVX512VL__))
+     #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
         sw_has_avx512 = true;
     #endif
     #ifdef __SHA__
         sw_has_sha = true;
     #endif
+     #ifdef __VAES__
+         sw_has_vaes = true;
+     #endif
+         

 //     #if !((__AES__) || (__SSE2__))
 //         printf("Neither __AES__ nor __SSE2__ defined.\n");
@@ -3397,33 +3411,36 @@ bool check_cpu_capability ()
     #endif

     printf("CPU features:");
-     if ( cpu_has_sse2   )    printf( " SSE2"   );
-     if ( cpu_has_aes    )    printf( " AES"    );
-     if ( cpu_has_sse42  )    printf( " SSE4.2" );
-     if ( cpu_has_avx    )    printf( " AVX"    );
-     if ( cpu_has_avx2   )    printf( " AVX2"   );
-     if ( cpu_has_avx512 )    printf( " AVX512" );
-     if ( cpu_has_sha    )    printf( " SHA"    );
+     if      ( cpu_has_vaes   )    printf( " VAES"   );
+     else if ( cpu_has_aes    )    printf( " AES"    );
+     if      ( cpu_has_sha    )    printf( " SHA"    );
+     if      ( cpu_has_avx512 )    printf( " AVX512" );
+     else if ( cpu_has_avx2   )    printf( " AVX2"   );
+     else if ( cpu_has_avx    )    printf( " AVX"    );
+     else if ( cpu_has_sse42  )    printf( " SSE4.2" );
+     else if ( cpu_has_sse2   )    printf( " SSE2"   );

-     printf(".\nSW features: SSE2");
-     if ( sw_has_aes    )     printf( " AES"    );
-     if ( sw_has_sse42  )     printf( " SSE4.2" );
-     if ( sw_has_avx    )     printf( " AVX"    );
-     if ( sw_has_avx2   )     printf( " AVX2"   );
-     if ( sw_has_avx512 )     printf( " AVX512" );
-     if ( sw_has_sha    )     printf( " SHA"    );
-    
+     printf(".\nSW features:");
+     if      ( sw_has_vaes   )    printf( " VAES"   );
+     else if ( sw_has_aes    )    printf( " AES"    );
+     if      ( sw_has_sha    )    printf( " SHA"    );
+     if      ( sw_has_avx512 )    printf( " AVX512" );
+     else if ( sw_has_avx2   )    printf( " AVX2"   );
+     else if ( sw_has_avx    )    printf( " AVX"    );
+     else if ( sw_has_sse42  )    printf( " SSE4.2" );
+     else if ( sw_has_sse2   )    printf( " SSE2"   );

     printf(".\nAlgo features:");
     if ( algo_features == EMPTY_SET ) printf( " None" );
     else
     {
-        if ( algo_has_sse2   ) printf( " SSE2"    );
-        if ( algo_has_aes    ) printf( " AES"     );
-        if ( algo_has_sse42  ) printf( " SSE4.2"  );
-        if ( algo_has_avx2   ) printf( " AVX2"   );
-        if ( algo_has_avx512 ) printf( " AVX512" );
-        if ( algo_has_sha    ) printf( " SHA"    );
+        if      ( algo_has_vaes   )    printf( " VAES"   );
+        else if ( algo_has_aes    )    printf( " AES"    );
+        if      ( algo_has_sha    )    printf( " SHA"    );
+        if      ( algo_has_avx512 )    printf( " AVX512" );
+        else if ( algo_has_avx2   )    printf( " AVX2"   );
+        else if ( algo_has_sse42  )    printf( " SSE4.2" );
+        else if ( algo_has_sse2   )    printf( " SSE2"   );
     }
     printf(".\n");

@@ -3461,20 +3478,22 @@ bool check_cpu_capability ()
     use_avx2   = cpu_has_avx2   && sw_has_avx2   && algo_has_avx2;
     use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
     use_sha    = cpu_has_sha    && sw_has_sha    && algo_has_sha;
+     use_vaes   = cpu_has_vaes   && sw_has_vaes   && algo_has_vaes;
     use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
-                   use_sha );
+                   use_sha || use_vaes );
      
     // Display best options
     printf( "Start mining with" );
     if         ( use_none ) printf( " no optimizations" );
     else
     {
-        if      ( use_aes    ) printf( " AES"  );
+        if      ( use_vaes   ) printf( " VAES"   );
+        else if ( use_aes    ) printf( " AES"    );
        if      ( use_avx512 ) printf( " AVX512" );
-        else if ( use_avx2   ) printf( " AVX2" );
-        else if ( use_sse42  ) printf( " SSE4.2"  );
-        else if ( use_sse2   ) printf( " SSE2" );
-        if      ( use_sha    ) printf( " SHA"  );
+        else if ( use_avx2   ) printf( " AVX2"   );
+        else if ( use_sse42  ) printf( " SSE4.2" );
+        else if ( use_sse2   ) printf( " SSE2"   );
+        if      ( use_sha    ) printf( " SHA"    );
     }
     printf( ".\n\n" );

--- a/junk/Android.mk
+++ b/junk/Android.mk
--- a/junk/cpuminer.vcxproj
+++ b/junk/cpuminer.vcxproj
--- a/junk/cpuminer.vcxproj.filters
+++ b/junk/cpuminer.vcxproj.filters
--- a/junk/mingw64.sh
+++ b/junk/mingw64.sh
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	a17ff6f189	v3.10.2	2019-12-09 15:59:02 -05:00
Jay D Dee	73430b13b1	v3.10.1	2019-12-05 19:09:23 -05:00
Jay D Dee	40039386a0	v3.10.0	2019-12-03 12:26:11 -05:00