v3.10.5

v3.10.2
v3.10.1
2025-09-17 23:44:27 +00:00 · 2019-12-21 13:19:29 -05:00 · 2019-12-09 15:59:02 -05:00 · 2019-12-05 19:09:23 -05:00
119 changed files with 15642 additions and 2053 deletions
--- a/71
+++ b/71
@@ -1,12 +1,14 @@


-Requirements:
+1. Requirements:
+---------------

 Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
 supported.
 64 bit Linux operating system. Apple is not supported.

-Building on linux prerequisites:
+2. Building on linux prerequisites:
+-----------------------------------

 It is assumed users know how to install packages on their system and
 are able to compile standard source packages. This is basic Linux and
@@ -20,41 +22,74 @@ http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu

 Install any additional dependencies needed by cpuminer-opt. The list below
 are some of the ones that may not be in the default install and need to
-be installed manually. There may be others, read the error messages they
-will give a clue as to the missing package.
+be installed manually. There may be others, read the compiler error messages,
+they will give a clue as to the missing package.

 The following command should install everything you need on Debian based
 distributions such as Ubuntu. Fedora and other distributions may have similar
-but different package names.
+but different package names. 

-sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev
+$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git

 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
-openssl 1.1.0e or higher. Add one of the following, depending on the
-compiler version, to CFLAGS:
-"-march=native" or "-march=znver1" or "-msha".
+openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
+support depending on your CPU and compiler version:
+
+"-march=native" is always the best choice
+
+"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
+
+"-msha"  Add SHA to other tuning options

 Additional instructions for static compilation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
 Static builds should only be considered in a homogeneous HW and SW environment.
 Local builds will always have the best performance and compatibility.

-Extract cpuminer source.
+3. Download cpuminer-opt
+------------------------

-tar xvzf cpuminer-opt-x.y.z.tar.gz
-cd cpuminer-opt-x.y.z
+Download the source code for the latest release from the official repository.

-Run ./build.sh to build on Linux or execute the following commands.
+https://github.com/JayDDee/cpuminer-opt/releases

-./autogen.sh
-CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
-make
+Extract the source code.

-Start mining.
+$ tar xvzf cpuminer-opt-x.y.z.tar.gz
+
+
+Alternatively it can be cloned from git.
+
+$ git clone https://github.com/JayDDee/cpuminer-opt.git
+ 
+4. Build cpuminer-opt
+---------------------
+
+It is recommended to build with default options, this will usually
+produce the best results.
+
+Run ./build.sh to build on Linux or execute the following commands.
+
+or 
+
+$ ./autogen.sh
+$ CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
+$ make -j n
+
+n is the number of threads.
+
+5. Start mining.
+----------------
+
+$ ./cpuminer -a algo -o url -u username -p password

-./cpuminer -a algo -o url -u username -p password

 Windows
+-------
+
+See also INSTALL_WINDOWS
+
+The following procedure is obsolete and uses an old compiler.

 Precompiled Windows binaries are built on a Linux host using Mingw
 with a more recent compiler than the following Windows hosted procedure.
--- a/Makefile.am
+++ b/Makefile.am
@@ -124,6 +124,8 @@ cpuminer_SOURCES = \
  algo/luffa/luffa-hash-2way.c \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
+  algo/lyra2/sponge-2way.c \
+  algo/lyra2/lyra2-hash-2way.c \
  algo/lyra2/lyra2-gate.c \
  algo/lyra2/lyra2rev2.c \
  algo/lyra2/lyra2rev2-4way.c \
--- a/README.md
+++ b/README.md
@@ -144,6 +144,9 @@ Supported Algorithms
 Errata
 ------

+Old algorithms that are no longer used frequently will not have the latest
+optimizations.
+
 Cryptonight and variants are no longer supported, use another miner.

 Neoscrypt crashes on Windows, use legacy version.
--- a/README.txt
+++ b/README.txt
@@ -15,20 +15,28 @@ the features listed at cpuminer startup to ensure you are mining at
 optimum speed using the best available features.

 Architecture names and compile options used are only provided for Intel
-Core series. Even the newest Pentium and Celeron CPUs are often missing
-features.
+Core series. Budget CPUs like Pentium and Celeron are often missing the
+latest features.

 AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
 supported by cpuminer-opt due to an incompatible implementation of SSE2 on
 these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.

+More information for Intel and AMD CPU architectures and their features
+can be found on Wikipedia.
+
+https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
+
+https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
+
+
 Exe name                Compile flags            Arch name

 cpuminer-sse2.exe      "-msse2"                  Core2, Nehalem   
 cpuminer-aes-sse42.exe "-march=westmere"         Westmere
-cpuminer-avx.exe       "-march=corei7-avx"       Sandy-Ivybridge
-cpuminer-avx2.exe      "-march=core-avx2"        Haswell, Sky-Kaby-Coffeelake
+cpuminer-avx.exe       "-march=corei7-avx"       Sandybridge
+cpuminer-avx2.exe      "-march=core-avx2 -maes"  Haswell, Skylake, Coffeelake
 cpuminer-avx512.exe    "-march=skylake-avx512"   Skylake-X, Cascadelake-X
 cpuminer-zen           "-march=znver1"           AMD Ryzen, Threadripper

--- a/39
+++ b/39
@@ -1,6 +1,8 @@
 cpuminer-opt is a console program run from the command line using the
 keyboard, not the mouse.

+See also README.md for the list of supported algorithms.
+
 Security warning
 ----------------

@@ -25,15 +27,44 @@ Requirements
 Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
 supported.

-64 bit Linux or Windows operating system. Apple and Android are not supported.
-FreeBSD YMMV.
+64 bit Linux or Windows operating system. Apple, Android and Rpi are
+not supported. FreeBSD YMMV.

 Change Log
 ----------

+v3.10.5
+
+AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2. 
+Faster hmq1725 AVX2.
+
+v3.10.4
+
+AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).
+
+v3.10.3
+
+AVX512 for x12, x13, x14, x15.
+Fixed x12 AVX2 invalid shares.
+
+v3.10.2
+
+AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
+Fixed c11 AVX2 invalid shares.
+
+v3.10.1
+
+AVX512 for blake2b, nist5, quark, tribus.
+
+More broken lane fixes, fixed buffer overflow in skein AVX512, fixed
+quark invalid shares AVX2.
+
+Only the highest ranking feature in a class is listed at startup, lower ranking
+features are available but no longer listed.
+
 v3.10.0

-AVX-512 is now supported on selected algos, Windows binary is now available.
+AVX512 is now supported on selected algos, Windows binary is now available.
 AVX512 optimizations are available for argon2d, blake2s, keccak, keccakc,
 skein & skein2.

@@ -45,7 +76,7 @@ Fixed some previously undetected buffer overflows.

 Lyra2rev2 3% faster SSE2 and AVX2.

-Added "-fno-asynchronous-unwind-tables" to AVX512 build acript for Windows
+Added "-fno-asynchronous-unwind-tables" to AVX512 build script for Windows
 to fix known mingw issue.

 Changed AVX2 build script to explicitly add AES to address change in
--- a/algo/argon2/argon2d/argon2d/opt.c
+++ b/algo/argon2/argon2d/argon2d/opt.c
@@ -21,7 +21,7 @@

 #include "argon2.h"
 #include "core.h"
-
+#include "simd-utils.h"
 #include "../blake2/blake2.h"
 #include "../blake2/blamka-round-opt.h"

@@ -37,24 +37,28 @@

 #if defined(__AVX512F__)

-static void fill_block(__m512i *state, const block *ref_block,
-                       block *next_block, int with_xor) {
+static void fill_block( __m512i *state, const block *ref_block,
+                       block *next_block, int with_xor )
+{
    __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
    unsigned int i;

-    if (with_xor) {
-        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
-            state[i] = _mm512_xor_si512(
-                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
-            block_XY[i] = _mm512_xor_si512(
-                state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
-        }
-    } else {
-        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
-            block_XY[i] = state[i] = _mm512_xor_si512(
-                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+    if ( with_xor )
+    {
+        for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
+        {
+            state[i] = _mm512_xor_si512( state[i],
+                      _mm512_load_si512( (const __m512i*)ref_block->v + i ) );
+            block_XY[i] = _mm512_xor_si512( state[i],
+                      _mm512_load_si512( (const __m512i*)next_block->v + i ) );
        }
    }
+    else
+    {
+        for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
+            block_XY[i] = state[i] = _mm512_xor_si512( state[i],
+                      _mm512_load_si512( (const __m512i*)ref_block->v + i ) );
+    }

    BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
                    state[ 4], state[ 5], state[ 6], state[ 7] );
@@ -66,23 +70,10 @@ static void fill_block(__m512i *state, const block *ref_block,
    BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
                    state[ 9], state[11], state[13], state[15] );

-/*
-    for (i = 0; i < 2; ++i) {
-        BLAKE2_ROUND_1(
-            state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
-            state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
-    }
-
-    for (i = 0; i < 2; ++i) {
-        BLAKE2_ROUND_2(
-            state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
-            state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
-    }
-*/
-
-    for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
-        state[i] = _mm512_xor_si512(state[i], block_XY[i]);
-        _mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
+    for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
+    {
+        state[i] = _mm512_xor_si512( state[i], block_XY[i] );
+        _mm512_store_si512( (__m512i*)next_block->v + i, state[i] );
    }
 }

@@ -125,18 +116,6 @@ static void fill_block(__m256i *state, const block *ref_block,
    BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
                    state[19], state[23], state[27], state[31] );

-/*
-    for (i = 0; i < 4; ++i) {
-        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
-                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
-    }
-
-    for (i = 0; i < 4; ++i) {
-        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
-                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
-    }
-*/
-
    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
        _mm256_store_si256((__m256i *)next_block->v + i, state[i]);
@@ -153,14 +132,14 @@ static void fill_block(__m128i *state, const block *ref_block,
    if (with_xor) {
        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
            state[i] = _mm_xor_si128(
-                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+                state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
            block_XY[i] = _mm_xor_si128(
-                state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
+                state[i], _mm_load_si128((const __m128i *)next_block->v + i));
        }
    } else {
        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
            block_XY[i] = state[i] = _mm_xor_si128(
-                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+                state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
        }
    }

@@ -198,22 +177,9 @@ static void fill_block(__m128i *state, const block *ref_block,
    BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],  
                  state[39], state[47], state[55], state[63] );

-/*
-    for (i = 0; i < 8; ++i) {
-        BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
-            state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
-            state[8 * i + 6], state[8 * i + 7]);
-    }
-
-    for (i = 0; i < 8; ++i) {
-        BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
-            state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
-            state[8 * 6 + i], state[8 * 7 + i]);
-    }
-*/
    for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
        state[i] = _mm_xor_si128(state[i], block_XY[i]);
-        _mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
+        _mm_store_si128((__m128i *)next_block->v + i, state[i]);
    }
 }

--- a/algo/argon2/argon2d/blake2/blamka-round-opt.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h
@@ -184,10 +184,10 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {

 #include <immintrin.h>

-#define  rotr32  mm256_swap32_64
-#define  rotr24  mm256_ror3x8_64
-#define  rotr16  mm256_ror1x16_64
-#define  rotr63( x ) mm256_rol_64( x, 1 )
+#define  rotr32( x )  mm256_ror_64( x, 32 )
+#define  rotr24( x )  mm256_ror_64( x, 24 )
+#define  rotr16( x )  mm256_ror_64( x, 16 )
+#define  rotr63( x )  mm256_rol_64( x,  1 )

 //#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
 //#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
@@ -427,14 +427,14 @@ static __m512i muladd(__m512i x, __m512i y)
 #define SWAP_QUARTERS(A0, A1) \
    do { \
        SWAP_HALVES(A0, A1); \
-        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
-        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
+        A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
    } while((void)0, 0)

 #define UNSWAP_QUARTERS(A0, A1) \
    do { \
-        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
-        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
+        A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
        SWAP_HALVES(A0, A1); \
    } while((void)0, 0)

--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -70,19 +70,22 @@ typedef struct {
 // Default 14 rounds
 typedef blake_4way_small_context blake256_4way_context;
 void blake256_4way_init(void *ctx);
-void blake256_4way(void *ctx, const void *data, size_t len);
+void blake256_4way_update(void *ctx, const void *data, size_t len);
+#define blake256_4way blake256_4way_update
 void blake256_4way_close(void *ctx, void *dst);

 // 14 rounds, blake, decred
 typedef blake_4way_small_context blake256r14_4way_context;
 void blake256r14_4way_init(void *cc);
-void blake256r14_4way(void *cc, const void *data, size_t len);
+void blake256r14_4way_update(void *cc, const void *data, size_t len);
+#define blake256r14_4way blake256r14_4way_update
 void blake256r14_4way_close(void *cc, void *dst);

 // 8 rounds, blakecoin, vanilla
 typedef blake_4way_small_context blake256r8_4way_context;
 void blake256r8_4way_init(void *cc);
-void blake256r8_4way(void *cc, const void *data, size_t len);
+void blake256r8_4way_update(void *cc, const void *data, size_t len);
+#define blake256r8_4way blake256r8_4way_update
 void blake256r8_4way_close(void *cc, void *dst);

 #ifdef __AVX2__
@@ -100,38 +103,93 @@ typedef struct {
 // Default 14 rounds
 typedef blake_8way_small_context blake256_8way_context;
 void blake256_8way_init(void *cc);
-void blake256_8way(void *cc, const void *data, size_t len);
+void blake256_8way_update(void *cc, const void *data, size_t len);
+#define blake256_8way blake256_8way_update
 void blake256_8way_close(void *cc, void *dst);

 // 14 rounds, blake, decred
 typedef blake_8way_small_context blake256r14_8way_context;
 void blake256r14_8way_init(void *cc);
-void blake256r14_8way(void *cc, const void *data, size_t len);
+void blake256r14_8way_update(void *cc, const void *data, size_t len);
 void blake256r14_8way_close(void *cc, void *dst);

 // 8 rounds, blakecoin, vanilla
 typedef blake_8way_small_context blake256r8_8way_context;
 void blake256r8_8way_init(void *cc);
-void blake256r8_8way(void *cc, const void *data, size_t len);
+void blake256r8_8way_update(void *cc, const void *data, size_t len);
+#define blake256r8_8way blake256r8_8way_update
 void blake256r8_8way_close(void *cc, void *dst);

 // Blake-512 4 way

 typedef struct {
-   __m256i buf[16] __attribute__ ((aligned (64)));
+   __m256i buf[16];
   __m256i H[8];
   __m256i S[4];   
   size_t ptr;
   sph_u64 T0, T1;
-} blake_4way_big_context;
+} blake_4way_big_context __attribute__ ((aligned (128)));

 typedef blake_4way_big_context blake512_4way_context;

-void blake512_4way_init(void *cc);
-void blake512_4way(void *cc, const void *data, size_t len);
-void blake512_4way_close(void *cc, void *dst);
-void blake512_4way_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
+void blake512_4way_init( blake_4way_big_context *sc );
+void blake512_4way_update( void *cc, const void *data, size_t len );
+#define blake512_4way blake512_4way_update
+void blake512_4way_close( void *cc, void *dst );
+void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                      void *dst );
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+//Blake-256 16 way
+
+typedef struct {
+   __m512i buf[16];
+   __m512i H[8];
+   size_t ptr;
+   uint32_t T0, T1;
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
+} blake_16way_small_context __attribute__ ((aligned (128)));
+
+// Default 14 rounds
+typedef blake_16way_small_context blake256_16way_context;
+void blake256_16way_init(void *cc);
+void blake256_16way_update(void *cc, const void *data, size_t len);
+void blake256_16way_close(void *cc, void *dst);
+
+// 14 rounds, blake, decred
+typedef blake_16way_small_context blake256r14_16way_context;
+void blake256r14_16way_init(void *cc);
+void blake256r14_16way_update(void *cc, const void *data, size_t len);
+void blake256r14_16way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_16way_small_context blake256r8_16way_context;
+void blake256r8_16way_init(void *cc);
+void blake256r8_16way_update(void *cc, const void *data, size_t len);
+void blake256r8_16way_close(void *cc, void *dst);
+
+
+// Blake-512 8 way
+
+typedef struct {
+   __m512i buf[16];
+   __m512i H[8];
+   __m512i S[4];
+   size_t ptr;
+   sph_u64 T0, T1;
+} blake_8way_big_context __attribute__ ((aligned (128)));
+
+typedef blake_8way_big_context blake512_8way_context;
+
+void blake512_8way_init( blake_8way_big_context *sc );
+void blake512_8way_update( void *cc, const void *data, size_t len );
+void blake512_8way_close( void *cc, void *dst );
+void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                      void *dst );
+
+#endif  // AVX512
+

 #endif  // AVX2

--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -634,7 +634,7 @@ do { \
                              m256_const1_64( 0x082EFA98082EFA98 ) ); \
   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
                              m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
-   shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
+   shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
@@ -680,6 +680,144 @@ do { \
 } while (0)


+#endif
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// Blake-256 16 way AVX512
+
+#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
+do { \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
+                         _mm512_xor_si512( _mm512_set1_epi32( c1 ), m0 ) ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
+                         _mm512_xor_si512( _mm512_set1_epi32( c0 ), m1 ) ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
+} while (0)
+
+#define ROUND_S_16WAY(r)   do { \
+        GS_16WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
+        GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
+        GS_16WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
+        GS_16WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
+        GS_16WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
+        GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
+        GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
+        GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
+} while (0)
+
+#define DECL_STATE32_16WAY \
+   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
+   sph_u32 T0, T1;
+
+#define READ_STATE32_16WAY(state) \
+do { \
+   H0 = (state)->H[0]; \
+   H1 = (state)->H[1]; \
+   H2 = (state)->H[2]; \
+   H3 = (state)->H[3]; \
+   H4 = (state)->H[4]; \
+   H5 = (state)->H[5]; \
+   H6 = (state)->H[6]; \
+   H7 = (state)->H[7]; \
+   T0 = (state)->T0; \
+   T1 = (state)->T1; \
+} while (0)
+
+#define WRITE_STATE32_16WAY(state) \
+do { \
+   (state)->H[0] = H0; \
+   (state)->H[1] = H1; \
+   (state)->H[2] = H2; \
+   (state)->H[3] = H3; \
+   (state)->H[4] = H4; \
+   (state)->H[5] = H5; \
+   (state)->H[6] = H6; \
+   (state)->H[7] = H7; \
+   (state)->T0 = T0; \
+   (state)->T1 = T1; \
+} while (0)
+
+#define COMPRESS32_16WAY( rounds ) \
+do { \
+   __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
+   __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
+   __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
+   __m512i V8, V9, VA, VB, VC, VD, VE, VF; \
+   __m512i shuf_bswap32; \
+   V0 = H0; \
+   V1 = H1; \
+   V2 = H2; \
+   V3 = H3; \
+   V4 = H4; \
+   V5 = H5; \
+   V6 = H6; \
+   V7 = H7; \
+   V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
+   V9 = m512_const1_64( 0x85A308D385A308D3 ); \
+   VA = m512_const1_64( 0x13198A2E13198A2E ); \
+   VB = m512_const1_64( 0x0370734403707344 ); \
+   VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
+                              m512_const1_64( 0xA4093822A4093822 ) ); \
+   VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
+                              m512_const1_64( 0x299F31D0299F31D0 ) ); \
+   VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
+                              m512_const1_64( 0x082EFA98082EFA98 ) ); \
+   VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
+                              m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
+   shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
+                                 0x2c2d2e2f28292a2b, 0x2425262720212223, \
+                                 0x1c1d1e1f18191a1b, 0x1415161710111213, \
+                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+   M0 = _mm512_shuffle_epi8( * buf    , shuf_bswap32 ); \
+   M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
+   M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
+   M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
+   M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
+   M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
+   M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
+   M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
+   M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
+   M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
+   MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
+   MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
+   MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
+   MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
+   ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
+   MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
+   ROUND_S_16WAY(0); \
+   ROUND_S_16WAY(1); \
+   ROUND_S_16WAY(2); \
+   ROUND_S_16WAY(3); \
+   ROUND_S_16WAY(4); \
+   ROUND_S_16WAY(5); \
+   ROUND_S_16WAY(6); \
+   ROUND_S_16WAY(7); \
+   if (rounds == 14) \
+   { \
+      ROUND_S_16WAY(8); \
+      ROUND_S_16WAY(9); \
+      ROUND_S_16WAY(0); \
+      ROUND_S_16WAY(1); \
+      ROUND_S_16WAY(2); \
+      ROUND_S_16WAY(3); \
+   } \
+   H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
+   H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
+   H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
+   H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
+   H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
+   H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
+   H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
+   H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
+} while (0)
+
 #endif

 // Blake-256 4 way
@@ -916,6 +1054,179 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

 #endif

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+//Blake-256 16 way AVX512
+
+static void
+blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
+                   const sph_u32 *salt, int rounds )
+{
+   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E6676A09E667 );
+   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE85BB67AE85 );
+   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF3723C6EF372 );
+   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53AA54FF53A );
+   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527F510E527F );
+   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C9B05688C );
+   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9AB1F83D9AB );
+   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD195BE0CD19 );
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
+   sc->rounds = rounds;
+}
+
+static void
+blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
+{
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf;
+   size_t ptr;
+   const int buf_size = 64;   // number of elements, sizeof/4
+   DECL_STATE32_16WAY
+   buf = sc->buf;
+   ptr = sc->ptr;
+   if ( len < buf_size - ptr )
+   {
+        memcpy_512( buf + (ptr>>2), vdata, len>>2 );
+        ptr += len;
+        sc->ptr = ptr;
+        return;
+   }
+   READ_STATE32_16WAY(sc);
+   while ( len > 0 )
+   {
+      size_t clen;
+
+      clen = buf_size - ptr;
+      if (clen > len)
+           clen = len;
+      memcpy_512( buf + (ptr>>2), vdata, clen>>2 );
+      ptr += clen;
+      vdata += (clen>>2);
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+          if ( ( T0 = T0 + 512 ) < 512 )
+                T1 = T1 + 1;
+          COMPRESS32_16WAY( sc->rounds );
+          ptr = 0;
+      }
+   }
+   WRITE_STATE32_16WAY(sc);
+   sc->ptr = ptr;
+}
+
+static void
+blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
+                    void *dst, size_t out_size_w32 )
+{
+   __m512i buf[16];
+   size_t ptr;
+   unsigned bit_len;
+   sph_u32 th, tl;
+
+   ptr = sc->ptr;
+   bit_len = ((unsigned)ptr << 3);
+   buf[ptr>>2] = m512_const1_64( 0x0000008000000080ULL );
+   tl = sc->T0 + bit_len;
+   th = sc->T1;
+
+   if ( ptr == 0 )
+   {
+        sc->T0 = 0xFFFFFE00UL;
+        sc->T1 = 0xFFFFFFFFUL;
+   }
+   else if ( sc->T0 == 0 )
+   {
+        sc->T0 = 0xFFFFFE00UL + bit_len;
+        sc->T1 = sc->T1 - 1;
+   }
+   else
+        sc->T0 -= 512 - bit_len;
+
+   if ( ptr <= 52 )
+   {
+       memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
+       if ( out_size_w32 == 8 )
+           buf[52>>2] = _mm512_or_si512( buf[52>>2],
+                                m512_const1_64( 0x0100000001000000ULL ) );
+       buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
+       buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
+       blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
+   }
+   else
+   {
+        memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+        blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
+        sc->T0 = 0xFFFFFE00UL;
+        sc->T1 = 0xFFFFFFFFUL;
+        memset_zero_512( buf, 56>>2 );
+       if ( out_size_w32 == 8 )
+           buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
+        buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
+        buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
+        blake32_16way( sc, buf, 64 );
+   }
+   mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
+}
+
+void
+blake256_16way_init(void *cc)
+{
+   blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
+}
+
+void
+blake256_16way_update(void *cc, const void *data, size_t len)
+{
+        blake32_16way(cc, data, len);
+}
+
+void
+blake256_16way_close(void *cc, void *dst)
+{
+        blake32_16way_close(cc, 0, 0, dst, 8);
+}
+
+void blake256r14_16way_init(void *cc)
+{
+   blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
+}
+
+void
+blake256r14_16way_update(void *cc, const void *data, size_t len)
+{
+   blake32_16way(cc, data, len);
+}
+
+void
+blake256r14_16way_close(void *cc, void *dst)
+{
+   blake32_16way_close(cc, 0, 0, dst, 8);
+}
+
+void blake256r8_16way_init(void *cc)
+{
+   blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
+}
+
+void
+blake256r8_16way_update(void *cc, const void *data, size_t len)
+{
+   blake32_16way(cc, data, len);
+}
+
+void
+blake256r8_16way_close(void *cc, void *dst)
+{
+   blake32_16way_close(cc, 0, 0, dst, 8);
+}
+
+#endif // AVX512
+
+
+
 // Blake-256 4 way

 // default 14 rounds, backward compatibility
@@ -948,7 +1259,7 @@ blake256_8way_init(void *cc)
 }

 void
-blake256_8way(void *cc, const void *data, size_t len)
+blake256_8way_update(void *cc, const void *data, size_t len)
 {
        blake32_8way(cc, data, len);
 }
@@ -968,7 +1279,7 @@ void blake256r14_4way_init(void *cc)
 }

 void
-blake256r14_4way(void *cc, const void *data, size_t len)
+blake256r14_4way_update(void *cc, const void *data, size_t len)
 {
   blake32_4way(cc, data, len);
 }
@@ -987,7 +1298,7 @@ void blake256r14_8way_init(void *cc)
 }

 void
-blake256r14_8way(void *cc, const void *data, size_t len)
+blake256r14_8way_update(void *cc, const void *data, size_t len)
 {
   blake32_8way(cc, data, len);
 }
@@ -1007,7 +1318,7 @@ void blake256r8_4way_init(void *cc)
 }

 void
-blake256r8_4way(void *cc, const void *data, size_t len)
+blake256r8_4way_update(void *cc, const void *data, size_t len)
 {
   blake32_4way(cc, data, len);
 }
@@ -1026,7 +1337,7 @@ void blake256r8_8way_init(void *cc)
 }

 void
-blake256r8_8way(void *cc, const void *data, size_t len)
+blake256r8_8way_update(void *cc, const void *data, size_t len)
 {
   blake32_8way(cc, data, len);
 }
--- a/algo/blake/blake2b-4way.c
+++ b/algo/blake/blake2b-4way.c
@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[25]);   // 3*8+1
+   uint32_t *hash7 = &(hash[49]);   // 3*16+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   int thr_id = mythr->id;
--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -42,21 +42,13 @@
 extern "C"{
 #endif

-#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
-#define SPH_SMALL_FOOTPRINT_BLAKE   1
-#endif
-
-#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
-#define SPH_COMPACT_BLAKE_64   1
-#endif
-
 #ifdef _MSC_VER
 #pragma warning (disable: 4146)
 #endif

-
-// Blake-512
-
+// Blake-512 common
+   
+/*
 static const sph_u64 IV512[8] = {
 	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
 	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
@@ -64,10 +56,7 @@ static const sph_u64 IV512[8] = {
 	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
 };

-
-#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
-
-// Blake-256 4 & 8 way, Blake-512 4 way
+static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };

 static const unsigned sigma[16][16] = {
 	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
@@ -88,7 +77,17 @@ static const unsigned sigma[16][16] = {
 	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
 };

-#endif
+static const sph_u64 CB[16] = {
+   SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
+   SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
+   SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
+   SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
+   SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
+   SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
+   SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
+   SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
+
+*/

 #define Z00   0
 #define Z01   1
@@ -264,8 +263,6 @@ static const unsigned sigma[16][16] = {
 #define Mx_(n)      Mx__(n)
 #define Mx__(n)     M ## n

-// Blake-512 4 way
-
 #define CBx(r, i)   CBx_(Z ## r ## i)
 #define CBx_(n)     CBx__(n)
 #define CBx__(n)    CB ## n
@@ -287,21 +284,288 @@ static const unsigned sigma[16][16] = {
 #define CBE   SPH_C64(0x0801F2E2858EFC16)
 #define CBF   SPH_C64(0x636920D871574E69)

-#if SPH_COMPACT_BLAKE_64
-// not used
-static const sph_u64 CB[16] = {
-	SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
-	SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
-	SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
-	SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
-	SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
-	SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
-	SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
-	SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
-};
+#define READ_STATE64(state)   do { /* load H[0..7], S[0..3], T0, T1 from *state into same-named locals, which must already be declared at the expansion site */ \
+      H0 = (state)->H[0]; \
+      H1 = (state)->H[1]; \
+      H2 = (state)->H[2]; \
+      H3 = (state)->H[3]; \
+      H4 = (state)->H[4]; \
+      H5 = (state)->H[5]; \
+      H6 = (state)->H[6]; \
+      H7 = (state)->H[7]; \
+      S0 = (state)->S[0]; \
+      S1 = (state)->S[1]; \
+      S2 = (state)->S[2]; \
+      S3 = (state)->S[3]; \
+      T0 = (state)->T0; \
+      T1 = (state)->T1; \
+   } while (0)

-#endif
+#define WRITE_STATE64(state)   do { /* inverse of READ_STATE64: store locals H0..H7, S0..S3, T0, T1 back into *state */ \
+      (state)->H[0] = H0; \
+      (state)->H[1] = H1; \
+      (state)->H[2] = H2; \
+      (state)->H[3] = H3; \
+      (state)->H[4] = H4; \
+      (state)->H[5] = H5; \
+      (state)->H[6] = H6; \
+      (state)->H[7] = H7; \
+      (state)->S[0] = S0; \
+      (state)->S[1] = S1; \
+      (state)->S[2] = S2; \
+      (state)->S[3] = S3; \
+      (state)->T0 = T0; \
+      (state)->T1 = T1; \
+   } while (0)

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// Blake-512 8 way AVX512
+
+#define GB_8WAY(m0, m1, c0, c1, a, b, c, d)   do { /* one mixing step on 64-bit elements; a, b, c, d are updated in place and named several times, so pass plain variables */ \
+   a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
+                 _mm512_set1_epi64( c1 ), m0 ), b ), a ); /* a += b + (m0 ^ c1), c1 broadcast to every element */ \
+   d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
+   c = _mm512_add_epi64( c, d ); \
+   b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
+   a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
+                 _mm512_set1_epi64( c0 ), m1 ), b ), a ); /* a += b + (m1 ^ c0) */ \
+   d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
+   c = _mm512_add_epi64( c, d ); \
+   b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
+} while (0)
+
+#define ROUND_B_8WAY(r)   do { /* one round = eight GB_8WAY steps; r must be a single hex digit token because Mx/CBx paste it into Z<r><i> */ \
+   GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); /* first four steps: (V0,V4,V8,VC) .. (V3,V7,VB,VF) */ \
+   GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
+   GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
+   GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
+   GB_8WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); /* last four steps: same V0..V3 paired with rotated V4..VF */ \
+   GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
+   GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
+   GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
+   } while (0)
+
+#define DECL_STATE64_8WAY /* declares the locals that READ_STATE64, WRITE_STATE64 and COMPRESS64_8WAY refer to by name */ \
+   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
+        __m512i S0, S1, S2, S3; \
+   sph_u64 T0, T1;
+
+#define COMPRESS64_8WAY   do /* folds the 16 vectors at buf[0..15] into H0..H7; expects buf, H0..H7, S0..S3, T0, T1 in scope at the expansion site */ \
+{ \
+  __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
+  __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
+  __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
+  __m512i V8, V9, VA, VB, VC, VD, VE, VF; \
+  __m512i shuf_bswap64; \
+  V0 = H0; /* V0..V7 start as a copy of the chaining value */ \
+  V1 = H1; \
+  V2 = H2; \
+  V3 = H3; \
+  V4 = H4; \
+  V5 = H5; \
+  V6 = H6; \
+  V7 = H7; \
+  V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) );  \
+  V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) );  \
+  VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) );  \
+  VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) );  \
+  VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), /* T0 feeds both VC and VD, T1 both VE and VF */ \
+                         m512_const1_64( CB4 ) );  \
+  VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
+                         m512_const1_64( CB5 ) );  \
+  VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
+                         m512_const1_64( CB6 ) );  \
+  VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
+                         m512_const1_64( CB7 ) );  \
+  shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, /* shuffle control; per its name, reverses the bytes of each 64-bit element */ \
+                                0x28292a2b2c2d2e2f, 0x2021222324252627, \
+                                0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+  M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); /* M0..MF: the 16 input vectors after the byte shuffle */ \
+  M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
+  M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
+  M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
+  M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
+  M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
+  M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
+  M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
+  M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
+  M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
+  MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
+  MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
+  MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
+  MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
+  ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
+  MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
+  ROUND_B_8WAY(0); /* 16 rounds in total: schedule indices 0..9, then 0..5 again */ \
+  ROUND_B_8WAY(1); \
+  ROUND_B_8WAY(2); \
+  ROUND_B_8WAY(3); \
+  ROUND_B_8WAY(4); \
+  ROUND_B_8WAY(5); \
+  ROUND_B_8WAY(6); \
+  ROUND_B_8WAY(7); \
+  ROUND_B_8WAY(8); \
+  ROUND_B_8WAY(9); \
+  ROUND_B_8WAY(0); \
+  ROUND_B_8WAY(1); \
+  ROUND_B_8WAY(2); \
+  ROUND_B_8WAY(3); \
+  ROUND_B_8WAY(4); \
+  ROUND_B_8WAY(5); \
+  H0 = mm512_xor4( V8, V0, S0, H0 ); /* feed-forward: each Hi combines Vi, V(i+8), S(i mod 4) and the old Hi */ \
+  H1 = mm512_xor4( V9, V1, S1, H1 ); \
+  H2 = mm512_xor4( VA, V2, S2, H2 ); \
+  H3 = mm512_xor4( VB, V3, S3, H3 ); \
+  H4 = mm512_xor4( VC, V4, S0, H4 ); \
+  H5 = mm512_xor4( VD, V5, S1, H5 ); \
+  H6 = mm512_xor4( VE, V6, S2, H6 ); \
+  H7 = mm512_xor4( VF, V7, S3, H7 ); \
+} while (0)
+
+void blake512_8way_init( blake_8way_big_context *sc )   // Resets *sc: fixed constants into H[0..7], zero into S[0..3], counters and buffer offset to 0.
+{
+   __m512i zero = m512_zero;
+   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );   // the eight values below are hard-coded here rather than read from a table
+   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
+   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
+   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
+   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
+
+   casti_m512i( sc->S, 0 ) = zero;   // S is always zero after init; no way to supply other values through this function
+   casti_m512i( sc->S, 1 ) = zero;
+   casti_m512i( sc->S, 2 ) = zero;
+   casti_m512i( sc->S, 3 ) = zero;
+
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
+}
+
+static void
+blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )   // Appends len bytes (counted per the 128-byte block size below) from data to sc->buf, compressing each time the buffer fills.
+{
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf;
+   size_t ptr;
+   DECL_STATE64_8WAY
+
+   const int buf_size = 128;  //  sizeof/8
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+   if ( len < (buf_size - ptr) )   // fast path: input does not fill the buffer, so just stash it and return
+   {
+   memcpy_512( buf + (ptr>>3), vdata, len>>3 );   // byte offsets/lengths are divided by 8 to index __m512i elements; NOTE(review): a len that is not a multiple of 8 loses its remainder here
+   ptr += len;
+   sc->ptr = ptr;
+   return;
+   }
+
+   READ_STATE64(sc);
+   while ( len > 0 )
+   {
+   size_t clen;
+
+   clen = buf_size - ptr;   // room left in the buffer, capped to the remaining input below
+   if ( clen > len )
+      clen = len;
+   memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
+   ptr += clen;
+   vdata = vdata + (clen>>3);
+   len -= clen;
+   if ( ptr == buf_size )
+        {
+      if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 )   // counter advances by 1024 per full buffer; a result below 1024 means T0 wrapped, so carry into T1
+         T1 = SPH_T64(T1 + 1);
+      COMPRESS64_8WAY;
+      ptr = 0;
+   }
+   }
+   WRITE_STATE64(sc);
+   sc->ptr = ptr;
+}
+
+static void
+blake64_8way_close( blake_8way_big_context *sc, void *dst )   // Pads the buffered data, runs the final one or two blocks through blake64_8way(), then writes sc->H to dst.
+{
+   __m512i buf[16];
+   size_t ptr;
+   unsigned bit_len;
+//   uint64_t z, zz;
+   sph_u64 th, tl;
+
+   ptr = sc->ptr;
+   bit_len = ((unsigned)ptr << 3);
+//   z = 0x80 >> n;
+//   zz = ((ub & -z) | z) & 0xFF;
+//   buf[ptr>>3] = _mm512_set1_epi64( zz );
+   buf[ptr>>3] = m512_const1_64( 0x80 );   // fixed 0x80 marker at the current offset; the commented-out lines above computed it from extra bits instead
+   tl = sc->T0 + bit_len;   // tl/th: total bit count, saved now because sc->T0/T1 are rewritten just below
+   th = sc->T1;
+   if (ptr == 0 )   // the three branches pre-adjust T0/T1 so that the +1024 applied inside blake64_8way() lands on the intended counter value
+   {
+   sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);   // +1024 wraps this to 0 and carries T1 from all-ones to 0
+   sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
+   }
+   else if ( sc->T0 == 0 )
+   {
+   sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;   // +1024 wraps to bit_len and the carry restores T1
+   sc->T1 = SPH_T64(sc->T1 - 1);
+   }
+   else
+   {
+        sc->T0 -= 1024 - bit_len;   // net effect after +1024: T0 grows by bit_len only
+   }
+   if ( ptr <= 104 )   // offsets 104 (marker) and 112/120 (th/tl) still fit in this block
+   {
+       memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
+       buf[104>>3] = _mm512_or_si512( buf[104>>3],
+                                 m512_const1_64( 0x0100000000000000ULL ) );   // OR rather than assign: when ptr == 104 this element already holds the 0x80 marker
+       buf[112>>3] = m512_const1_64( bswap_64( th ) );
+       buf[120>>3] = m512_const1_64( bswap_64( tl ) );
+
+       blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
+   }
+   else
+  {
+       memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );   // no room for the length field: finish this block with zeros, then emit a second block
+
+       blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
+       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);   // force the counter to 0/0 for the second block (after the +1024 inside blake64_8way)
+       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
+       memset_zero_512( buf, 112>>3 );
+       buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
+       buf[112>>3] = m512_const1_64( bswap_64( th ) );
+       buf[120>>3] = m512_const1_64( bswap_64( tl ) );
+
+       blake64_8way( sc, buf, 128 );
+   }
+   mm512_block_bswap_64( (__m512i*)dst, sc->H );   // output step; presumably a byte-swapped copy of H into dst -- helper is defined outside this hunk
+}
+
+void
+blake512_8way_update(void *cc, const void *data, size_t len)   // Public entry point: passes cc, data and len unchanged to the static blake64_8way().
+{
+   blake64_8way(cc, data, len);
+}
+
+void
+blake512_8way_close(void *cc, void *dst)   // Finalizes via blake512_8way_addbits_and_close() with both extra-bit arguments set to 0.
+{
+   blake512_8way_addbits_and_close(cc, 0, 0, dst);   // NOTE(review): callee is defined below this point -- confirm a prototype is visible earlier (e.g. in the header)
+}
+
+void
+blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)   // ub and n are accepted but not used: only cc and dst reach blake64_8way_close().
+{
+   blake64_8way_close(cc, dst);
+}
+
+#endif  // AVX512

 // Blake-512 4 way

@@ -318,29 +582,6 @@ static const sph_u64 CB[16] = {
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
 } while (0)

-#if SPH_COMPACT_BLAKE_64
-// not used
-#define ROUND_B_4WAY(r)   do { \
-	GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
-		CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
-	GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
-		CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
-	GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
-		CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
-	GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
-		CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
-	GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
-		CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
-	GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
-		CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
-	GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
-		CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
-	GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
-		CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
-} while (0)
-
-#else
-//current_impl
 #define ROUND_B_4WAY(r)   do { \
 	GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
 	GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
@@ -352,120 +593,11 @@ static const sph_u64 CB[16] = {
 	GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
 	} while (0)

-#endif
-
-
-// Blake-512 4 way
-
 #define DECL_STATE64_4WAY \
 	__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
        __m256i S0, S1, S2, S3; \
 	sph_u64 T0, T1;

-#define READ_STATE64_4WAY(state)   do { \
-		H0 = (state)->H[0]; \
-		H1 = (state)->H[1]; \
-		H2 = (state)->H[2]; \
-		H3 = (state)->H[3]; \
-		H4 = (state)->H[4]; \
-		H5 = (state)->H[5]; \
-		H6 = (state)->H[6]; \
-		H7 = (state)->H[7]; \
-		S0 = (state)->S[0]; \
-		S1 = (state)->S[1]; \
-		S2 = (state)->S[2]; \
-		S3 = (state)->S[3]; \
-		T0 = (state)->T0; \
-		T1 = (state)->T1; \
-	} while (0)
-
-#define WRITE_STATE64_4WAY(state)   do { \
-		(state)->H[0] = H0; \
-		(state)->H[1] = H1; \
-		(state)->H[2] = H2; \
-		(state)->H[3] = H3; \
-		(state)->H[4] = H4; \
-		(state)->H[5] = H5; \
-		(state)->H[6] = H6; \
-		(state)->H[7] = H7; \
-		(state)->S[0] = S0; \
-		(state)->S[1] = S1; \
-		(state)->S[2] = S2; \
-		(state)->S[3] = S3; \
-		(state)->T0 = T0; \
-		(state)->T1 = T1; \
-	} while (0)
-
-#if SPH_COMPACT_BLAKE_64
-
-// not used
-#define COMPRESS64_4WAY   do { \
-	__m256i M[16]; \
-	__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
-	__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
-   const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, \
-                                                 0x0001020304050607 ) \
-   unsigned r; \
-	V0 = H0; \
-	V1 = H1; \
-	V2 = H2; \
-	V3 = H3; \
-	V4 = H4; \
-	V5 = H5; \
-	V6 = H6; \
-	V7 = H7; \
-   V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) ); \
-   V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) ); \
-   VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) ); \
-   VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) ); \
-   VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
-                          _mm256_set1_epi64x( CB4 ) ); \
-   VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
-                          _mm256_set1_epi64x( CB5 ) ); \
-   VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
-                          _mm256_set1_epi64x( CB6 ) ); \
-   VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
-                          _mm256_set1_epi64x( CB7, CB7, CB7, CB7 ) ); \
-   M[0x0] = _mm256_shuffle_epi8( *(buf+ 0), shuff_bswap64 ); \
-	M[0x1] = _mm256_shuffle_epi8( *(buf+ 1), shuff_bswap64 ); \
-	M[0x2] = _mm256_shuffle_epi8( *(buf+ 2), shuff_bswap64 ); \
-	M[0x3] = _mm256_shuffle_epi8( *(buf+ 3), shuff_bswap64 ); \
-	M[0x4] = _mm256_shuffle_epi8( *(buf+ 4), shuff_bswap64 ); \
-	M[0x5] = _mm256_shuffle_epi8( *(buf+ 5), shuff_bswap64 ); \
-	M[0x6] = _mm256_shuffle_epi8( *(buf+ 6), shuff_bswap64 ); \
-	M[0x7] = _mm256_shuffle_epi8( *(buf+ 7), shuff_bswap64 ); \
-	M[0x8] = _mm256_shuffle_epi8( *(buf+ 8), shuff_bswap64 ); \
-	M[0x9] = _mm256_shuffle_epi8( *(buf+ 9), shuff_bswap64 ); \
-	M[0xA] = _mm256_shuffle_epi8( *(buf+10), shuff_bswap64 ); \
-	M[0xB] = _mm256_shuffle_epi8( *(buf+11), shuff_bswap64 ); \
-	M[0xC] = _mm256_shuffle_epi8( *(buf+12), shuff_bswap64 ); \
-	M[0xD] = _mm256_shuffle_epi8( *(buf+13), shuff_bswap64 ); \
-	M[0xE] = _mm256_shuffle_epi8( *(buf+14), shuff_bswap64 ); \
-	M[0xF] = _mm256_shuffle_epi8( *(buf+15), shuff_bswap64 ); \
-	for (r = 0; r < 16; r ++) \
-		ROUND_B_4WAY(r); \
-   H0 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
-   H1 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
-   H2 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S2, V2 ), VA ), H2 ); \
-   H3 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S3, V3 ), VB ), H3 ); \
-   H4 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S0, V4 ), VC ), H4 ); \
-   H5 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S1, V5 ), VD ), H5 ); \
-   H6 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S2, V6 ), VE ), H6 ); \
-   H7 = _mm256_xor_si256( _mm256_xor_si256( \
-                    _mm256_xor_si256( S3, V7 ), VF ), H7 ); \
-} while (0)
-
-#else
-
-//current impl
-
 #define COMPRESS64_4WAY   do \
 { \
  __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -493,7 +625,8 @@ static const sph_u64 CB[16] = {
                         m256_const1_64( CB6 ) );  \
  VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
                         m256_const1_64( CB7 ) );  \
-  shuf_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+  shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
  M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
  M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
  M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -536,13 +669,8 @@ static const sph_u64 CB[16] = {
  H7 = mm256_xor4( VF, V7, S3, H7 ); \
 } while (0)

-#endif

-static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
-
-static void
-blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
-              const sph_u64 *salt )
+void blake512_4way_init( blake_4way_big_context *sc )
 {
   __m256i zero = m256_zero;
   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
@@ -553,12 +681,10 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
-
   casti_m256i( sc->S, 0 ) = zero;
   casti_m256i( sc->S, 1 ) = zero;
   casti_m256i( sc->S, 2 ) = zero;
   casti_m256i( sc->S, 3 ) = zero;
-
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
 }
@@ -583,7 +709,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
 	return;
   }

-   READ_STATE64_4WAY(sc);
+   READ_STATE64(sc);
   while ( len > 0 )
   {
 	size_t clen;
@@ -603,25 +729,21 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
 		ptr = 0;
 	}
   }
-   WRITE_STATE64_4WAY(sc);
+   WRITE_STATE64(sc);
   sc->ptr = ptr;
 }

 static void
-blake64_4way_close( blake_4way_big_context *sc,
-	unsigned ub, unsigned n, void *dst, size_t out_size_w64)
+blake64_4way_close( blake_4way_big_context *sc, void *dst )
 {
   __m256i buf[16];
   size_t ptr;
   unsigned bit_len;
-   uint64_t z, zz;
   sph_u64 th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   z = 0x80 >> n;
-   zz = ((ub & -z) | z) & 0xFF;
-   buf[ptr>>3] = _mm256_set1_epi64x( zz );
+   buf[ptr>>3] = m256_const1_64( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;
   if (ptr == 0 )
@@ -638,43 +760,44 @@ blake64_4way_close( blake_4way_big_context *sc,
   {
        sc->T0 -= 1024 - bit_len;
   }
+
   if ( ptr <= 104 )
   {
       memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
-       if ( out_size_w64 == 8 )
-          buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
+       buf[104>>3] = _mm256_or_si256( buf[104>>3],
                                 m256_const1_64( 0x0100000000000000ULL ) );
-       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
-       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
+       buf[112>>3] = m256_const1_64( bswap_64( th ) );
+       buf[120>>3] = m256_const1_64( bswap_64( tl ) );

       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
   }
   else
-  {
+   {
       memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
       memset_zero_256( buf, 112>>3 ); 
-       if ( out_size_w64 == 8 )
-           buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
-       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
-       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
+       buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
+       buf[112>>3] = m256_const1_64( bswap_64( th ) );
+       buf[120>>3] = m256_const1_64( bswap_64( tl ) );

       blake64_4way( sc, buf, 128 );
   }
   mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }

+/*
 void
 blake512_4way_init(void *cc)
 {
 	blake64_4way_init(cc, IV512, salt_zero_big);
 }
+*/

 void
-blake512_4way(void *cc, const void *data, size_t len)
+blake512_4way_update(void *cc, const void *data, size_t len)
 {
 	blake64_4way(cc, data, len);
 }
@@ -682,15 +805,18 @@ blake512_4way(void *cc, const void *data, size_t len)
 void
 blake512_4way_close(void *cc, void *dst)
 {
-	blake512_4way_addbits_and_close(cc, 0, 0, dst);
+   blake64_4way_close( cc, dst );
+
+//   blake512_4way_addbits_and_close(cc, dst);
 }

+/*
 void
 blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
 	blake64_4way_close(cc, ub, n, dst, 8);
 }
-
+*/
 #ifdef __cplusplus
 }
 #endif
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -64,7 +64,8 @@ typedef bmw_4way_small_context bmw256_4way_context;

 void bmw256_4way_init( bmw256_4way_context *ctx );

-void bmw256_4way(void *cc, const void *data, size_t len);
+void bmw256_4way_update(void *cc, const void *data, size_t len);
+#define bmw256_4way bmw256_4way_update

 void bmw256_4way_close(void *cc, void *dst);

@@ -87,11 +88,33 @@ typedef struct {
 typedef bmw_8way_small_context bmw256_8way_context;

 void bmw256_8way_init( bmw256_8way_context *ctx );
-void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len );
+void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
+                         size_t len );
+#define bmw256_8way bmw256_8way_update
 void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );

 #endif

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// BMW-256 16 way 32: context type and API, compiled only when all four AVX-512 feature macros above are defined
+
+typedef struct {
+   __m512i buf[16];     // 16 vectors; presumably the pending input block -- confirm against the .c file
+   __m512i H[16];       // 16 vectors; presumably the chaining state -- confirm against the .c file
+   size_t ptr;
+   uint32_t bit_count;  // assume bit_count fits in 32 bits
+} bmw_16way_small_context __attribute__ ((aligned (128)));
+
+typedef bmw_16way_small_context bmw256_16way_context;   // public name for the same struct
+
+void bmw256_16way_init( bmw256_16way_context *ctx );
+void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
+                          size_t len );   // unlike the 4-way and 8-way variants above, no un-suffixed alias macro is defined for this one
+void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
+
+#endif   // AVX-512 feature set
+

 #if defined(__SSE2__)

@@ -107,7 +130,8 @@ typedef struct {
 typedef bmw_2way_big_context bmw512_2way_context;

 void bmw512_2way_init( bmw512_2way_context *ctx );
-void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len );
+void bmw512_2way_update( bmw512_2way_context *ctx, const void *data,
+                         size_t len );
 void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );

 #endif // __SSE2__
@@ -128,7 +152,8 @@ typedef bmw_4way_big_context bmw512_4way_context;

 void bmw512_4way_init(void *cc);

-void bmw512_4way(void *cc, const void *data, size_t len);
+void bmw512_4way_update(void *cc, const void *data, size_t len);
+#define bmw512_4way bmw512_4way_update

 void bmw512_4way_close(void *cc, void *dst);

--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -564,7 +564,7 @@ bmw256_4way_init(void *cc)
 */

 void
-bmw256_4way(void *cc, const void *data, size_t len)
+bmw256_4way_update(void *cc, const void *data, size_t len)
 {
 	bmw32_4way(cc, data, len);
 }
@@ -874,6 +874,57 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
                 mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
                 mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

+#define DH1L( m, sl, sr, a, b, c ) /* ( M[m] ^ (xh << sl) ^ (qt[a] >> sr) ) + ( xl ^ qt[b] ^ qt[c] ), per 32-bit element */ \
+   _mm256_add_epi32( \
+               _mm256_xor_si256( M[m], \
+                  _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
+                                    _mm256_srli_epi32( qt[a], sr ) ) ), \
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+
+#define DH1R( m, sr, sl, a, b, c ) /* ( M[m] ^ (xh >> sr) ^ (qt[a] << sl) ) + ( xl ^ qt[b] ^ qt[c] ); same argument positions as before, names now match the shift direction */ \
+   _mm256_add_epi32( \
+               _mm256_xor_si256( M[m], \
+                  _mm256_xor_si256( _mm256_srli_epi32( xh, sr ), \
+                                    _mm256_slli_epi32( qt[a], sl ) ) ), \
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+
+#define DH2L( m, rl, sl, h, a, b, c ) /* rol(dH[h], rl) + ( xh ^ qt[a] ^ M[m] ) + ( (xl << sl) ^ qt[b] ^ qt[c] ); a plain expression, no trailing ';' */ \
+   _mm256_add_epi32( _mm256_add_epi32( \
+       mm256_rol_32( dH[h], rl ), \
+          _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
+                 _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
+                                   _mm256_xor_si256( qt[b], qt[c] ) ) )
+
+#define DH2R( m, rl, sr, h, a, b, c ) /* rol(dH[h], rl) + ( xh ^ qt[a] ^ M[m] ) + ( (xl >> sr) ^ qt[b] ^ qt[c] ); a plain expression, no trailing ';' */ \
+   _mm256_add_epi32( _mm256_add_epi32( \
+       mm256_rol_32( dH[h], rl ), \
+          _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
+                 _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
+                                   _mm256_xor_si256( qt[b], qt[c] ) ) )
+
+   dH[ 0] = DH1L(  0,  5,  5, 16, 24, 0 );   // dH[0..7] depend only on M, xh, xl and qt
+   dH[ 1] = DH1R(  1,  7,  8, 17, 25, 1 );
+   dH[ 2] = DH1R(  2,  5,  5, 18, 26, 2 );
+   dH[ 3] = DH1R(  3,  1,  5, 19, 27, 3 );
+   dH[ 4] = DH1R(  4,  3,  0, 20, 28, 4 );   // left shift of 0: qt[20] is used unshifted
+   dH[ 5] = DH1L(  5,  6,  6, 21, 29, 5 );
+   dH[ 6] = DH1R(  6,  4,  6, 22, 30, 6 );
+   dH[ 7] = DH1R(  7, 11,  2, 23, 31, 7 );
+   dH[ 8] = DH2L(  8,  9,  8,  4, 24, 23,  8 );   // dH[8..15] each read one of dH[0..7], so the order of these statements matters
+   dH[ 9] = DH2R(  9, 10,  6,  5, 25, 16,  9 );
+   dH[10] = DH2L( 10, 11,  6,  6, 26, 17, 10 );
+   dH[11] = DH2L( 11, 12,  4,  7, 27, 18, 11 );
+   dH[12] = DH2R( 12, 13,  3,  0, 28, 19, 12 );
+   dH[13] = DH2R( 13, 14,  4,  1, 29, 20, 13 );
+   dH[14] = DH2R( 14, 15,  7,  2, 30, 21, 14 );
+   dH[15] = DH2R( 15, 16,  2,  3, 31, 22, 15 );
+
+#undef DH1L
+#undef DH1R
+#undef DH2L
+#undef DH2R
+
+/*   
   dH[ 0] = _mm256_add_epi32(
                 _mm256_xor_si256( M[0],
                      _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
@@ -954,6 +1005,7 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
                 _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
+*/
 }

 static const __m256i final_s8[16] =
@@ -1014,7 +1066,8 @@ void bmw256_8way_init( bmw256_8way_context *ctx )
   ctx->bit_count = 0;
 }

-void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
+void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
+                         size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   __m256i *buf;
@@ -1092,6 +1145,513 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )

 #endif // __AVX2__

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// BMW-256 16 way 32: every __m512i carries one 32 bit word of 16 lanes.
+// s16s0-3 XOR two shifts and two rotations of x, s16s4-5 XOR x with a right
+// shift of itself, r16s1-7 rotate x left by a fixed count.
+#define s16s0(x) \
+   mm512_xor4( _mm512_srli_epi32( (x), 1), \
+                _mm512_slli_epi32( (x), 3), \
+                mm512_rol_32( (x),  4), \
+                mm512_rol_32( (x), 19) )
+
+#define s16s1(x) \
+   mm512_xor4( _mm512_srli_epi32( (x), 1), \
+                _mm512_slli_epi32( (x), 2), \
+                mm512_rol_32( (x), 8), \
+                mm512_rol_32( (x), 23) )
+
+#define s16s2(x) \
+   mm512_xor4( _mm512_srli_epi32( (x), 2), \
+               _mm512_slli_epi32( (x), 1), \
+               mm512_rol_32( (x), 12), \
+               mm512_rol_32( (x), 25) )
+
+#define s16s3(x) \
+   mm512_xor4( _mm512_srli_epi32( (x), 2), \
+               _mm512_slli_epi32( (x), 2), \
+               mm512_rol_32( (x), 15), \
+               mm512_rol_32( (x), 29) )
+
+#define s16s4(x) \
+  _mm512_xor_si512( (x), _mm512_srli_epi32( (x), 1 ) )
+
+#define s16s5(x) \
+  _mm512_xor_si512( (x), _mm512_srli_epi32( (x), 2 ) )
+
+#define r16s1(x)    mm512_rol_32( x,  3 ) 
+#define r16s2(x)    mm512_rol_32( x,  7 ) 
+#define r16s3(x)    mm512_rol_32( x, 13 ) 
+#define r16s4(x)    mm512_rol_32( x, 16 ) 
+#define r16s5(x)    mm512_rol_32( x, 19 ) 
+#define r16s6(x)    mm512_rol_32( x, 23 ) 
+#define r16s7(x)    mm512_rol_32( x, 27 ) 
+// Rotate message word M[ (j+off) mod 16 ] left by that index plus one.
+#define mm512_rol_off_32( M, j, off ) \
+   mm512_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
+                  ( ( (j) + (off) ) & 0xF ) + 1 )
+// Sum rotated M[j], M[j+3], -M[j+10] and (j+16)*0x05555555; XOR H[(j+7)&15].
+#define add_elt_s16( M, H, j ) \
+   _mm512_xor_si512( \
+      _mm512_add_epi32( \
+            _mm512_sub_epi32( _mm512_add_epi32( mm512_rol_off_32( M, j, 0 ), \
+                                                mm512_rol_off_32( M, j, 3 ) ), \
+                             mm512_rol_off_32( M, j, 10 ) ), \
+            _mm512_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \
+       H[ ( (j)+7 ) & 0xF ] )
+// add_elt_s16(i-16) plus s16s1,2,3,0 applied in turn to qt[i-16] .. qt[i-1].
+#define expand1s16( qt, M, H, i ) \
+   _mm512_add_epi32( add_elt_s16( M, H, (i)-16 ), \
+                     mm512_add4_32( mm512_add4_32( s16s1( qt[ (i)-16 ] ), \
+                                                   s16s2( qt[ (i)-15 ] ), \
+                                                   s16s3( qt[ (i)-14 ] ), \
+                                                   s16s0( qt[ (i)-13 ] ) ), \
+                                    mm512_add4_32( s16s1( qt[ (i)-12 ] ), \
+                                                   s16s2( qt[ (i)-11 ] ), \
+                                                   s16s3( qt[ (i)-10 ] ), \
+                                                   s16s0( qt[ (i)- 9 ] ) ), \
+                                    mm512_add4_32( s16s1( qt[ (i)- 8 ] ), \
+                                                   s16s2( qt[ (i)- 7 ] ), \
+                                                   s16s3( qt[ (i)- 6 ] ), \
+                                                   s16s0( qt[ (i)- 5 ] ) ), \
+                                    mm512_add4_32( s16s1( qt[ (i)- 4 ] ), \
+                                                   s16s2( qt[ (i)- 3 ] ), \
+                                                   s16s3( qt[ (i)- 2 ] ), \
+                                                   s16s0( qt[ (i)- 1 ] ) ) ) )
+// add_elt_s16(i-16) + qt[i-16..i-3], alternate ones rotated, + s16s4, s16s5.
+#define expand2s16( qt, M, H, i) \
+   _mm512_add_epi32( add_elt_s16( M, H, (i)-16 ), \
+      mm512_add4_32( mm512_add4_32( qt[ (i)-16 ], \
+                                    r16s1( qt[ (i)-15 ] ), \
+                                    qt[ (i)-14 ], \
+                                    r16s2( qt[ (i)-13 ] ) ), \
+                     mm512_add4_32( qt[ (i)-12 ], \
+                                    r16s3( qt[ (i)-11 ] ), \
+                                    qt[ (i)-10 ], \
+                                    r16s4( qt[ (i)- 9 ] ) ), \
+                     mm512_add4_32( qt[ (i)- 8 ], \
+                                    r16s5( qt[ (i)- 7 ] ), \
+                                    qt[ (i)- 6 ], \
+                                    r16s6( qt[ (i)- 5 ] ) ), \
+                     mm512_add4_32( qt[ (i)- 4 ], \
+                                    r16s7( qt[ (i)- 3 ] ), \
+                                    s16s4( qt[ (i)- 2 ] ), \
+                                    s16s5( qt[ (i)- 1 ] ) ) ) )
+// W16s0 .. W16s15: each is a fixed signed sum of five M[k] ^ H[k] terms.
+// They take no arguments and expect arrays M and H to be in scope.
+#define W16s0 \
+   _mm512_add_epi32( \
+      _mm512_add_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 5], H[ 5] ), \
+                           _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+         _mm512_xor_si512( M[10], H[10] ) ), \
+      _mm512_add_epi32( _mm512_xor_si512( M[13], H[13] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W16s1 \
+   _mm512_add_epi32( \
+       _mm512_add_epi32( \
+          _mm512_sub_epi32( _mm512_xor_si512( M[ 6], H[ 6] ), \
+                            _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+          _mm512_xor_si512( M[11], H[11] ) ), \
+       _mm512_sub_epi32( _mm512_xor_si512( M[14], H[14] ), \
+                         _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W16s2 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_add_epi32( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+         _mm512_xor_si512( M[ 9], H[ 9] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W16s3 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 1], H[ 1] ) ), \
+         _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[10], H[10] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+#define W16s4 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_add_epi32( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+         _mm512_xor_si512( M[ 9], H[ 9] ) ), \
+      _mm512_add_epi32( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W16s5 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 3], H[ 3] ), \
+                           _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+         _mm512_xor_si512( M[10], H[10] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W16s6 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 4], H[ 4] ), \
+                           _mm512_xor_si512( M[ 0], H[ 0] ) ), \
+         _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+#define W16s7 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+         _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+      _mm512_add_epi32( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W16s8 \
+   _mm512_add_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 2], H[ 2] ), \
+                           _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[13], H[13] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W16s9 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[ 7], H[ 7] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W16s10 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 8], H[ 8] ), \
+                           _mm512_xor_si512( M[ 1], H[ 1] ) ), \
+         _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[ 7], H[ 7] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W16s11 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 8], H[ 8] ), \
+                           _mm512_xor_si512( M[ 0], H[ 0] ) ), \
+         _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[ 5], H[ 5] ), \
+                        _mm512_xor_si512( M[ 9], H[ 9] ) ) )
+
+#define W16s12 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_add_epi32( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[ 9], H[ 9] ), \
+                        _mm512_xor_si512( M[10], H[10] ) ) )
+
+#define W16s13 \
+   _mm512_add_epi32( \
+      _mm512_add_epi32( \
+         _mm512_add_epi32( _mm512_xor_si512( M[ 2], H[ 2] ), \
+                           _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+         _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+      _mm512_add_epi32( _mm512_xor_si512( M[10], H[10] ), \
+                        _mm512_xor_si512( M[11], H[11] ) ) )
+
+#define W16s14 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 3], H[ 3] ), \
+                           _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+         _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+      _mm512_add_epi32( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[12], H[12] ) ) )
+
+#define W16s15 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[12], H[12] ), \
+                           _mm512_xor_si512( M[ 4], H[4] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[ 9], H[ 9] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+// BMW-256 compression of message block M under chaining value H, performed
+// on 16 lanes at once; the resulting chaining value is stored in dH.
+void compress_small_16way( const __m512i *M, const __m512i H[16],
+                     __m512i dH[16] )
+{
+   __m512i qt[32], xl, xh;
+   // qt[i] = s16s(i mod 5)( W16s(i) ) + H[ (i+1) mod 16 ]
+   qt[ 0] = _mm512_add_epi32( s16s0( W16s0 ), H[ 1] );
+   qt[ 1] = _mm512_add_epi32( s16s1( W16s1 ), H[ 2] );
+   qt[ 2] = _mm512_add_epi32( s16s2( W16s2 ), H[ 3] );
+   qt[ 3] = _mm512_add_epi32( s16s3( W16s3 ), H[ 4] );
+   qt[ 4] = _mm512_add_epi32( s16s4( W16s4 ), H[ 5] );
+   qt[ 5] = _mm512_add_epi32( s16s0( W16s5 ), H[ 6] );
+   qt[ 6] = _mm512_add_epi32( s16s1( W16s6 ), H[ 7] );
+   qt[ 7] = _mm512_add_epi32( s16s2( W16s7 ), H[ 8] );
+   qt[ 8] = _mm512_add_epi32( s16s3( W16s8 ), H[ 9] );
+   qt[ 9] = _mm512_add_epi32( s16s4( W16s9 ), H[10] );
+   qt[10] = _mm512_add_epi32( s16s0( W16s10), H[11] );
+   qt[11] = _mm512_add_epi32( s16s1( W16s11), H[12] );
+   qt[12] = _mm512_add_epi32( s16s2( W16s12), H[13] );
+   qt[13] = _mm512_add_epi32( s16s3( W16s13), H[14] );
+   qt[14] = _mm512_add_epi32( s16s4( W16s14), H[15] );
+   qt[15] = _mm512_add_epi32( s16s0( W16s15), H[ 0] );
+   // Expansion: two expand1 rounds, then fourteen expand2 rounds.
+   qt[16] = expand1s16( qt, M, H, 16 );
+   qt[17] = expand1s16( qt, M, H, 17 );
+   qt[18] = expand2s16( qt, M, H, 18 );
+   qt[19] = expand2s16( qt, M, H, 19 );
+   qt[20] = expand2s16( qt, M, H, 20 );
+   qt[21] = expand2s16( qt, M, H, 21 );
+   qt[22] = expand2s16( qt, M, H, 22 );
+   qt[23] = expand2s16( qt, M, H, 23 );
+   qt[24] = expand2s16( qt, M, H, 24 );
+   qt[25] = expand2s16( qt, M, H, 25 );
+   qt[26] = expand2s16( qt, M, H, 26 );
+   qt[27] = expand2s16( qt, M, H, 27 );
+   qt[28] = expand2s16( qt, M, H, 28 );
+   qt[29] = expand2s16( qt, M, H, 29 );
+   qt[30] = expand2s16( qt, M, H, 30 );
+   qt[31] = expand2s16( qt, M, H, 31 );
+   // xl folds qt[16..23] by XOR, xh additionally folds qt[24..31].
+   xl = _mm512_xor_si512(
+              mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
+              mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
+   xh = _mm512_xor_si512( xl,  _mm512_xor_si512(
+                 mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
+                 mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+   // Output words. DH1x: shifted xh and qt[a]; DH2x: rotated dH[h], shifted xl.
+#define DH1L( m, sl, sr, a, b, c ) \
+   _mm512_add_epi32( \
+               _mm512_xor_si512( M[m], \
+                  _mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
+                                    _mm512_srli_epi32( qt[a], sr ) ) ), \
+               _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+#define DH1R( m, sr, sl, a, b, c ) \
+   _mm512_add_epi32( \
+               _mm512_xor_si512( M[m], \
+                  _mm512_xor_si512( _mm512_srli_epi32( xh, sr ), \
+                                    _mm512_slli_epi32( qt[a], sl ) ) ), \
+               _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+
+#define DH2L( m, rl, sl, h, a, b, c ) \
+   _mm512_add_epi32( _mm512_add_epi32( \
+       mm512_rol_32( dH[h], rl ), \
+          _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
+                 _mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
+                                   _mm512_xor_si512( qt[b], qt[c] ) ) )
+#define DH2R( m, rl, sr, h, a, b, c ) \
+   _mm512_add_epi32( _mm512_add_epi32( \
+       mm512_rol_32( dH[h], rl ), \
+          _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
+                 _mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
+                                   _mm512_xor_si512( qt[b], qt[c] ) ) )
+
+   dH[ 0] = DH1L(  0,  5,  5, 16, 24, 0 );
+   dH[ 1] = DH1R(  1,  7,  8, 17, 25, 1 );
+   dH[ 2] = DH1R(  2,  5,  5, 18, 26, 2 );
+   dH[ 3] = DH1R(  3,  1,  5, 19, 27, 3 );
+   dH[ 4] = DH1R(  4,  3,  0, 20, 28, 4 );
+   dH[ 5] = DH1L(  5,  6,  6, 21, 29, 5 );
+   dH[ 6] = DH1R(  6,  4,  6, 22, 30, 6 );
+   dH[ 7] = DH1R(  7, 11,  2, 23, 31, 7 );
+   dH[ 8] = DH2L(  8,  9,  8,  4, 24, 23,  8 );
+   dH[ 9] = DH2R(  9, 10,  6,  5, 25, 16,  9 );
+   dH[10] = DH2L( 10, 11,  6,  6, 26, 17, 10 );
+   dH[11] = DH2L( 11, 12,  4,  7, 27, 18, 11 );
+   dH[12] = DH2R( 12, 13,  3,  0, 28, 19, 12 );
+   dH[13] = DH2R( 13, 14,  4,  1, 29, 20, 13 );
+   dH[14] = DH2R( 14, 15,  7,  2, 30, 21, 14 );
+   dH[15] = DH2R( 15, 16,  2,  3, 31, 22, 15 );
+
+#undef DH1L
+#undef DH1R
+#undef DH2L
+#undef DH2R
+}
+// Final compression state: row i holds 0xaaaaaaa0 + i in every 32 bit word.
+static const __m512i final_s16[16] =
+{
+    { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0,
+      0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0,
+      0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0,
+      0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
+    { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1,
+      0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1,
+      0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1,
+      0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
+    { 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2,
+      0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2,
+      0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2,
+      0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
+    { 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3,
+      0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3,
+      0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3,
+      0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
+    { 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4,
+      0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4,
+      0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4,
+      0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
+    { 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5,
+      0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5,
+      0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5,
+      0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
+    { 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6,
+      0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6,
+      0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6,
+      0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
+    { 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7,
+      0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7,
+      0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7,
+      0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
+    { 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8,
+      0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8,
+      0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8,
+      0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
+    { 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9,
+      0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9,
+      0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9,
+      0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
+    { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+      0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+      0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+      0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
+    { 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab,
+      0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab,
+      0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab,
+      0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
+    { 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac,
+      0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac,
+      0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac,
+      0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
+    { 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad,
+      0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad,
+      0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad,
+      0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
+    { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae,
+      0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae,
+      0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae,
+      0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
+    { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf,
+      0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf,
+      0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf,
+      0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
+};
+
+// Store the starting state in ctx->H; zero the buffer offset and bit count.
+void bmw256_16way_init( bmw256_16way_context *ctx )
+{
+   ctx->H[ 0] = m512_const1_64( 0x4041424340414243 );
+   ctx->H[ 1] = m512_const1_64( 0x4445464744454647 );
+   ctx->H[ 2] = m512_const1_64( 0x48494A4B48494A4B );
+   ctx->H[ 3] = m512_const1_64( 0x4C4D4E4F4C4D4E4F );
+   ctx->H[ 4] = m512_const1_64( 0x5051525350515253 );
+   ctx->H[ 5] = m512_const1_64( 0x5455565754555657 );
+   ctx->H[ 6] = m512_const1_64( 0x58595A5B58595A5B );
+   ctx->H[ 7] = m512_const1_64( 0x5C5D5E5F5C5D5E5F );
+   ctx->H[ 8] = m512_const1_64( 0x6061626360616263 );
+   ctx->H[ 9] = m512_const1_64( 0x6465666764656667 );
+   ctx->H[10] = m512_const1_64( 0x68696A6B68696A6B );
+   ctx->H[11] = m512_const1_64( 0x6C6D6E6F6C6D6E6F );
+   ctx->H[12] = m512_const1_64( 0x7071727370717273 );
+   ctx->H[13] = m512_const1_64( 0x7475767774757677 );
+   ctx->H[14] = m512_const1_64( 0x78797A7B78797A7B );
+   ctx->H[15] = m512_const1_64( 0x7C7D7E7F7C7D7E7F );
+   ctx->ptr       = 0;
+   ctx->bit_count = 0;
+}
+
+// Absorb len bytes per lane from 16-way interleaved data, compressing every
+// completed 64 byte block. len and the buffer offset count bytes of a single
+// lane; one __m512i covers 4 of those bytes, hence the >>2 on every index.
+void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
+                          size_t len )
+{
+   const size_t block_bytes = 64;          // one lane's share of a block
+   __m512i *src = (__m512i*)data;
+   __m512i *block = ctx->buf;
+   __m512i scratch[16];
+   __m512i *state = ctx->H;                // most recent chaining value
+   __m512i *spare = scratch;               // target of the next compression
+   size_t fill = ctx->ptr;
+
+   ctx->bit_count += len << 3;
+
+   while ( len > 0 )
+   {
+      // Take whatever fits in the remainder of the block buffer.
+      size_t take = block_bytes - fill;
+      if ( take > len )
+         take = len;
+      memcpy_512( block + (fill>>2), src, take >> 2 );
+      src += take >> 2;
+      len -= take;
+      fill += take;
+      if ( fill != block_bytes )
+         continue;
+      // Block complete: compress, then swap the two state buffers.
+      compress_small_16way( block, state, spare );
+      __m512i *swap = state;
+      state = spare;
+      spare = swap;
+      fill = 0;
+   }
+   ctx->ptr = fill;
+
+   // An odd number of compressions leaves the result in scratch.
+   if ( state != ctx->H )
+      memcpy_512( ctx->H, state, 16 );
+}
+
+// Finish the hash: pad the buffered tail, append the bit count, compress,
+// run the final compression and store h1[8..15] to dst via casti_m512i.
+void bmw256_16way_close( bmw256_16way_context *ctx, void *dst )
+{
+   __m512i *buf = ctx->buf;
+   __m512i h1[16], h2[16], *h = ctx->H;
+   size_t ptr = ctx->ptr, u, v;
+   const int buf_size = 64;  // bytes of one lane, compatible with len
+
+   // First pad word, 0x00000080 per 32 bit lane if m512_const1_64 broadcasts.
+   buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
+   ptr += 4;
+   // The bit count needs the last 8 bytes of a block. If the pad word
+   // reached into them, close this block and put the count in a fresh one.
+   if ( ptr > (buf_size - 8) )
+   {
+      memset_zero_512( buf + (ptr>>2), (buf_size - ptr) >> 2 );
+      compress_small_16way( buf, h, h1 );
+      ptr = 0;
+      h = h1;
+   }
+   memset_zero_512( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
+   buf[ (buf_size - 8) >> 2 ] = _mm512_set1_epi32( ctx->bit_count );
+   buf[ (buf_size - 4) >> 2 ] = m512_zero;
+   compress_small_16way( buf, h, h2 );
+
+   // Final round: the chaining value becomes the message, final_s16 the state.
+   for ( u = 0; u < 16; u ++ )
+      buf[u] = h2[u];
+   compress_small_16way( buf, final_s16, h1 );
+   for (u = 0, v = 16 - 8; u < 8; u ++, v ++)
+      casti_m512i(dst,u) = h1[v];
+}
+
+
+#endif // AVX512
+
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -18,16 +18,17 @@ void bmw512hash_8way(void *state, const void *input)
 int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
-   uint32_t hash[16*8] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash[16*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[49]);   // 3*16+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-//   const uint32_t Htarg = ptarget[7];
+   const uint32_t Htarg = ptarget[7];
   int thr_id = mythr->id;

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
@@ -39,7 +40,8 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
      bmw512hash_8way( hash, vdata );

      for ( int lane = 0; lane < 8; lane++ )
-      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
+      if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
+//      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
      {
          extr_lane_8x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
@@ -48,15 +50,14 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
              submit_lane_solution( work, lane_hash, mythr, lane );
          }
      }
-      n += 4;
+      n += 8;

-   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart) );

-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }
   
-
 #elif defined(BMW512_4WAY)

 //#ifdef BMW512_4WAY
@@ -72,16 +73,17 @@ void bmw512hash_4way(void *state, const void *input)
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-   uint32_t hash[16*4] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (128)));
+   uint32_t hash[16*4] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[25]);   // 3*8+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce -  4;
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-//   const uint32_t Htarg = ptarget[7];
+   const uint32_t Htarg = ptarget[7];
    int thr_id = mythr->id;  // thr_id arg is deprecated

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
@@ -92,7 +94,8 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
      bmw512hash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
+      if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
+//      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
@@ -103,9 +106,9 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
      }
      n += 4;

-   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -58,8 +58,7 @@ static const sph_u64 IV512[] = {

 #if defined(__SSE2__)

-// BMW-512 2 way 64
-
+// BMW-512 2 way 64 

 #define s2b0(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \
@@ -561,13 +560,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )

 #endif  // __SSE2__

-
-
 #if defined(__AVX2__)

 // BMW-512 4 way 64

-
 #define sb0(x) \
   mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 3), \
                mm256_rol_64(     (x), 4),  mm256_rol_64(     (x),37) )
@@ -827,87 +823,57 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
           mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
           mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

-   dH[ 0] = _mm256_add_epi64(
-               _mm256_xor_si256( M[0],
-                  _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
-                                    _mm256_srli_epi64( qt[16], 5 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ) );
-   dH[ 1] = _mm256_add_epi64(
-               _mm256_xor_si256( M[1],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
-                                    _mm256_slli_epi64( qt[17], 8 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ) );
-   dH[ 2] = _mm256_add_epi64(
-               _mm256_xor_si256( M[2],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
-                                    _mm256_slli_epi64( qt[18], 5 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ) );
-   dH[ 3] = _mm256_add_epi64(
-               _mm256_xor_si256( M[3],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
-                                    _mm256_slli_epi64( qt[19], 5 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ) );
-   dH[ 4] = _mm256_add_epi64(
-               _mm256_xor_si256( M[4],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
-                                    _mm256_slli_epi64( qt[20], 0 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ) );
-   dH[ 5] = _mm256_add_epi64(
-               _mm256_xor_si256( M[5],
-                  _mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
-                                    _mm256_srli_epi64( qt[21], 6 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ) );
-   dH[ 6] = _mm256_add_epi64(
-               _mm256_xor_si256( M[6],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
-                                    _mm256_slli_epi64( qt[22], 6 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ) );
-   dH[ 7] = _mm256_add_epi64(
-               _mm256_xor_si256( M[7],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
-                                    _mm256_slli_epi64( qt[23], 2 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ) );
-   dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[4], 9 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
-                 _mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
-                                   _mm256_xor_si256( qt[23], qt[ 8] ) ) );
-   dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[5], 10 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
-                                   _mm256_xor_si256( qt[16], qt[ 9] ) ) );
-   dH[10] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[6], 11 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
-                 _mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
-                                   _mm256_xor_si256( qt[17], qt[10] ) ) );
-   dH[11] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[7], 12 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
-                 _mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
-                                   _mm256_xor_si256( qt[18], qt[11] ) ) );
-   dH[12] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[0], 13 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
-                                   _mm256_xor_si256( qt[19], qt[12] ) ) );
-   dH[13] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[1], 14 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
-                                   _mm256_xor_si256( qt[20], qt[13] ) ) );
-   dH[14] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[2], 15 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
-                                   _mm256_xor_si256( qt[21], qt[14] ) ) );
-   dH[15] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[3], 16 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
-                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
-} 
+
+#define DH1L( m, sl, sr, a, b, c ) \
+   _mm256_add_epi64( \
+               _mm256_xor_si256( M[m], \
+                  _mm256_xor_si256( _mm256_slli_epi64( xh, sl ), \
+                                    _mm256_srli_epi64( qt[a], sr ) ) ), \
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+
+#define DH1R( m, sl, sr, a, b, c ) \
+   _mm256_add_epi64( \
+               _mm256_xor_si256( M[m], \
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, sl ), \
+                                    _mm256_slli_epi64( qt[a], sr ) ) ), \
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+
+#define DH2L( m, rl, sl, h, a, b, c ) \
+   _mm256_add_epi64( _mm256_add_epi64( \
+       mm256_rol_64( dH[h], rl ), \
+          _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
+                 _mm256_xor_si256( _mm256_slli_epi64( xl, sl ), \
+                                   _mm256_xor_si256( qt[b], qt[c] ) ) );
+
+#define DH2R( m, rl, sr, h, a, b, c ) \
+   _mm256_add_epi64( _mm256_add_epi64( \
+       mm256_rol_64( dH[h], rl ), \
+          _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
+                 _mm256_xor_si256( _mm256_srli_epi64( xl, sr ), \
+                                   _mm256_xor_si256( qt[b], qt[c] ) ) );
+
+   dH[ 0] = DH1L(  0,  5,  5, 16, 24, 0 );
+   dH[ 1] = DH1R(  1,  7,  8, 17, 25, 1 );
+   dH[ 2] = DH1R(  2,  5,  5, 18, 26, 2 );
+   dH[ 3] = DH1R(  3,  1,  5, 19, 27, 3 );
+   dH[ 4] = DH1R(  4,  3,  0, 20, 28, 4 );
+   dH[ 5] = DH1L(  5,  6,  6, 21, 29, 5 );
+   dH[ 6] = DH1R(  6,  4,  6, 22, 30, 6 );
+   dH[ 7] = DH1R(  7, 11,  2, 23, 31, 7 );
+   dH[ 8] = DH2L(  8,  9,  8,  4, 24, 23,  8 );
+   dH[ 9] = DH2R(  9, 10,  6,  5, 25, 16,  9 );
+   dH[10] = DH2L( 10, 11,  6,  6, 26, 17, 10 );
+   dH[11] = DH2L( 11, 12,  4,  7, 27, 18, 11 );
+   dH[12] = DH2R( 12, 13,  3,  0, 28, 19, 12 );
+   dH[13] = DH2R( 13, 14,  4,  1, 29, 20, 13 );
+   dH[14] = DH2R( 14, 15,  7,  2, 30, 21, 14 );
+   dH[15] = DH2R( 15, 16,  2,  3, 31, 22, 15 );
+
+#undef DH1L
+#undef DH1R
+#undef DH2L
+#undef DH2R
+}

 static const __m256i final_b[16] =
 {
@@ -1047,7 +1013,7 @@ bmw512_4way_init(void *cc)
 }

 void
-bmw512_4way(void *cc, const void *data, size_t len)
+bmw512_4way_update(void *cc, const void *data, size_t len)
 {
 	bmw64_4way(cc, data, len);
 }
@@ -1137,8 +1103,6 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
                     s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
      add_elt_b8( M, H, (i)-16 ) )

-
-
 #define W8b0 \
   _mm512_add_epi64( \
      _mm512_add_epi64( \
@@ -1328,21 +1292,28 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
           mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
           mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

-#define DH1( m, sl, sr, a, b, c ) \
+#define DH1L( m, sl, sr, a, b, c ) \
   _mm512_add_epi64( \
               _mm512_xor_si512( M[m], \
                  _mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
                                    _mm512_srli_epi64( qt[a], sr ) ) ), \
               _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )

-#define DHL( m, rl, sl, h, a, b, c ) \
+#define DH1R( m, sl, sr, a, b, c ) /* ( M[m] ^ (xh >> sl) ^ (qt[a] << sr) ) + ( xl ^ qt[b] ^ qt[c] ), on 64-bit elements; uses M, qt, xh, xl from the enclosing scope */ \
+   _mm512_add_epi64( \
+               _mm512_xor_si512( M[m], \
+                  _mm512_xor_si512( _mm512_srli_epi64( xh, sl ), /* note: 'sl' is the RIGHT shift count applied to xh */ \
+                                    _mm512_slli_epi64( qt[a], sr ) ) ), /* and 'sr' is the LEFT shift count applied to qt[a] */ \
+               _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+
+#define DH2L( m, rl, sl, h, a, b, c ) \
   _mm512_add_epi64( _mm512_add_epi64( \
       mm512_rol_64( dH[h], rl ), \
          _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
                 _mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
                                   _mm512_xor_si512( qt[b], qt[c] ) ) );
   
-#define DHR( m, rl, sr, h, a, b, c ) \
+#define DH2R( m, rl, sr, h, a, b, c ) \
   _mm512_add_epi64( _mm512_add_epi64( \
       mm512_rol_64( dH[h], rl ), \
          _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
@@ -1350,26 +1321,27 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
                                   _mm512_xor_si512( qt[b], qt[c] ) ) );


-   dH[ 0] = DH1(  0,  5,  5, 16, 24, 0 );
-   dH[ 1] = DH1(  1,  7,  8, 17, 25, 1 );
-   dH[ 2] = DH1(  2,  5,  5, 18, 26, 2 );
-   dH[ 3] = DH1(  3,  1,  5, 19, 27, 3 );
-   dH[ 4] = DH1(  4,  3,  0, 20, 28, 4 );
-   dH[ 5] = DH1(  5,  6,  6, 21, 29, 5 );
-   dH[ 6] = DH1(  6,  4,  6, 22, 30, 6 );
-   dH[ 7] = DH1(  7, 11,  2, 23, 31, 7 );
-   dH[ 8] = DHL(  8,  9,  8,  4, 24, 23,  8 );
-   dH[ 9] = DHR(  9, 10,  6,  5, 25, 16,  9 );
-   dH[10] = DHL( 10, 11,  6,  6, 26, 17, 10 );
-   dH[11] = DHL( 11, 12,  4,  7, 27, 18, 11 );
-   dH[12] = DHR( 12, 13,  3,  0, 28, 19, 12 );
-   dH[13] = DHR( 13, 14,  4,  1, 29, 20, 13 );
-   dH[14] = DHR( 14, 15,  7,  2, 30, 21, 14 );
-   dH[15] = DHR( 15, 16,  2,  3, 31, 22, 15 );
+   dH[ 0] = DH1L(  0,  5,  5, 16, 24, 0 );
+   dH[ 1] = DH1R(  1,  7,  8, 17, 25, 1 );
+   dH[ 2] = DH1R(  2,  5,  5, 18, 26, 2 );
+   dH[ 3] = DH1R(  3,  1,  5, 19, 27, 3 );
+   dH[ 4] = DH1R(  4,  3,  0, 20, 28, 4 );
+   dH[ 5] = DH1L(  5,  6,  6, 21, 29, 5 );
+   dH[ 6] = DH1R(  6,  4,  6, 22, 30, 6 );
+   dH[ 7] = DH1R(  7, 11,  2, 23, 31, 7 );
+   dH[ 8] = DH2L(  8,  9,  8,  4, 24, 23,  8 );
+   dH[ 9] = DH2R(  9, 10,  6,  5, 25, 16,  9 );
+   dH[10] = DH2L( 10, 11,  6,  6, 26, 17, 10 );
+   dH[11] = DH2L( 11, 12,  4,  7, 27, 18, 11 );
+   dH[12] = DH2R( 12, 13,  3,  0, 28, 19, 12 );
+   dH[13] = DH2R( 13, 14,  4,  1, 29, 20, 13 );
+   dH[14] = DH2R( 14, 15,  7,  2, 30, 21, 14 );
+   dH[15] = DH2R( 15, 16,  2,  3, 31, 22, 15 );

-#undef DH1
-#undef DHL
-#undef DHR
+#undef DH1L
+#undef DH1R
+#undef DH2L
+#undef DH2R
         
 }

--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -26,6 +26,186 @@ static const uint64_t IV512[] =
 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
 };

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// 4 way 128 is handy to avoid reinterleaving in many algos.
+// If reinterleaving is necessary it may be more efficient to use
+// 2 way 256. The same transform code should work for both.
+
+static void transform_4way( cube_4way_context *sp )   // Runs sp->rounds rounds over the eight 512-bit state vectors in sp->h, in place.
+{
+    int r;
+    const int rounds = sp->rounds;
+
+    __m512i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
+
+    x0 = _mm512_load_si512( (__m512i*)sp->h     );   // work on local copies; they are stored back after the loop
+    x1 = _mm512_load_si512( (__m512i*)sp->h + 1 );
+    x2 = _mm512_load_si512( (__m512i*)sp->h + 2 );
+    x3 = _mm512_load_si512( (__m512i*)sp->h + 3 );
+    x4 = _mm512_load_si512( (__m512i*)sp->h + 4 );
+    x5 = _mm512_load_si512( (__m512i*)sp->h + 5 );
+    x6 = _mm512_load_si512( (__m512i*)sp->h + 6 );
+    x7 = _mm512_load_si512( (__m512i*)sp->h + 7 );
+
+    for ( r = 0; r < rounds; ++r )
+    {
+        x4 = _mm512_add_epi32( x0, x4 );   // x4..x7 += x0..x3 on 32-bit elements
+        x5 = _mm512_add_epi32( x1, x5 );
+        x6 = _mm512_add_epi32( x2, x6 );
+        x7 = _mm512_add_epi32( x3, x7 );
+        y0 = x0;   // keep x0/x1: the next four lines also exchange the pairs (x0,x1) and (x2,x3)
+        y1 = x1;
+        x0 = mm512_rol_32( x2, 7 );
+        x1 = mm512_rol_32( x3, 7 );
+        x2 = mm512_rol_32( y0, 7 );
+        x3 = mm512_rol_32( y1, 7 );
+        x0 = _mm512_xor_si512( x0, x4 );   // x0..x3 ^= x4..x7
+        x1 = _mm512_xor_si512( x1, x5 );
+        x2 = _mm512_xor_si512( x2, x6 );
+        x3 = _mm512_xor_si512( x3, x7 );
+        x4 = mm512_swap128_64( x4 );   // project helper; presumably swaps the 64-bit halves of each 128-bit lane — confirm in simd-utils
+        x5 = mm512_swap128_64( x5 );
+        x6 = mm512_swap128_64( x6 );
+        x7 = mm512_swap128_64( x7 );
+        x4 = _mm512_add_epi32( x0, x4 );   // second half of the round: add again
+        x5 = _mm512_add_epi32( x1, x5 );
+        x6 = _mm512_add_epi32( x2, x6 );
+        x7 = _mm512_add_epi32( x3, x7 );
+        y0 = x0;   // this time the exchange is within pairs: x0 <-> x1 and x2 <-> x3
+        y1 = x2;
+        x0 = mm512_rol_32( x1, 11 );
+        x1 = mm512_rol_32( y0, 11 );
+        x2 = mm512_rol_32( x3, 11 );
+        x3 = mm512_rol_32( y1, 11 );
+        x0 = _mm512_xor_si512( x0, x4 );
+        x1 = _mm512_xor_si512( x1, x5 );
+        x2 = _mm512_xor_si512( x2, x6 );
+        x3 = _mm512_xor_si512( x3, x7 );
+        x4 = mm512_swap64_32( x4 );   // project helper; presumably swaps the 32-bit halves of each 64-bit element — confirm in simd-utils
+        x5 = mm512_swap64_32( x5 );
+        x6 = mm512_swap64_32( x6 );
+        x7 = mm512_swap64_32( x7 );
+    }
+
+    _mm512_store_si512( (__m512i*)sp->h,     x0 );   // write the updated state back to the context
+    _mm512_store_si512( (__m512i*)sp->h + 1, x1 );
+    _mm512_store_si512( (__m512i*)sp->h + 2, x2 );
+    _mm512_store_si512( (__m512i*)sp->h + 3, x3 );
+    _mm512_store_si512( (__m512i*)sp->h + 4, x4 );
+    _mm512_store_si512( (__m512i*)sp->h + 5, x5 );
+    _mm512_store_si512( (__m512i*)sp->h + 6, x6 );
+    _mm512_store_si512( (__m512i*)sp->h + 7, x7 );
+}
+
+int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,   // Prepare *sp for a new hash: store the parameters and load the IV; always returns 0.
+                    int blockbytes )
+{
+    __m512i *h = (__m512i*)sp->h;
+    __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512   // any hashbitlen other than 512 selects IV256
+                                                : (__m128i*)IV256 );
+    sp->hashlen   = hashbitlen/128;   // digest length in 128-bit units
+    sp->blocksize = blockbytes/16;   // block length in 128-bit units
+    sp->rounds    = rounds;
+    sp->pos       = 0;
+
+    h[ 0] = m512_const1_128( iv[0] );   // presumably replicates the 128-bit IV word into all four 128-bit lanes — confirm m512_const1_128 in simd-utils
+    h[ 1] = m512_const1_128( iv[1] );
+    h[ 2] = m512_const1_128( iv[2] );
+    h[ 3] = m512_const1_128( iv[3] );
+    h[ 4] = m512_const1_128( iv[4] );
+    h[ 5] = m512_const1_128( iv[5] );
+    h[ 6] = m512_const1_128( iv[6] );
+    h[ 7] = m512_const1_128( iv[7] );
+    h[ 0] = m512_const1_128( iv[0] );   // NOTE(review): this second group repeats the eight assignments above with identical operands; it looks redundant
+    h[ 1] = m512_const1_128( iv[1] );
+    h[ 2] = m512_const1_128( iv[2] );
+    h[ 3] = m512_const1_128( iv[3] );
+    h[ 4] = m512_const1_128( iv[4] );
+    h[ 5] = m512_const1_128( iv[5] );
+    h[ 6] = m512_const1_128( iv[6] );
+    h[ 7] = m512_const1_128( iv[7] );
+
+    return 0;
+}
+
+int cube_4way_update( cube_4way_context *sp, const void *data, size_t size )   // XOR input vectors into the state, transforming after each full block; always returns 0.
+{
+    const int len = size >> 4;   // number of 16-byte units; any size % 16 remainder is ignored
+    const __m512i *in = (__m512i*)data;   // one __m512i is read per 16-byte unit, i.e. 4*size bytes in total
+    int i;
+
+    for ( i = 0; i < len; i++ )
+    {
+        sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )   // a whole block has been absorbed
+        {
+           transform_4way( sp );
+           sp->pos = 0;
+        }
+    }
+    return 0;
+}
+
+int cube_4way_close( cube_4way_context *sp, void *output )   // Pad, finalize and copy the digest to output; always returns 0.
+{
+    __m512i *hash = (__m512i*)output;
+    int i;
+
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
+                                 m512_const2_64( 0, 0x0000000000000080 ) );   // XOR the 0x80 padding marker into the current state vector
+    transform_4way( sp );
+
+    sp->h[7] = _mm512_xor_si512( sp->h[7],
+                                 m512_const2_64( 0x0000000100000000, 0 ) );   // XOR the finalization constant into the last state vector
+
+    for ( i = 0; i < 10; ++i ) 
+       transform_4way( sp );   // ten further transforms before the state is read out
+
+    memcpy( hash, sp->h, sp->hashlen<<6 );   // copies hashlen * 64 bytes, i.e. the first hashlen state vectors
+    return 0;
+}
+
+int cube_4way_update_close( cube_4way_context *sp, void *output,   // Same statements as cube_4way_update followed by cube_4way_close, in one call; always returns 0.
+                               const void *data, size_t size )
+{
+    const int len = size >> 4;   // number of 16-byte units; any size % 16 remainder is ignored
+    const __m512i *in = (__m512i*)data;   // one __m512i is read per 16-byte unit
+    __m512i *hash = (__m512i*)output;
+    int i;
+
+    for ( i = 0; i < len; i++ )
+    {
+        sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )   // a whole block has been absorbed
+        {
+           transform_4way( sp );
+           sp->pos = 0;
+        }
+    }
+
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
+                                    m512_const2_64( 0, 0x0000000000000080 ) );   // XOR the 0x80 padding marker into the current state vector
+    transform_4way( sp );
+
+    sp->h[7] = _mm512_xor_si512( sp->h[7],
+                                    m512_const2_64( 0x0000000100000000, 0 ) );   // XOR the finalization constant into the last state vector
+
+    for ( i = 0; i < 10; ++i )
+       transform_4way( sp );   // ten further transforms before the state is read out
+
+    memcpy( hash, sp->h, sp->hashlen<<6);   // copies hashlen * 64 bytes, i.e. the first hashlen state vectors
+    return 0;
+}
+
+
+#endif // AVX512
+
+// 2 way 128 

 static void transform_2way( cube_2way_context *sp )
 {
@@ -59,10 +239,10 @@ static void transform_2way( cube_2way_context *sp )
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
        x3 = _mm256_xor_si256( x3, x7 );
-        x4 = mm256_swap64_128( x4 );
-        x5 = mm256_swap64_128( x5 );
-        x6 = mm256_swap64_128( x6 );
-        x7 = mm256_swap64_128( x7 );
+        x4 = mm256_swap128_64( x4 );
+        x5 = mm256_swap128_64( x5 );
+        x6 = mm256_swap128_64( x6 );
+        x7 = mm256_swap128_64( x7 );
        x4 = _mm256_add_epi32( x0, x4 );
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
@@ -77,10 +257,10 @@ static void transform_2way( cube_2way_context *sp )
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
        x3 = _mm256_xor_si256( x3, x7 );
-        x4 = mm256_swap32_64( x4 );
-        x5 = mm256_swap32_64( x5 );
-        x6 = mm256_swap32_64( x6 );
-        x7 = mm256_swap32_64( x7 );
+        x4 = mm256_swap64_32( x4 );
+        x5 = mm256_swap64_32( x5 );
+        x6 = mm256_swap64_32( x6 );
+        x7 = mm256_swap64_32( x7 );
    }

    _mm256_store_si256( (__m256i*)sp->h,     x0 );
@@ -91,7 +271,6 @@ static void transform_2way( cube_2way_context *sp )
    _mm256_store_si256( (__m256i*)sp->h + 5, x5 );
    _mm256_store_si256( (__m256i*)sp->h + 6, x6 );
    _mm256_store_si256( (__m256i*)sp->h + 7, x7 );
-
 }

 int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
@@ -132,9 +311,6 @@ int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
    const __m256i *in = (__m256i*)data;
    int i;

-    // It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
-    // Current usage sata is either 64 or 80 bytes.
-
    for ( i = 0; i < len; i++ )
    {
        sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -1,11 +1,38 @@
 #ifndef CUBE_HASH_2WAY_H__
-#define CUBE_HASH_2WAY_H__
-
-#if defined(__AVX2__)
+#define CUBE_HASH_2WAY_H__ 1

 #include <stdint.h>
 #include "simd-utils.h"

+#if defined(__AVX2__)
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+struct _cube_4way_context
+{
+    __m512i h[8];          // hash state, eight 512-bit vectors
+    int hashlen;           // set by cube_4way_init to hashbitlen/128
+    int rounds;            // rounds executed per transform
+    int blocksize;         // set by cube_4way_init to blockbytes/16
+    int pos;               // index of the h[] vector that receives the next input vector
+} __attribute__ ((aligned (128)));
+
+typedef struct _cube_4way_context cube_4way_context;
+
+int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
+                       int blockbytes );
+// Reinitialize context with the same parameters, much faster. NOTE(review): no cube_4way_reinit definition appears among the 4-way functions added to cube-hash-2way.c in this change; confirm one exists before calling it.
+int cube_4way_reinit( cube_4way_context *sp );
+
+int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
+
+int cube_4way_close( cube_4way_context *sp, void *output );
+
+int cube_4way_update_close( cube_4way_context *sp, void *output,
+                            const void *data, size_t size );
+
+#endif
+
 // 2x128, 2 way parallel SSE2

 struct _cube_2way_context
@@ -15,7 +42,7 @@ struct _cube_2way_context
    int rounds;
    int blocksize;         // __m128i
    int pos;               // number of __m128i read into x from current block
-} __attribute__ ((aligned (64)));
+} __attribute__ ((aligned (128)));

 typedef struct _cube_2way_context cube_2way_context;

--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -39,8 +39,8 @@ static void transform( cubehashParam *sp )
        x1 = mm256_rol_32( y0, 7 );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = mm256_swap64_128( x2 );
-        x3 = mm256_swap64_128( x3 );
+        x2 = mm256_swap128_64( x2 );
+        x3 = mm256_swap128_64( x3 );
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
        y0 = mm256_swap_128( x0 );
@@ -49,8 +49,8 @@ static void transform( cubehashParam *sp )
        x1 = mm256_rol_32( y1, 11 );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = mm256_swap32_64( x2 );
-        x3 = mm256_swap32_64( x3 );
+        x2 = mm256_swap64_32( x2 );
+        x3 = mm256_swap64_32( x3 );
    }

    _mm256_store_si256( (__m256i*)sp->x,     x0 );
--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -528,6 +528,346 @@ static const sph_u32 T512[64][16] = {
 	  SPH_C32(0xe7e00a94) }
 };

+#define s0   m0   // s0..sF name the 16 state words used by the round macros; each is an alias for one of the
+#define s1   c0   // local variables m0..m7 / c0..c7 that the including function declares
+#define s2   m1
+#define s3   c1
+#define s4   c2
+#define s5   m2
+#define s6   c3
+#define s7   m3
+#define s8   m4
+#define s9   c4
+#define sA   m5
+#define sB   c5
+#define sC   c6
+#define sD   m6
+#define sE   c7
+#define sF   m7
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// Hamsi 8 way 
+
+#define INPUT_BIG8 /* Build m0..m7 from the 64 bits in each 64-bit element of *buf; uses buf and m0..m7 from the enclosing scope */ \
+do { \
+  __m512i db = *buf; \
+  const uint64_t *tp = (uint64_t*)&T512[0][0];  /* T512 rows are read as eight 64-bit words each */ \
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
+  for ( int u = 0; u < 64; u++ ) /* one iteration per message bit, lowest bit first */ \
+  { \
+     __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; /* isolate the current bit (m512_one_64 presumably holds 1 in every 64-bit element) */ \
+     dm = mm512_negate_32( _mm512_or_si512( dm, /* copy the bit to both 32-bit halves, then negate: presumably all-ones where the bit is set, zero otherwise */ \
+                                          _mm512_slli_epi64( dm, 32 ) ) ); \
+     m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, /* conditionally XOR this row's words into m0..m7 */ \
+                                          m512_const1_64( tp[0] ) ) ); \
+     m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
+                                          m512_const1_64( tp[1] ) ) ); \
+     m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
+                                          m512_const1_64( tp[2] ) ) ); \
+     m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
+                                          m512_const1_64( tp[3] ) ) ); \
+     m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
+                                          m512_const1_64( tp[4] ) ) ); \
+     m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
+                                          m512_const1_64( tp[5] ) ) ); \
+     m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
+                                          m512_const1_64( tp[6] ) ) ); \
+     m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
+                                          m512_const1_64( tp[7] ) ) ); \
+     tp += 8; /* advance to the next T512 row */ \
+     db = _mm512_srli_epi64( db, 1 ); /* move the next message bit into position 0 */ \
+  } \
+} while (0)
+
+#define SBOX8( a, b, c, d ) /* S-box built only from AND/OR/XOR and mm512_not on whole vectors; a, b, c, d are overwritten and each is evaluated several times, so pass plain variables */ \
+do { \
+  __m512i t; \
+  t = a; \
+  a = _mm512_and_si512( a, c ); \
+  a = _mm512_xor_si512( a, d ); \
+  c = _mm512_xor_si512( c, b ); \
+  c = _mm512_xor_si512( c, a ); \
+  d = _mm512_or_si512( d, t ); \
+  d = _mm512_xor_si512( d, b ); \
+  t = _mm512_xor_si512( t, c ); \
+  b = d; \
+  d = _mm512_or_si512( d, t ); \
+  d = _mm512_xor_si512( d, a ); \
+  a = _mm512_and_si512( a, b ); \
+  t = _mm512_xor_si512( t, a ); \
+  b = _mm512_xor_si512( b, d ); \
+  b = _mm512_xor_si512( b, t ); \
+  a = c; /* final permutation of the four outputs */ \
+  c = b; \
+  b = d; \
+  d = mm512_not( t ); \
+} while (0)
+
+#define L8( a, b, c, d ) /* Mixing step made of 32-bit rotates (mm512_rol_32), 32-bit left shifts and XORs; a, b, c, d are overwritten and evaluated several times, so pass plain variables */ \
+do { \
+   a = mm512_rol_32( a, 13 ); \
+   c = mm512_rol_32( c,  3 ); \
+   b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
+   d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
+                                              _mm512_slli_epi32( a, 3 ) ) ); \
+   b = mm512_rol_32( b, 1 ); \
+   d = mm512_rol_32( d, 7 ); \
+   a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
+   c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
+                                              _mm512_slli_epi32( b, 7 ) ) ); \
+   a = mm512_rol_32( a,  5 ); \
+   c = mm512_rol_32( c, 22 ); \
+} while (0)
+
+#define DECL_STATE_BIG8 /* declares the chaining variables c0..c7; the expansion already ends in ';', so it is invoked without one */ \
+   __m512i c0, c1, c2, c3, c4, c5, c6, c7;
+
+#define READ_STATE_BIG8(sc) /* load sc->h[0..7] into the local chaining variables c0..c7 */ \
+do { \
+   c0 = sc->h[0x0]; \
+   c1 = sc->h[0x1]; \
+   c2 = sc->h[0x2]; \
+   c3 = sc->h[0x3]; \
+   c4 = sc->h[0x4]; \
+   c5 = sc->h[0x5]; \
+   c6 = sc->h[0x6]; \
+   c7 = sc->h[0x7]; \
+} while (0)
+
+#define WRITE_STATE_BIG8(sc) /* store the local chaining variables c0..c7 back into sc->h[0..7] */ \
+do { \
+   sc->h[0x0] = c0; \
+   sc->h[0x1] = c1; \
+   sc->h[0x2] = c2; \
+   sc->h[0x3] = c3; \
+   sc->h[0x4] = c4; \
+   sc->h[0x5] = c5; \
+   sc->h[0x6] = c6; \
+   sc->h[0x7] = c7; \
+} while (0)
+
+
+#define ROUND_BIG8(rc, alpha) /* One round on s0..sF: XOR in the constants from alpha (with rc mixed into the first word), apply SBOX8 four times, then apply L8 six times to vectors assembled from 32-bit halves of the state */ \
+do { \
+   __m512i t0, t1, t2, t3; \
+   s0 = _mm512_xor_si512( s0, m512_const1_64( /* NOTE(review): alpha is reinterpreted as uint64_t*; this relies on its alignment and on the compiler tolerating the type pun — confirm */ \
+                   ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
+   s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
+   s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
+   s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
+   s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
+   s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
+   s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
+   s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
+   s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
+   s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
+   sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
+   sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
+   sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
+   sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
+   sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
+   sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
+\
+  SBOX8( s0, s4, s8, sC ); \
+  SBOX8( s1, s5, s9, sD ); \
+  SBOX8( s2, s6, sA, sE ); \
+  SBOX8( s3, s7, sB, sF ); \
+\
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), /* gather 32-bit halves of two state words into one temporary for L8 */ \
+                                        _mm512_bslli_epi128( s5, 4 ) ); \
+  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
+                                        _mm512_bslli_epi128( sE, 4 ) ); \
+  L8( s0, t1, s9, t3 ); \
+  s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); /* scatter the L8 results back into the halves they were taken from */ \
+  s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
+  sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
+  sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
+\
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
+                                        _mm512_bslli_epi128( s6, 4 ) ); \
+  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
+                                        _mm512_bslli_epi128( sF, 4 ) ); \
+  L8( s1, t1, sA, t3 ); \
+  s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
+  s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
+  sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
+  sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
+\
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
+                                        _mm512_bslli_epi128( s7, 4 ) ); \
+  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
+                                        _mm512_bslli_epi128( sC, 4 ) ); \
+  L8( s2, t1, sB, t3 ); \
+  s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
+  s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
+  sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
+  sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
+\
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
+                                        _mm512_bslli_epi128( s4, 4 ) ); \
+  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
+                                        _mm512_bslli_epi128( sD, 4 ) ); \
+  L8( s3, t1, s8, t3 ); \
+  s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
+  s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
+  sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
+  sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
+\
+  t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); /* last two L8 calls: all four operands are assembled temporaries */ \
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
+  t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
+  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
+                                        _mm512_bslli_epi128( sB, 4 ) ); \
+  L8( t0, t1, t2, t3 ); \
+  s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
+  s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
+  s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
+  s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
+  s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
+  sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
+  s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
+  sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
+\
+  t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
+                                        _mm512_bslli_epi128( sD, 4 ) ); \
+  t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
+  t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
+  L8( t0, t1, t2, t3 ); \
+  s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
+  sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
+  s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
+  sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
+  s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
+  sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
+  s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
+  sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
+} while (0)
+
+#define P_BIG8 /* six rounds (rc = 0..5) using the alpha_n constants */ \
+do { \
+   ROUND_BIG8(0, alpha_n); \
+   ROUND_BIG8(1, alpha_n); \
+   ROUND_BIG8(2, alpha_n); \
+   ROUND_BIG8(3, alpha_n); \
+   ROUND_BIG8(4, alpha_n); \
+   ROUND_BIG8(5, alpha_n); \
+} while (0)
+
+#define PF_BIG8 /* twelve rounds (rc = 0..11) using the alpha_f constants; used by hamsi_8way_big_final */ \
+do { \
+   ROUND_BIG8( 0, alpha_f); \
+   ROUND_BIG8( 1, alpha_f); \
+   ROUND_BIG8( 2, alpha_f); \
+   ROUND_BIG8( 3, alpha_f); \
+   ROUND_BIG8( 4, alpha_f); \
+   ROUND_BIG8( 5, alpha_f); \
+   ROUND_BIG8( 6, alpha_f); \
+   ROUND_BIG8( 7, alpha_f); \
+   ROUND_BIG8( 8, alpha_f); \
+   ROUND_BIG8( 9, alpha_f); \
+   ROUND_BIG8(10, alpha_f); \
+   ROUND_BIG8(11, alpha_f); \
+} while (0)
+
+#define T_BIG8 /* XOR s0..s3 and s8..sB into sc->h[0..7] and reload c0..c7 with the result; uses sc from the enclosing scope */ \
+do { /* order is important: s1, s3, s9 and sB are aliases of c0, c1, c4 and c5, so each cN is overwritten only after the sX that aliases it has been read */ \
+   c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
+   c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
+   c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
+   c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
+   c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
+   c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
+   c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
+   c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
+} while (0)
+
+void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )   // Absorb num message vectors from buf into sc and advance the bit counter.
+{
+   DECL_STATE_BIG8
+   uint32_t tmp = num << 6;   // 64 counted per vector; the close function treats the counter as a bit count
+
+   sc->count_low = SPH_T32( sc->count_low + tmp );
+   sc->count_high += (sph_u32)( (num >> 13) >> 13 );   // num >> 26: the part of num << 6 that does not fit in 32 bits
+   if ( sc->count_low < tmp )   // the low word wrapped around
+      sc->count_high++;
+
+   READ_STATE_BIG8( sc );
+   while ( num-- > 0 )
+   {
+      __m512i m0, m1, m2, m3, m4, m5, m6, m7;
+
+      INPUT_BIG8;   // expand *buf into m0..m7
+      P_BIG8;   // six rounds
+      T_BIG8;   // fold the result into sc->h and c0..c7
+      buf++;
+   }
+   WRITE_STATE_BIG8( sc );
+}
+
+void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )   // Process the single vector *buf with the twelve-round PF_BIG8 permutation; the counters are not touched.
+{
+   __m512i m0, m1, m2, m3, m4, m5, m6, m7;
+   DECL_STATE_BIG8
+   READ_STATE_BIG8( sc );
+   INPUT_BIG8;   // expand *buf into m0..m7
+   PF_BIG8;
+   T_BIG8;   // fold the result into sc->h and c0..c7
+   WRITE_STATE_BIG8( sc );
+}
+
+
+void hamsi512_8way_init( hamsi_8way_big_context *sc )   // Reset the counters and load the initial chaining value into sc->h.
+{
+   sc->partial_len = 0;
+   sc->count_high = sc->count_low = 0;
+
+   sc->h[0] = m512_const1_64( 0x6c70617273746565 );   // each constant is presumably replicated to all eight 64-bit elements — confirm m512_const1_64 in simd-utils
+   sc->h[1] = m512_const1_64( 0x656e62656b204172 );
+   sc->h[2] = m512_const1_64( 0x302c206272672031 );
+   sc->h[3] = m512_const1_64( 0x3434362c75732032 );
+   sc->h[4] = m512_const1_64( 0x3030312020422d33 );
+   sc->h[5] = m512_const1_64( 0x656e2d484c657576 );
+   sc->h[6] = m512_const1_64( 0x6c65652c65766572 );
+   sc->h[7] = m512_const1_64( 0x6769756d2042656c );
+}
+
+void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,   // Absorb len/8 whole vectors from data and record the leftover byte count.
+                           size_t len )
+{
+   __m512i *vdata = (__m512i*)data;
+
+   hamsi_8way_big( sc, vdata, len>>3 );   // one vector per 8 bytes of len
+   vdata += ( (len& ~(size_t)7) >> 3 );   // step past the vectors just processed
+   len &= (size_t)7;   // leftover byte count, 0..7
+   memcpy_512( sc->buf, vdata, len>>3 );   // NOTE(review): len < 8 here, so len>>3 is always 0; if the third argument is a vector count nothing is ever copied — confirm callers only pass multiples of 8
+   sc->partial_len = len;
+}
+
+void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )   // Append the padding and length blocks, then write the digest to dst.
+{
+   __m512i pad[1];
+   int ch, cl;
+
+   sph_enc32be( &ch, sc->count_high );   // the counter is captured here, before the padding block below advances it
+   sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );   // partial_len is in bytes, hence << 3
+   pad[0] =  _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch,   // arguments run from the highest element down, so every 64-bit element gets ch in its low half and cl in its high half
+                               cl, ch, cl, ch, cl, ch, cl, ch );
+//   pad[0] =  m512_const2_32( cl, ch );
+   sc->buf[0] = m512_const1_64( 0x80 );   // padding block; this overwrites whatever sc->buf held
+   hamsi_8way_big( sc, sc->buf, 1 );
+   hamsi_8way_big_final( sc, pad );
+
+   mm512_block_bswap_32( (__m512i*)dst, sc->h );   // presumably byte-swaps each 32-bit element of sc->h[0..7] into dst — confirm in simd-utils
+}
+
+
+#endif // AVX512
+
+
+// Hamsi 4 way

 #define INPUT_BIG \
 do { \
@@ -627,6 +967,7 @@ do { \
   sc->h[0x7] = c7; \
 } while (0)

+/*
 #define s0   m0
 #define s1   c0
 #define s2   m1
@@ -643,42 +984,28 @@ do { \
 #define sD   m6
 #define sE   c7
 #define sF   m7
+*/

 #define ROUND_BIG(rc, alpha) \
 do { \
   __m256i t0, t1, t2, t3; \
   s0 = _mm256_xor_si256( s0, m256_const1_64( \
-        ( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \
-   s1 = _mm256_xor_si256( s1, m256_const1_64( \
-        ( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \
-   s2 = _mm256_xor_si256( s2, m256_const1_64( \
-        ( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \
-   s3 = _mm256_xor_si256( s3, m256_const1_64( \
-        ( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \
-   s4 = _mm256_xor_si256( s4, m256_const1_64( \
-        ( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \
-   s5 = _mm256_xor_si256( s5, m256_const1_64( \
-        ( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \
-   s6 = _mm256_xor_si256( s6, m256_const1_64( \
-        ( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \
-   s7 = _mm256_xor_si256( s7, m256_const1_64( \
-        ( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \
-   s8 = _mm256_xor_si256( s8, m256_const1_64( \
-        ( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \
-   s9 = _mm256_xor_si256( s9, m256_const1_64( \
-        ( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \
-   sA = _mm256_xor_si256( sA, m256_const1_64( \
-        ( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \
-   sB = _mm256_xor_si256( sB, m256_const1_64( \
-        ( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \
-   sC = _mm256_xor_si256( sC, m256_const1_64( \
-        ( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \
-   sD = _mm256_xor_si256( sD, m256_const1_64( \
-        ( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \
-   sE = _mm256_xor_si256( sE, m256_const1_64( \
-        ( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \
-   sF = _mm256_xor_si256( sF, m256_const1_64( \
-        ( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \
+                   ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
+   s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
+   s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
+   s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
+   s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
+   s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
+   s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
+   s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
+   s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
+   s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
+   sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
+   sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
+   sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
+   sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
+   sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
+   sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
 \
  SBOX( s0, s4, s8, sC ); \
  SBOX( s1, s5, s9, sD ); \
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -60,9 +60,32 @@ typedef struct {
 typedef hamsi_4way_big_context hamsi512_4way_context;

 void hamsi512_4way_init( hamsi512_4way_context *sc );
-void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
+void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
+      size_t len );
+#define hamsi512_4way hamsi512_4way_update
 void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+typedef struct {
+   __m512i h[8];   // chaining value
+   __m512i buf[1];   // one-vector staging buffer, used for the padding block on close
+   size_t partial_len;   // leftover byte count recorded by hamsi512_8way_update
+   sph_u32 count_high, count_low;   // message bit counter, split into two 32-bit words
+} hamsi_8way_big_context;
+
+typedef hamsi_8way_big_context hamsi512_8way_context;
+
+void hamsi512_8way_init( hamsi512_8way_context *sc );
+void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
+                           size_t len );
+void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
+
+
+
+#endif
+
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/haval/haval-8way-helper.c
+++ b/algo/haval/haval-8way-helper.c
@@ -0,0 +1,115 @@
+/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
+/*
+ * Helper code, included by the 8-way HAVAL implementation (once, with PASSES=5).
+ *
+ * TODO: try to merge this with md_helper.c.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#undef SPH_XCAT
+#define SPH_XCAT(a, b)    SPH_XCAT_(a, b)
+#undef SPH_XCAT_
+#define SPH_XCAT_(a, b)   a ## b
+
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_update)
+( haval_8way_context *sc, const void *data, size_t len )
+{  // Buffers input in sc->buf and runs the PASSES-pass core each time the buffer reaches offset 128.
+   __m256i *vdata = (__m256i*)data;
+   unsigned current;
+
+   current = (unsigned)sc->count_low & 127U;   // current fill offset, recovered from the running count
+   while ( len > 0 )
+   {
+      unsigned clen;
+      uint32_t clow, clow2;
+
+      clen = 128U - current;   // room left in the buffer...
+      if ( clen > len )
+         clen = len;           // ...capped at what the caller supplied
+      memcpy_256( sc->buf + (current>>2), vdata, clen>>2 );   // offsets/lengths are divided by 4 to index __m256i
+      vdata += clen>>2;        // NOTE(review): any remainder of clen modulo 4 is not copied - presumably len is a multiple of 4
+      current += clen;
+      len -= clen;
+      if ( current == 128U )   // buffer full: compress it
+      {
+         DSTATE_8W;
+         IN_PREPARE_8W(sc->buf);
+         RSTATE_8W;            // s0..s7 <- sc
+         SPH_XCAT(CORE_8W, PASSES)(INW_8W);
+         WSTATE_8W;            // sc <- s0..s7
+         current = 0;
+      }
+      clow = sc->count_low;
+      clow2 = clow + clen;
+      sc->count_low = clow2;
+      if ( clow2 < clow )      // 32-bit counter wrapped: carry into the high word
+         sc->count_high ++;
+   }
+}
+
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
+                                                void *dst)
+{  // Pads the buffered input, appends the three trailer words, compresses, then writes s0..s7 to dst.
+   unsigned current;
+   DSTATE_8W;
+
+   current = (unsigned)sc->count_low & 127UL;   // fill offset of sc->buf
+
+   sc->buf[ current>>2 ] = m256_one_32;   // first padding word (presumably 1 in every 32-bit lane - confirm in simd-utils)
+   current += 4;   
+   RSTATE_8W;
+   if ( current > 116UL )   // the trailer occupies offsets 116..127; no room left, so flush one extra block
+   {
+      memset_zero_256( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
+      do   // do/while(0) scopes the pointer declared by IN_PREPARE_8W so it can be declared again below
+      {
+         IN_PREPARE_8W(sc->buf);
+         SPH_XCAT(CORE_8W, PASSES)(INW_8W);
+      } while (0);
+      current = 0;
+   }
+
+   uint32_t t1, t2;
+   memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );   // zero up to the trailer
+   t1 = 0x01 | (PASSES << 3);
+   t2 = sc->olen << 3;
+   sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );   // t1 in bits 16..23, t2 in bits 24..31
+   sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );            // 64-bit count scaled by 8, low word
+   sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
+                                     | (sc->count_low >> 29) );            // high word, with the bits carried out of count_low
+   do
+   {
+      IN_PREPARE_8W(sc->buf);
+      SPH_XCAT(CORE_8W, PASSES)(INW_8W);
+   } while (0);
+   WSTATE_8W;   // store the final state so haval_8way_out can read it from sc
+   haval_8way_out( sc, dst );
+}
--- a/algo/haval/haval-hash-4way.c
+++ b/algo/haval/haval-hash-4way.c
@@ -40,7 +40,7 @@
 #include <string.h>
 #include "haval-hash-4way.h"

-// won't compile with sse4.2
+// won't compile with sse4.2, not a problem, it's only used with AVX2 4 way.
 //#if defined (__SSE4_2__)
 #if defined(__AVX__)

@@ -518,6 +518,301 @@ do { \

 #define INMSG(i)   msg[i]

+#if defined(__AVX2__)
+
+// Haval-256 8 way 32 bit avx2
+
+#define F1_8W(x6, x5, x4, x3, x2, x1, x0) /* x0 ^ ((x0 ^ x4) & x1) ^ (x2 & x5) ^ (x3 & x6) */ \
+   _mm256_xor_si256( x0, \
+       _mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
+                      _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
+                                     _mm256_and_si256( x3, x6 ) ) ) )
+
+#define F2_8W(x6, x5, x4, x3, x2, x1, x0) /* (x2 & ((x1 & ~x3) ^ (x4 & x5) ^ x6 ^ x0)) ^ (x4 & (x1 ^ x5)) ^ (x3 & x5) ^ x0 */ \
+   _mm256_xor_si256( \
+      _mm256_and_si256( x2, \
+         _mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
+                        _mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
+                                       _mm256_xor_si256( x6, x0 ) ) ) ), \
+         _mm256_xor_si256( \
+             _mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
+             _mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) )
+
+#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
+  _mm256_xor_si256( \
+    _mm256_and_si256( x3, \
+      _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
+                     _mm256_xor_si256( x6, x0 ) ) ), \
+      _mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
+                                   _mm256_and_si256( x2, x5 ) ), x0 ) )
+
+#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
+  _mm256_xor_si256( \
+     _mm256_xor_si256( \
+        _mm256_and_si256( x3, \
+           _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
+                                         _mm256_or_si256( x4, x6 ) ), x5 ) ), \
+        _mm256_and_si256( x4, \
+           _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
+                          _mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
+     _mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
+
+
+#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
+   _mm256_xor_si256( \
+       _mm256_and_si256( x0, \
+            mm256_not( _mm256_xor_si256( \
+                    _mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
+      _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
+                                    _mm256_and_si256( x2, x5 ) ), \
+                                    _mm256_and_si256( x3, x6 ) ) )
+
+#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F1_8W(x1, x0, x3, x5, x6, x2, x4)
+#define FP3_2_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F2_8W(x4, x2, x1, x0, x5, x3, x6)
+#define FP3_3_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F3_8W(x6, x1, x2, x3, x4, x5, x0)
+
+#define FP4_1_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F1_8W(x2, x6, x1, x4, x5, x3, x0)
+#define FP4_2_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F2_8W(x3, x5, x2, x0, x1, x6, x4)
+#define FP4_3_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F3_8W(x1, x4, x3, x6, x0, x2, x5)
+#define FP4_4_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F4_8W(x6, x4, x0, x5, x2, x1, x3)
+
+#define FP5_1_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F1_8W(x3, x4, x1, x0, x5, x2, x6)
+#define FP5_2_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F2_8W(x6, x2, x1, x0, x3, x4, x5)
+#define FP5_3_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F3_8W(x2, x6, x0, x4, x3, x1, x5)
+#define FP5_4_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F4_8W(x1, x5, x3, x2, x0, x4, x6)
+#define FP5_5_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F5_8W(x2, x5, x0, x6, x4, x3, x1)
+
+#define STEP_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
+do { \
+   __m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
+   x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
+                                      mm256_ror_32( x7, 11 ) ), \
+                       _mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
+} while (0)
+
+#define PASS1_8W(n, in)   do { \
+      unsigned pass_count; \
+      for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+         STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
+            in(pass_count + 0), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
+            in(pass_count + 1), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
+            in(pass_count + 2), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
+            in(pass_count + 3), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
+            in(pass_count + 4), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
+            in(pass_count + 5), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
+            in(pass_count + 6), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
+            in(pass_count + 7), SPH_C32(0x00000000)); \
+         } \
+   } while (0)
+
+#define PASSG_8W(p, n, in)   do { \
+      unsigned pass_count; \
+      for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+         STEP_8W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
+            in(MP ## p[pass_count + 0]), \
+            RK ## p[pass_count + 0]); \
+         STEP_8W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
+            in(MP ## p[pass_count + 1]), \
+            RK ## p[pass_count + 1]); \
+         STEP_8W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
+            in(MP ## p[pass_count + 2]), \
+            RK ## p[pass_count + 2]); \
+         STEP_8W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
+            in(MP ## p[pass_count + 3]), \
+            RK ## p[pass_count + 3]); \
+         STEP_8W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
+            in(MP ## p[pass_count + 4]), \
+            RK ## p[pass_count + 4]); \
+         STEP_8W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
+            in(MP ## p[pass_count + 5]), \
+            RK ## p[pass_count + 5]); \
+         STEP_8W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
+            in(MP ## p[pass_count + 6]), \
+            RK ## p[pass_count + 6]); \
+         STEP_8W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
+            in(MP ## p[pass_count + 7]), \
+            RK ## p[pass_count + 7]); \
+         } \
+   } while (0)
+
+#define PASS2_8W(n, in)    PASSG_8W(2, n, in)
+#define PASS3_8W(n, in)    PASSG_8W(3, n, in)
+#define PASS4_8W(n, in)    PASSG_8W(4, n, in)
+#define PASS5_8W(n, in)    PASSG_8W(5, n, in)
+
+#define SAVE_STATE_8W \
+   __m256i u0, u1, u2, u3, u4, u5, u6, u7; \
+   do { \
+      u0 = s0; \
+      u1 = s1; \
+      u2 = s2; \
+      u3 = s3; \
+      u4 = s4; \
+      u5 = s5; \
+      u6 = s6; \
+      u7 = s7; \
+   } while (0)
+
+#define UPDATE_STATE_8W \
+do { \
+   s0 = _mm256_add_epi32( s0, u0 ); \
+   s1 = _mm256_add_epi32( s1, u1 ); \
+   s2 = _mm256_add_epi32( s2, u2 ); \
+   s3 = _mm256_add_epi32( s3, u3 ); \
+   s4 = _mm256_add_epi32( s4, u4 ); \
+   s5 = _mm256_add_epi32( s5, u5 ); \
+   s6 = _mm256_add_epi32( s6, u6 ); \
+   s7 = _mm256_add_epi32( s7, u7 ); \
+} while (0)
+
+#define CORE_8W5(in)  do { \
+      SAVE_STATE_8W; \
+      PASS1_8W(5, in); \
+      PASS2_8W(5, in); \
+      PASS3_8W(5, in); \
+      PASS4_8W(5, in); \
+      PASS5_8W(5, in); \
+      UPDATE_STATE_8W; \
+   } while (0)
+
+#define DSTATE_8W   __m256i s0, s1, s2, s3, s4, s5, s6, s7
+
+#define RSTATE_8W \
+do { \
+   s0 = sc->s0; \
+   s1 = sc->s1; \
+   s2 = sc->s2; \
+   s3 = sc->s3; \
+   s4 = sc->s4; \
+   s5 = sc->s5; \
+   s6 = sc->s6; \
+   s7 = sc->s7; \
+} while (0)
+
+#define WSTATE_8W \
+do { \
+   sc->s0 = s0; \
+   sc->s1 = s1; \
+   sc->s2 = s2; \
+   sc->s3 = s3; \
+   sc->s4 = s4; \
+   sc->s5 = s5; \
+   sc->s6 = s6; \
+   sc->s7 = s7; \
+} while (0)
+
+static void
+haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
+{  // Loads the eight initial 32-bit state constants into s0..s7 and clears the input counters.
+   sc->s0 = m256_const1_32( 0x243F6A88UL );
+   sc->s1 = m256_const1_32( 0x85A308D3UL );
+   sc->s2 = m256_const1_32( 0x13198A2EUL );
+   sc->s3 = m256_const1_32( 0x03707344UL );
+   sc->s4 = m256_const1_32( 0xA4093822UL );
+   sc->s5 = m256_const1_32( 0x299F31D0UL );
+   sc->s6 = m256_const1_32( 0x082EFA98UL );
+   sc->s7 = m256_const1_32( 0xEC4E6C89UL );
+   sc->olen = olen;       // read back by the close helper when it builds the trailer word
+   sc->passes = passes;   // recorded only; the 8-way helper uses the compile-time PASSES macro instead
+   sc->count_high = 0;
+   sc->count_low = 0;
+
+}
+#define IN_PREPARE_8W(indata) const __m256i *const load_ptr_8w = (indata)
+
+#define INW_8W(i)   load_ptr_8w[ i ] 
+
+static void
+haval_8way_out( haval_8way_context *sc, void *dst )
+{  // Copies the eight state vectors from the context to dst[0..7], unchanged and in order.
+   __m256i *buf = (__m256i*)dst;   // dst is written as eight __m256i, so it must be aligned for that type
+   DSTATE_8W;
+   RSTATE_8W;                      // s0..s7 <- sc
+
+   buf[0] = s0;
+   buf[1] = s1;
+   buf[2] = s2;
+   buf[3] = s3;
+   buf[4] = s4;
+   buf[5] = s5;
+   buf[6] = s6;
+   buf[7] = s7;
+}
+
+#undef PASSES
+#define PASSES   5
+#include "haval-8way-helper.c"
+
+#define API_8W(xxx, y) /* emits the public haval<xxx>_<y>_8way_{init,update,close} wrappers around the static helpers */ \
+void \
+haval ## xxx ## _ ## y ## _8way_init(void *cc) \
+{ \
+   haval_8way_init(cc, xxx >> 5, y); /* xxx >> 5: output length in bits divided by 32 */ \
+} \
+ \
+void \
+haval ## xxx ## _ ## y ## _8way_update (void *cc, const void *data, size_t len) \
+{ \
+   haval ## y ## _8way_update(cc, data, len); \
+} \
+ \
+void \
+haval ## xxx ## _ ## y ## _8way_close(void *cc, void *dst) \
+{ \
+   haval ## y ## _8way_close(cc, dst); \
+}
+
+API_8W(256, 5)
+
+#define RVAL_8W \
+do { \
+   s0 = val[0]; \
+   s1 = val[1]; \
+   s2 = val[2]; \
+   s3 = val[3]; \
+   s4 = val[4]; \
+   s5 = val[5]; \
+   s6 = val[6]; \
+   s7 = val[7]; \
+} while (0)
+
+#define WVAL_8W \
+do { \
+   val[0] = s0; \
+   val[1] = s1; \
+   val[2] = s2; \
+   val[3] = s3; \
+   val[4] = s4; \
+   val[5] = s5; \
+   val[6] = s6; \
+   val[7] = s7; \
+} while (0)
+
+#define INMSG_8W(i)   msg[i]
+
+
+
+#endif // AVX2
+
 #ifdef __cplusplus
 }
 #endif	
--- a/algo/haval/haval-hash-4way.h
+++ b/algo/haval/haval-hash-4way.h
@@ -59,7 +59,7 @@
 */

 #ifndef HAVAL_HASH_4WAY_H__
-#define HAVAL_HASH_4WAY_H__
+#define HAVAL_HASH_4WAY_H__ 1

 #if defined(__AVX__)

@@ -84,10 +84,30 @@ typedef haval_4way_context haval256_5_4way_context;

 void haval256_5_4way_init( void *cc );

-void haval256_5_4way( void *cc, const void *data, size_t len );
+void haval256_5_4way_update( void *cc, const void *data, size_t len );
+#define haval256_5_4way haval256_5_4way_update

 void haval256_5_4way_close( void *cc, void *dst );

+#if defined(__AVX2__)
+
+typedef struct {   // state for the 8-way (AVX2, __m256i) HAVAL implementation
+   __m256i buf[32];                          // pending input block; the helper indexes it as (offset >> 2) and compresses at offset 128
+   __m256i s0, s1, s2, s3, s4, s5, s6, s7;   // chaining state
+   unsigned olen, passes;                    // values given to init (output length parameter, pass count)
+   uint32_t count_high, count_low;           // 64-bit running input count split into two 32-bit words
+} haval_8way_context __attribute__ ((aligned (64)));
+
+typedef haval_8way_context haval256_5_8way_context;
+
+void haval256_5_8way_init( void *cc );
+
+void haval256_5_8way_update( void *cc, const void *data, size_t len );
+
+void haval256_5_8way_close( void *cc, void *dst );
+
+#endif // AVX2
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -92,6 +92,38 @@ extern "C"{

 #endif

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define Sb_8W(x0, x1, x2, x3, c) \
+do { \
+   __m512i cc = _mm512_set1_epi64( c ); \
+    x3 = mm512_not( x3 ); \
+    x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \
+    tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \
+    x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \
+    x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \
+    x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \
+    x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \
+    x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \
+    x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \
+    x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \
+    x2 = _mm512_xor_si512( x2, tmp ); \
+} while (0)
+
+#define Lb_8W(x0, x1, x2, x3, x4, x5, x6, x7) \
+do { \
+    x4 = _mm512_xor_si512( x4, x1 ); \
+    x5 = _mm512_xor_si512( x5, x2 ); \
+    x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \
+    x7 = _mm512_xor_si512( x7, x0 ); \
+    x0 = _mm512_xor_si512( x0, x5 ); \
+    x1 = _mm512_xor_si512( x1, x6 ); \
+    x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \
+    x3 = _mm512_xor_si512( x3, x4 ); \
+} while (0)
+
+#endif
+
 #define Sb(x0, x1, x2, x3, c) \
 do { \
   __m256i cc = _mm256_set1_epi64x( c ); \
@@ -226,6 +258,48 @@ static const sph_u64 C[] = {
 			x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
 	} while (0)

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define S_8W(x0, x1, x2, x3, cb, r)   do { \
+      Sb_8W(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
+      Sb_8W(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
+   } while (0)
+
+#define L_8W(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+      Lb_8W(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
+         x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
+      Lb_8W(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
+         x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
+   } while (0)
+
+#define Wz_8W(x, c, n) /* per 64-bit element of x##h and x##l: ((v & c) << n) | ((v >> n) & c) */ \
+do { \
+   __m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \
+   x ## h = _mm512_or_si512( _mm512_and_si512( \
+                                _mm512_srli_epi64(x ## h, (n)), (c)), t ); \
+   t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \
+   /* same logical-shift intrinsic as the high half, not the compiler-specific vector operator >> */ \
+   x ## l = _mm512_or_si512( _mm512_and_si512( _mm512_srli_epi64(x ## l, (n)), (c)), t ); \
+} while (0)
+
+#define W80(x)   Wz_8W(x, m512_const1_64( 0x5555555555555555 ),  1 )
+#define W81(x)   Wz_8W(x, m512_const1_64( 0x3333333333333333 ),  2 )
+#define W82(x)   Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ),  4 )
+#define W83(x)   Wz_8W(x, m512_const1_64( 0x00FF00FF00FF00FF ),  8 ) 
+#define W84(x)   Wz_8W(x, m512_const1_64( 0x0000FFFF0000FFFF ), 16 )
+#define W85(x)   Wz_8W(x, m512_const1_64( 0x00000000FFFFFFFF ), 32 )
+#define W86(x) \
+do { \
+   __m512i t = x ## h; \
+   x ## h = x ## l; \
+   x ## l = t; \
+} while (0)
+
+#define DECL_STATE_8W \
+   __m512i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
+   __m512i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
+   __m512i tmp;
+
+#endif

 #define Wz(x, c, n) \
 do { \
@@ -236,16 +310,6 @@ do { \
   x ## l = _mm256_or_si256( _mm256_and_si256((x ## l >> (n)), (c)), t ); \
 } while (0)

-
-/*
-#define Wz(x, c, n)   do { \
-		sph_u64 t = (x ## h & (c)) << (n); \
-		x ## h = ((x ## h >> (n)) & (c)) | t; \
-		t = (x ## l & (c)) << (n); \
-		x ## l = ((x ## l >> (n)) & (c)) | t; \
-	} while (0)
-*/
-
 #define W0(x)   Wz(x, m256_const1_64( 0x5555555555555555 ),  1 )
 #define W1(x)   Wz(x, m256_const1_64( 0x3333333333333333 ),  2 )
 #define W2(x)   Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ),  4 )
@@ -259,25 +323,12 @@ do { \
   x ## l = t; \
 } while (0)

-/*
-#define W0(x)   Wz(x, SPH_C64(0x5555555555555555),  1)
-#define W1(x)   Wz(x, SPH_C64(0x3333333333333333),  2)
-#define W2(x)   Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F),  4)
-#define W3(x)   Wz(x, SPH_C64(0x00FF00FF00FF00FF),  8)
-#define W4(x)   Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
-#define W5(x)   Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
-#define W6(x)   do { \
-		sph_u64 t = x ## h; \
-		x ## h = x ## l; \
-		x ## l = t; \
-	} while (0)
-*/
-
 #define DECL_STATE \
 	__m256i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
 	__m256i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
 	__m256i tmp;

+
 #define READ_STATE(state)   do { \
 		h0h = (state)->H[ 0]; \
 		h0l = (state)->H[ 1]; \
@@ -316,6 +367,38 @@ do { \
 		(state)->H[15] = h7l; \
 	} while (0)

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define INPUT_BUF1_8W \
+   __m512i m0h = buf[0]; \
+   __m512i m0l = buf[1]; \
+   __m512i m1h = buf[2]; \
+   __m512i m1l = buf[3]; \
+   __m512i m2h = buf[4]; \
+   __m512i m2l = buf[5]; \
+   __m512i m3h = buf[6]; \
+   __m512i m3l = buf[7]; \
+   h0h = _mm512_xor_si512( h0h, m0h ); \
+   h0l = _mm512_xor_si512( h0l, m0l ); \
+   h1h = _mm512_xor_si512( h1h, m1h ); \
+   h1l = _mm512_xor_si512( h1l, m1l ); \
+   h2h = _mm512_xor_si512( h2h, m2h ); \
+   h2l = _mm512_xor_si512( h2l, m2l ); \
+   h3h = _mm512_xor_si512( h3h, m3h ); \
+   h3l = _mm512_xor_si512( h3l, m3l ); \
+
+#define INPUT_BUF2_8W \
+   h4h = _mm512_xor_si512( h4h, m0h ); \
+   h4l = _mm512_xor_si512( h4l, m0l ); \
+   h5h = _mm512_xor_si512( h5h, m1h ); \
+   h5l = _mm512_xor_si512( h5l, m1l ); \
+   h6h = _mm512_xor_si512( h6h, m2h ); \
+   h6l = _mm512_xor_si512( h6l, m2l ); \
+   h7h = _mm512_xor_si512( h7h, m3h ); \
+   h7l = _mm512_xor_si512( h7l, m3l ); \
+
+#endif
+
 #define INPUT_BUF1 \
 	__m256i m0h = buf[0]; \
 	__m256i m0l = buf[1]; \
@@ -344,6 +427,7 @@ do { \
   h7h = _mm256_xor_si256( h7h, m3h ); \
   h7l = _mm256_xor_si256( h7l, m3l ); \

+
 static const sph_u64 IV256[] = {
 	C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
 	C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
@@ -370,6 +454,22 @@ static const sph_u64 IV512[] = {
 #else


+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define SL_8W(ro)   SLu_8W(r + ro, ro)
+
+#define SLu_8W(r, ro)   do { \
+      S_8W(h0, h2, h4, h6, Ceven_, r); \
+      S_8W(h1, h3, h5, h7, Codd_, r); \
+      L_8W(h0, h2, h4, h6, h1, h3, h5, h7); \
+      W8 ## ro(h1); \
+      W8 ## ro(h3); \
+      W8 ## ro(h5); \
+      W8 ## ro(h7); \
+   } while (0)
+
 #endif

 #define SL(ro)   SLu(r + ro, ro)
@@ -393,6 +493,23 @@ static const sph_u64 IV512[] = {
 * loop.
 */

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define E8_8W   do { \
+      unsigned r; \
+      for (r = 0; r < 42; r += 7) { \
+         SL_8W(0); \
+         SL_8W(1); \
+         SL_8W(2); \
+         SL_8W(3); \
+         SL_8W(4); \
+         SL_8W(5); \
+         SL_8W(6); \
+      } \
+   } while (0)
+
+#endif
+
 #define E8   do { \
 		unsigned r; \
 		for (r = 0; r < 42; r += 7) { \
@@ -419,51 +536,100 @@ static const sph_u64 IV512[] = {
 * On a "true 64-bit" architecture, we can unroll at will.
 */

-#define E8   do { \
-		SLu( 0, 0); \
-		SLu( 1, 1); \
-		SLu( 2, 2); \
-		SLu( 3, 3); \
-		SLu( 4, 4); \
-		SLu( 5, 5); \
-		SLu( 6, 6); \
-		SLu( 7, 0); \
-		SLu( 8, 1); \
-		SLu( 9, 2); \
-		SLu(10, 3); \
-		SLu(11, 4); \
-		SLu(12, 5); \
-		SLu(13, 6); \
-		SLu(14, 0); \
-		SLu(15, 1); \
-		SLu(16, 2); \
-		SLu(17, 3); \
-		SLu(18, 4); \
-		SLu(19, 5); \
-		SLu(20, 6); \
-		SLu(21, 0); \
-		SLu(22, 1); \
-		SLu(23, 2); \
-		SLu(24, 3); \
-		SLu(25, 4); \
-		SLu(26, 5); \
-		SLu(27, 6); \
-		SLu(28, 0); \
-		SLu(29, 1); \
-		SLu(30, 2); \
-		SLu(31, 3); \
-		SLu(32, 4); \
-		SLu(33, 5); \
-		SLu(34, 6); \
-		SLu(35, 0); \
-		SLu(36, 1); \
-		SLu(37, 2); \
-		SLu(38, 3); \
-		SLu(39, 4); \
-		SLu(40, 5); \
-		SLu(41, 6); \
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define E8_8W   do { \
+		SLu_8W( 0, 0); \
+		SLu_8W( 1, 1); \
+		SLu_8W( 2, 2); \
+		SLu_8W( 3, 3); \
+		SLu_8W( 4, 4); \
+		SLu_8W( 5, 5); \
+		SLu_8W( 6, 6); \
+		SLu_8W( 7, 0); \
+		SLu_8W( 8, 1); \
+		SLu_8W( 9, 2); \
+		SLu_8W(10, 3); \
+		SLu_8W(11, 4); \
+		SLu_8W(12, 5); \
+		SLu_8W(13, 6); \
+		SLu_8W(14, 0); \
+		SLu_8W(15, 1); \
+		SLu_8W(16, 2); \
+		SLu_8W(17, 3); \
+		SLu_8W(18, 4); \
+		SLu_8W(19, 5); \
+		SLu_8W(20, 6); \
+		SLu_8W(21, 0); \
+		SLu_8W(22, 1); \
+		SLu_8W(23, 2); \
+		SLu_8W(24, 3); \
+		SLu_8W(25, 4); \
+		SLu_8W(26, 5); \
+		SLu_8W(27, 6); \
+		SLu_8W(28, 0); \
+		SLu_8W(29, 1); \
+		SLu_8W(30, 2); \
+		SLu_8W(31, 3); \
+		SLu_8W(32, 4); \
+		SLu_8W(33, 5); \
+		SLu_8W(34, 6); \
+		SLu_8W(35, 0); \
+		SLu_8W(36, 1); \
+		SLu_8W(37, 2); \
+		SLu_8W(38, 3); \
+		SLu_8W(39, 4); \
+		SLu_8W(40, 5); \
+		SLu_8W(41, 6); \
 	} while (0)

+#endif  // AVX512
+
+#define E8   do { \
+      SLu( 0, 0); \
+      SLu( 1, 1); \
+      SLu( 2, 2); \
+      SLu( 3, 3); \
+      SLu( 4, 4); \
+      SLu( 5, 5); \
+      SLu( 6, 6); \
+      SLu( 7, 0); \
+      SLu( 8, 1); \
+      SLu( 9, 2); \
+      SLu(10, 3); \
+      SLu(11, 4); \
+      SLu(12, 5); \
+      SLu(13, 6); \
+      SLu(14, 0); \
+      SLu(15, 1); \
+      SLu(16, 2); \
+      SLu(17, 3); \
+      SLu(18, 4); \
+      SLu(19, 5); \
+      SLu(20, 6); \
+      SLu(21, 0); \
+      SLu(22, 1); \
+      SLu(23, 2); \
+      SLu(24, 3); \
+      SLu(25, 4); \
+      SLu(26, 5); \
+      SLu(27, 6); \
+      SLu(28, 0); \
+      SLu(29, 1); \
+      SLu(30, 2); \
+      SLu(31, 3); \
+      SLu(32, 4); \
+      SLu(33, 5); \
+      SLu(34, 6); \
+      SLu(35, 0); \
+      SLu(36, 1); \
+      SLu(37, 2); \
+      SLu(38, 3); \
+      SLu(39, 4); \
+      SLu(40, 5); \
+      SLu(41, 6); \
+   } while (0)
+
 #else


@@ -471,6 +637,158 @@ static const sph_u64 IV512[] = {

 #endif

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+void jh256_8way_init( jh_8way_context *sc )
+{
+    // bswapped IV256
+    sc->H[ 0] = m512_const1_64( 0xebd3202c41a398eb );
+    sc->H[ 1] = m512_const1_64( 0xc145b29c7bbecd92 );
+    sc->H[ 2] = m512_const1_64( 0xfac7d4609151931c );
+    sc->H[ 3] = m512_const1_64( 0x038a507ed6820026 );
+    sc->H[ 4] = m512_const1_64( 0x45b92677269e23a4 );
+    sc->H[ 5] = m512_const1_64( 0x77941ad4481afbe0 );
+    sc->H[ 6] = m512_const1_64( 0x7a176b0226abb5cd );
+    sc->H[ 7] = m512_const1_64( 0xa82fff0f4224f056 );
+    sc->H[ 8] = m512_const1_64( 0x754d2e7f8996a371 );
+    sc->H[ 9] = m512_const1_64( 0x62e27df70849141d );
+    sc->H[10] = m512_const1_64( 0x948f2476f7957627 );
+    sc->H[11] = m512_const1_64( 0x6c29804757b6d587 );
+    sc->H[12] = m512_const1_64( 0x6c0d8eac2d275e5c );
+    sc->H[13] = m512_const1_64( 0x0f7a0557c6508451 );
+    sc->H[14] = m512_const1_64( 0xea12247067d3e47b );
+    sc->H[15] = m512_const1_64( 0x69d71cd313abe389 );
+    sc->ptr = 0;
+    sc->block_count = 0;
+}
+
+void jh512_8way_init( jh_8way_context *sc )
+{
+    // bswapped IV512
+    sc->H[ 0] = m512_const1_64( 0x17aa003e964bd16f );
+    sc->H[ 1] = m512_const1_64( 0x43d5157a052e6a63 );
+    sc->H[ 2] = m512_const1_64( 0x0bef970c8d5e228a );
+    sc->H[ 3] = m512_const1_64( 0x61c3b3f2591234e9 );
+    sc->H[ 4] = m512_const1_64( 0x1e806f53c1a01d89 );
+    sc->H[ 5] = m512_const1_64( 0x806d2bea6b05a92a );
+    sc->H[ 6] = m512_const1_64( 0xa6ba7520dbcc8e58 );
+    sc->H[ 7] = m512_const1_64( 0xf73bf8ba763a0fa9 );
+    sc->H[ 8] = m512_const1_64( 0x694ae34105e66901 );
+    sc->H[ 9] = m512_const1_64( 0x5ae66f2e8e8ab546 );
+    sc->H[10] = m512_const1_64( 0x243c84c1d0a74710 );
+    sc->H[11] = m512_const1_64( 0x99c15a2db1716e3b );
+    sc->H[12] = m512_const1_64( 0x56f8b19decf657cf );
+    sc->H[13] = m512_const1_64( 0x56b116577c8806a7 );
+    sc->H[14] = m512_const1_64( 0xfb1785e6dffcc2e3 );
+    sc->H[15] = m512_const1_64( 0x4bdd8ccc78465a54 );
+    sc->ptr = 0;
+    sc->block_count = 0;
+}
+
+static void
+jh_8way_core( jh_8way_context *sc, const void *data, size_t len )
+{  // Buffers input in sc->buf and applies INPUT_BUF1_8W / E8_8W / INPUT_BUF2_8W to each full block.
+    __m512i *buf;
+    __m512i *vdata = (__m512i*)data;
+   const int buf_size = 64;   // block size in the units of ptr/len; (ptr>>3) indexes sc->buf, so 64 == 8 * __m512i
+   size_t ptr;
+   DECL_STATE_8W   // declares h0h..h7l and tmp; the macro supplies its own semicolons
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+
+   if ( len < (buf_size - ptr) )   // input does not complete a block: just buffer it
+   {
+       memcpy_512( buf + (ptr>>3), vdata, len>>3 );
+       ptr += len;
+       sc->ptr = ptr;
+       return;
+   }
+
+   READ_STATE(sc);   // h* <- sc->H[0..15]
+   while ( len > 0 )
+   {
+       size_t clen;
+       clen = buf_size - ptr;   // room left in the block, capped at the remaining input
+       if ( clen > len )
+          clen = len;
+
+       memcpy_512( buf + (ptr>>3), vdata, clen>>3 );   // lengths are divided by 8 to count __m512i
+       ptr += clen;
+       vdata += (clen>>3);
+       len -= clen;
+       if ( ptr == buf_size )   // block complete
+       {
+          INPUT_BUF1_8W;        // XOR buf[0..7] into h0..h3
+          E8_8W;
+          INPUT_BUF2_8W;        // XOR the same message words into h4..h7
+          sc->block_count ++;
+          ptr = 0;
+       }
+   }
+   WRITE_STATE(sc);   // sc->H[0..15] <- h*
+   sc->ptr = ptr;
+}
+
+static void
+jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
+               size_t out_size_w32, const void *iv )
+{  // Feeds padding plus the 128-bit length through jh_8way_core, then copies H[8..15] to dst. ub, n, out_size_w32 and iv are not used.
+   __m512i buf[16*4];   // NOTE(review): only indices 0..14 appear to be written below (assuming sc->ptr < 64) - likely oversized
+   __m512i *dst512 = (__m512i*)dst;
+   size_t numz, u;
+   sph_u64 l0, l1, l0e, l1e;
+
+   buf[0] = m512_const1_64( 0x80ULL );   // first padding word
+
+   if ( sc->ptr == 0 )
+       numz = 48;               // ptr + numz + 16 == 64: padding and length fill one block
+   else
+       numz = 112 - sc->ptr;    // ptr + numz + 16 == 128: padding and length end on the second block boundary
+
+   memset_zero_512( buf+1, (numz>>3) - 1 );
+
+   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);   // low 64 bits of the length: 512 per block plus 8 per buffered unit
+   l1 = SPH_T64(sc->block_count >> 55);                   // bits shifted out of l0 by the << 9
+   sph_enc64be( &l0e, l0 );
+   sph_enc64be( &l1e, l1 );
+   *(buf + (numz>>3)    ) = _mm512_set1_epi64( l1e );     // high length word first
+   *(buf + (numz>>3) + 1) = _mm512_set1_epi64( l0e );
+
+   jh_8way_core( sc, buf, numz + 16 );
+
+   for ( u=0; u < 8; u++ )
+       buf[u] = sc->H[u+8];   // output is always the upper eight state vectors
+
+    memcpy_512( dst512, buf, 8 );
+}
+
+void
+jh256_8way_update(void *cc, const void *data, size_t len)
+{
+   jh_8way_core(cc, data, len);
+}
+
+void
+jh256_8way_close(void *cc, void *dst)
+{  // Finalizes the 8-way JH-256 context cc and writes the result to dst.
+   jh_8way_close(cc, 0, 0, dst, 8, IV256);   // the 8 and IV256 arguments are ignored by jh_8way_close, which always stores H[8..15] (eight __m512i)
+}
+
+void
+jh512_8way_update(void *cc, const void *data, size_t len)
+{
+   jh_8way_core(cc, data, len);
+}
+
+void
+jh512_8way_close(void *cc, void *dst)
+{  // Finalizes the 8-way JH-512 context cc and writes the result to dst.
+   jh_8way_close(cc, 0, 0, dst, 16, IV512);   // the 16 and IV512 arguments are ignored by jh_8way_close, which always stores H[8..15] (eight __m512i)
+}
+
+#endif
+
 void jh256_4way_init( jh_4way_context *sc )
 {
    // bswapped IV256
@@ -595,16 +913,8 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
    memcpy_256( dst256, buf, 8 );
 }

-/*
 void
-jh256_4way_init(void *cc)
-{
-	jhs_4way_init(cc, IV256);
-}
-*/
-
-void
-jh256_4way(void *cc, const void *data, size_t len)
+jh256_4way_update(void *cc, const void *data, size_t len)
 {
 	jh_4way_core(cc, data, len);
 }
@@ -615,16 +925,8 @@ jh256_4way_close(void *cc, void *dst)
 	jh_4way_close(cc, 0, 0, dst, 8, IV256);
 }

-/*
 void
-jh512_4way_init(void *cc)
-{
-	jhb_4way_init(cc, IV512);
-}
-*/
-
-void
-jh512_4way(void *cc, const void *data, size_t len)
+jh512_4way_update(void *cc, const void *data, size_t len)
 {
 	jh_4way_core(cc, data, len);
 }
@@ -635,6 +937,7 @@ jh512_4way_close(void *cc, void *dst)
 	jh_4way_close(cc, 0, 0, dst, 16, IV512);
 }

+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -60,20 +60,41 @@ extern "C"{
 * can be cloned by copying the context (e.g. with a simple
 * <code>memcpy()</code>).
 */
+
+ 
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 typedef struct {
-    __m256i buf[8] __attribute__ ((aligned (64)));
+    __m512i buf[8];
+    __m512i H[16];
+    size_t ptr;
+    uint64_t block_count;
+} jh_8way_context __attribute__ ((aligned (128)));
+
+typedef jh_8way_context jh256_8way_context;
+
+typedef jh_8way_context jh512_8way_context;
+
+void jh256_8way_init( jh_8way_context *sc);
+
+void jh256_8way_update(void *cc, const void *data, size_t len);
+
+void jh256_8way_close(void *cc, void *dst);
+
+void jh512_8way_init( jh_8way_context *sc );
+
+void jh512_8way_update(void *cc, const void *data, size_t len);
+
+void jh512_8way_close(void *cc, void *dst);
+
+#endif
+
+typedef struct {
+    __m256i buf[8];
    __m256i H[16];
    size_t ptr;
    uint64_t block_count;
-/*
-	unsigned char buf[64]; 
-	size_t ptr;
-	union {
-		sph_u64 wide[16];
-	} H;
-	sph_u64 block_count;
-*/
-} jh_4way_context;
+} jh_4way_context __attribute__ ((aligned (128)));

 typedef jh_4way_context jh256_4way_context;

@@ -81,13 +102,15 @@ typedef jh_4way_context jh512_4way_context;

 void jh256_4way_init( jh_4way_context *sc);

-void jh256_4way(void *cc, const void *data, size_t len);
+void jh256_4way_update(void *cc, const void *data, size_t len);
+#define jh256_4way jh256_4way_update

 void jh256_4way_close(void *cc, void *dst);

 void jh512_4way_init( jh_4way_context *sc );

-void jh512_4way(void *cc, const void *data, size_t len);
+void jh512_4way_update(void *cc, const void *data, size_t len);
+#define jh512_4way jh512_4way_update

 void jh512_4way_close(void *cc, void *dst);

@@ -95,6 +118,6 @@ void jh512_4way_close(void *cc, void *dst);
 }
 #endif

-#endif
+#endif // AVX2

 #endif
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -1,16 +1,578 @@
 #include <string.h>
 #include <immintrin.h>
 #include "luffa-hash-2way.h"
+#include <stdio.h>

 #if defined(__AVX2__)

 #include "simd-utils.h"

+/* initial values of chaining variables */
+static const uint32 IV[40] __attribute((aligned(64))) = {
+    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
+    0xdef610bb,0xee058139,0x90152df4,0x6e292011,
+    0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
+    0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
+    0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
+    0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
+    0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
+    0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
+    0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
+    0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
+};
+
+/* Round Constants */
+static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
+    0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
+    0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
+    0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
+    0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
+    0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
+    0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
+    0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
+    0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
+    0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
+    0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
+    0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
+    0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
+    0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
+    0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
+    0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
+    0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
+    0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
+    0x00000000,0x00000000,0x00000000,0x5090d577,
+    0x00000000,0x00000000,0x00000000,0xac11d7fa,
+    0x00000000,0x00000000,0x00000000,0x2d1925ab,
+    0x00000000,0x00000000,0x00000000,0x1bcb66f2,
+    0x00000000,0x00000000,0x00000000,0xb46496ac,
+    0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
+    0x00000000,0x00000000,0x00000000,0xd1925ab0,
+    0x00000000,0x00000000,0x00000000,0x78602649,
+    0x00000000,0x00000000,0x00000000,0x29131ab6,
+    0x00000000,0x00000000,0x00000000,0x8edae952,
+    0x00000000,0x00000000,0x00000000,0x0fc053c3,
+    0x00000000,0x00000000,0x00000000,0x3b6ba548,
+    0x00000000,0x00000000,0x00000000,0x3f014f0c,
+    0x00000000,0x00000000,0x00000000,0xedae9520,
+    0x00000000,0x00000000,0x00000000,0xfc053c31
+};
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define cns4w(i)  m512_const1_128( ( (__m128i*)CNS_INIT)[i] )   // i-th 128-bit group of CNS_INIT; presumably broadcast to every lane -- confirm m512_const1_128
+// XOR the constants c0 and c1 into a and b respectively; both a and b are assigned in place.
+#define ADD_CONSTANT4W(a,b,c0,c1)\
+    a = _mm512_xor_si512(a,c0);\
+    b = _mm512_xor_si512(b,c1);
+// In-place update of the pair (a0,a1) built from byte-shifted copies of a0, a1 and mask; "MULT2" presumably means multiply-by-2 in Luffa's message injection -- confirm.
+#define MULT24W( a0, a1, mask ) \
+do { \
+  __m512i b = _mm512_xor_si512( a0, \
+                   _mm512_shuffle_epi32( _mm512_and_si512(a1,mask), 16 ) ); \
+  a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\
+  a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\
+} while(0)
+
+// One step over the eight vectors at x: SUBCRUMB4W on x[0..3] and on x[5],x[6],x[7],x[4], MIXWORD4W on each pair
+// (x[i],x[i+4]), then c0/c1 are XORed into x[0]/x[4]. t points at two scratch vectors. TODO: use array indexes.
+#define STEP_PART4W(x,c0,c1,t)\
+    SUBCRUMB4W(*x,*(x+1),*(x+2),*(x+3),*t);\
+    SUBCRUMB4W(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
+    MIXWORD4W(*x,*(x+4),*t,*(t+1));\
+    MIXWORD4W(*(x+1),*(x+5),*t,*(t+1));\
+    MIXWORD4W(*(x+2),*(x+6),*t,*(t+1));\
+    MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
+    ADD_CONSTANT4W(*x, *(x+4), c0, c1);
+// Bitwise substitution over a0..a3, all updated in place ("SubCrumb" by name); t is scratch. a0 and t must be addressable lvalues.
+#define SUBCRUMB4W(a0,a1,a2,a3,t)\
+    t  = _mm512_load_si512(&a0);\
+    a0 = _mm512_or_si512(a0,a1);\
+    a2 = _mm512_xor_si512(a2,a3);\
+    a1 = _mm512_andnot_si512(a1, m512_neg1 ); /* a1 = ~a1, assuming m512_neg1 is all ones */\
+    a0 = _mm512_xor_si512(a0,a3);\
+    a3 = _mm512_and_si512(a3,t);\
+    a1 = _mm512_xor_si512(a1,a3);\
+    a3 = _mm512_xor_si512(a3,a2);\
+    a2 = _mm512_and_si512(a2,a0);\
+    a0 = _mm512_andnot_si512(a0, m512_neg1 ); /* a0 = ~a0, same assumption */\
+    a2 = _mm512_xor_si512(a2,a1);\
+    a1 = _mm512_or_si512(a1,a3);\
+    t  = _mm512_xor_si512(t,a1);\
+    a3 = _mm512_xor_si512(a3,a2);\
+    a2 = _mm512_and_si512(a2,a1);\
+    a1 = _mm512_xor_si512(a1,a0);\
+    a0 = _mm512_load_si512(&t);
+// Mixes a and b in place with XORs and 32-bit left rotations (each built from a shift pair); t1 and t2 are scratch vectors.
+#define MIXWORD4W(a,b,t1,t2)\
+    b  = _mm512_xor_si512(a,b);\
+    t1 = _mm512_slli_epi32(a,2);\
+    t2 = _mm512_srli_epi32(a,30);\
+     a = _mm512_or_si512(t1,t2); /* a rotated left by 2 */\
+    a  = _mm512_xor_si512(a,b);\
+    t1 = _mm512_slli_epi32(b,14);\
+    t2 = _mm512_srli_epi32(b,18);\
+    b  = _mm512_or_si512(t1,t2); /* b rotated left by 14 */\
+    b  = _mm512_xor_si512(a,b);\
+    t1 = _mm512_slli_epi32(a,10);\
+    t2 = _mm512_srli_epi32(a,22);\
+    a  = _mm512_or_si512(t1,t2); /* a rotated left by 10 */\
+    a  = _mm512_xor_si512(a,b);\
+    t1 = _mm512_slli_epi32(b,1);\
+    t2 = _mm512_srli_epi32(b,31);\
+    b  = _mm512_or_si512(t1,t2); /* b rotated left by 1 */
+// Step for the single pair (a0,a1): shuffle/unpack rearrangement, SUBCRUMB4W, regrouping, MIXWORD4W, then XOR of c0/c1; t0, t1, tmp0, tmp1 are scratch.
+#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
+    a1 = _mm512_shuffle_epi32(a1,147);\
+    t0 = _mm512_load_si512(&a1);\
+    a1 = _mm512_unpacklo_epi32(a1,a0);\
+    t0 = _mm512_unpackhi_epi32(t0,a0);\
+    t1 = _mm512_shuffle_epi32(t0,78);\
+    a0 = _mm512_shuffle_epi32(a1,78);\
+    SUBCRUMB4W(t1,t0,a0,a1,tmp0);\
+    t0 = _mm512_unpacklo_epi32(t0,t1);\
+    a1 = _mm512_unpacklo_epi32(a1,a0);\
+    a0 = _mm512_load_si512(&a1);\
+    a0 = _mm512_unpackhi_epi64(a0,t0);\
+    a1 = _mm512_unpacklo_epi64(a1,t0);\
+    a1 = _mm512_shuffle_epi32(a1,57);\
+    MIXWORD4W(a0,a1,tmp0,tmp1);\
+    ADD_CONSTANT4W(a0,a1,c0,c1);
+// Rearranges 32-bit words of (r0..r2) into (s0..s3) and, with the same sequence, (p0..p2) into (q0..q3); the r*/p* inputs are overwritten.
+#define NMLTOM7684W(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
+    s2 = _mm512_load_si512(&r1);\
+    q2 = _mm512_load_si512(&p1);\
+    r2 = _mm512_shuffle_epi32(r2,216);\
+    p2 = _mm512_shuffle_epi32(p2,216);\
+    r1 = _mm512_unpacklo_epi32(r1,r0);\
+    p1 = _mm512_unpacklo_epi32(p1,p0);\
+    s2 = _mm512_unpackhi_epi32(s2,r0);\
+    q2 = _mm512_unpackhi_epi32(q2,p0);\
+    s0 = _mm512_load_si512(&r2);\
+    q0 = _mm512_load_si512(&p2);\
+    r2 = _mm512_unpacklo_epi64(r2,r1);\
+    p2 = _mm512_unpacklo_epi64(p2,p1);\
+    s1 = _mm512_load_si512(&s0);\
+    q1 = _mm512_load_si512(&q0);\
+    s0 = _mm512_unpackhi_epi64(s0,r1);\
+    q0 = _mm512_unpackhi_epi64(q0,p1);\
+    r2 = _mm512_shuffle_epi32(r2,225);\
+    p2 = _mm512_shuffle_epi32(p2,225);\
+    r0 = _mm512_load_si512(&s1);\
+    p0 = _mm512_load_si512(&q1);\
+    s0 = _mm512_shuffle_epi32(s0,225);\
+    q0 = _mm512_shuffle_epi32(q0,225);\
+    s1 = _mm512_unpacklo_epi64(s1,s2);\
+    q1 = _mm512_unpacklo_epi64(q1,q2);\
+    r0 = _mm512_unpackhi_epi64(r0,s2);\
+    p0 = _mm512_unpackhi_epi64(p0,q2);\
+    s2 = _mm512_load_si512(&r0);\
+    q2 = _mm512_load_si512(&p0);\
+    s3 = _mm512_load_si512(&r2);\
+    q3 = _mm512_load_si512(&p2);
+// Rearranges 32-bit words of (r0..r3) into (s0..s2) and, with the same sequence, (p0..p3) into (q0..q2); r0, r1, r2 and p0, p1, p2 are overwritten.
+#define MIXTON7684W(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
+    s0 = _mm512_load_si512(&r0);\
+    q0 = _mm512_load_si512(&p0);\
+    s1 = _mm512_load_si512(&r2);\
+    q1 = _mm512_load_si512(&p2);\
+    r0 = _mm512_unpackhi_epi32(r0,r1);\
+    p0 = _mm512_unpackhi_epi32(p0,p1);\
+    r2 = _mm512_unpackhi_epi32(r2,r3);\
+    p2 = _mm512_unpackhi_epi32(p2,p3);\
+    s0 = _mm512_unpacklo_epi32(s0,r1);\
+    q0 = _mm512_unpacklo_epi32(q0,p1);\
+    s1 = _mm512_unpacklo_epi32(s1,r3);\
+    q1 = _mm512_unpacklo_epi32(q1,p3);\
+    r1 = _mm512_load_si512(&r0);\
+    p1 = _mm512_load_si512(&p0);\
+    r0 = _mm512_unpackhi_epi64(r0,r2);\
+    p0 = _mm512_unpackhi_epi64(p0,p2);\
+    s0 = _mm512_unpackhi_epi64(s0,s1);\
+    q0 = _mm512_unpackhi_epi64(q0,q1);\
+    r1 = _mm512_unpacklo_epi64(r1,r2);\
+    p1 = _mm512_unpacklo_epi64(p1,p2);\
+    s2 = _mm512_load_si512(&r0);\
+    q2 = _mm512_load_si512(&p0);\
+    s1 = _mm512_load_si512(&r1);\
+    q1 = _mm512_load_si512(&p1);
+// Rearranges 32-bit words of (r0..r3) into (s0..s3) and, with the same sequence, (p0..p3) into (q0..q3); r1, r3, p1 and p3 are overwritten.
+#define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
+    s1 = _mm512_load_si512(&r3);\
+    q1 = _mm512_load_si512(&p3);\
+    s3 = _mm512_load_si512(&r3);\
+    q3 = _mm512_load_si512(&p3);\
+    s1 = _mm512_unpackhi_epi32(s1,r2);\
+    q1 = _mm512_unpackhi_epi32(q1,p2);\
+    s3 = _mm512_unpacklo_epi32(s3,r2);\
+    q3 = _mm512_unpacklo_epi32(q3,p2);\
+    s0 = _mm512_load_si512(&s1);\
+    q0 = _mm512_load_si512(&q1);\
+    s2 = _mm512_load_si512(&s3);\
+    q2 = _mm512_load_si512(&q3);\
+    r3 = _mm512_load_si512(&r1);\
+    p3 = _mm512_load_si512(&p1);\
+    r1 = _mm512_unpacklo_epi32(r1,r0);\
+    p1 = _mm512_unpacklo_epi32(p1,p0);\
+    r3 = _mm512_unpackhi_epi32(r3,r0);\
+    p3 = _mm512_unpackhi_epi32(p3,p0);\
+    s0 = _mm512_unpackhi_epi64(s0,r3);\
+    q0 = _mm512_unpackhi_epi64(q0,p3);\
+    s1 = _mm512_unpacklo_epi64(s1,r3);\
+    q1 = _mm512_unpacklo_epi64(q1,p3);\
+    s2 = _mm512_unpackhi_epi64(s2,r1);\
+    q2 = _mm512_unpackhi_epi64(q2,p1);\
+    s3 = _mm512_unpacklo_epi64(s3,r1);\
+    q3 = _mm512_unpacklo_epi64(q3,p1);
+// Forwards to NMLTOM10244W with identical arguments; rnd512_4way uses it to move x[0..7] back into chainv[0..7].
+#define MIXTON10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
+    NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
+// One round on the 4-way state: mixes msg[0..1] into state->chainv[0..9], then runs 8 steps on chainv[0..7] and 8 on chainv[8..9]. msg is not modified.
+void rnd512_4way( luffa_4way_context *state, __m512i *msg )
+{
+    __m512i t0, t1;
+    __m512i *chainv = state->chainv;
+    __m512i msg0, msg1;
+    __m512i tmp[2];   // scratch vectors handed to the STEP_PART macros
+    __m512i x[8];     // rearranged copy of chainv[0..7] used by the step loop
+    const __m512i MASK = m512_const2_64( 0, 0x00000000ffffffff );
+
+    t0 = chainv[0];   // t0 / t1 accumulate the XOR of the even / odd chainv entries
+    t1 = chainv[1];
+
+    t0 = _mm512_xor_si512( t0, chainv[2] );
+    t1 = _mm512_xor_si512( t1, chainv[3] );
+    t0 = _mm512_xor_si512( t0, chainv[4] );
+    t1 = _mm512_xor_si512( t1, chainv[5] );
+    t0 = _mm512_xor_si512( t0, chainv[6] );
+    t1 = _mm512_xor_si512( t1, chainv[7] );
+    t0 = _mm512_xor_si512( t0, chainv[8] );
+    t1 = _mm512_xor_si512( t1, chainv[9] );
+
+    MULT24W( t0, t1, MASK );
+
+    msg0 = _mm512_shuffle_epi32( msg[0], 27 );   // 27 = 0b00011011: reverse the four 32-bit words of each 128-bit lane
+    msg1 = _mm512_shuffle_epi32( msg[1], 27 );
+
+    chainv[0] = _mm512_xor_si512( chainv[0], t0 );   // fold the combined value back into every chainv pair
+    chainv[1] = _mm512_xor_si512( chainv[1], t1 );
+    chainv[2] = _mm512_xor_si512( chainv[2], t0 );
+    chainv[3] = _mm512_xor_si512( chainv[3], t1 );
+    chainv[4] = _mm512_xor_si512( chainv[4], t0 );
+    chainv[5] = _mm512_xor_si512( chainv[5], t1 );
+    chainv[6] = _mm512_xor_si512( chainv[6], t0 );
+    chainv[7] = _mm512_xor_si512( chainv[7], t1 );
+    chainv[8] = _mm512_xor_si512( chainv[8], t0 );
+    chainv[9] = _mm512_xor_si512( chainv[9], t1 );
+
+    t0 = chainv[0];   // saved: pair 0 is overwritten below but still needed for pair 4
+    t1 = chainv[1];
+
+    MULT24W( chainv[0], chainv[1], MASK );
+    chainv[0] = _mm512_xor_si512( chainv[0], chainv[2] );
+    chainv[1] = _mm512_xor_si512( chainv[1], chainv[3] );
+
+    MULT24W( chainv[2], chainv[3], MASK );
+    chainv[2] = _mm512_xor_si512(chainv[2], chainv[4]);
+    chainv[3] = _mm512_xor_si512(chainv[3], chainv[5]);
+
+    MULT24W( chainv[4], chainv[5], MASK );
+    chainv[4] = _mm512_xor_si512(chainv[4], chainv[6]);
+    chainv[5] = _mm512_xor_si512(chainv[5], chainv[7]);
+
+    MULT24W( chainv[6], chainv[7], MASK );
+    chainv[6] = _mm512_xor_si512(chainv[6], chainv[8]);
+    chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]);
+
+    MULT24W( chainv[8], chainv[9], MASK );
+    chainv[8] = _mm512_xor_si512( chainv[8], t0 );
+    chainv[9] = _mm512_xor_si512( chainv[9], t1 );
+
+    t0 = chainv[8];   // saved: pair 4 is overwritten below but still needed for pair 0
+    t1 = chainv[9];
+
+    MULT24W( chainv[8], chainv[9], MASK );
+    chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] );
+    chainv[9] = _mm512_xor_si512( chainv[9], chainv[7] );
+
+    MULT24W( chainv[6], chainv[7], MASK );
+    chainv[6] = _mm512_xor_si512( chainv[6], chainv[4] );
+    chainv[7] = _mm512_xor_si512( chainv[7], chainv[5] );
+
+    MULT24W( chainv[4], chainv[5], MASK );
+    chainv[4] = _mm512_xor_si512( chainv[4], chainv[2] );
+    chainv[5] = _mm512_xor_si512( chainv[5], chainv[3] );
+
+    MULT24W( chainv[2], chainv[3], MASK );
+    chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] );
+    chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );
+
+    MULT24W( chainv[0], chainv[1], MASK );
+    chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );   // message enters pair 0 unmodified
+    chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
+
+    MULT24W( msg0, msg1, MASK );   // each later pair receives the message after one more MULT24W
+    chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
+    chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
+
+    MULT24W( msg0, msg1, MASK );
+    chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
+    chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
+
+    MULT24W( msg0, msg1, MASK );
+    chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
+    chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
+
+    MULT24W( msg0, msg1, MASK );
+    chainv[8] = _mm512_xor_si512( chainv[8], msg0 );
+    chainv[9] = _mm512_xor_si512( chainv[9], msg1 );
+
+    MULT24W( msg0, msg1, MASK );
+
+    // Original author note: "replace with ror". The odd vectors of pairs 1..4 are rotated left by 1..4 bits per 32-bit word.
+    chainv[3] = _mm512_rol_epi32( chainv[3], 1 );
+    chainv[5] = _mm512_rol_epi32( chainv[5], 2 );
+    chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
+    chainv[9] = _mm512_rol_epi32( chainv[9], 4 );
+
+    NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
+                x[0], x[1], x[2], x[3],
+                chainv[1],chainv[3],chainv[5],chainv[7],
+                x[4], x[5], x[6], x[7] );   // even chainv -> x[0..3], odd chainv -> x[4..7]
+
+    STEP_PART4W( &x[0], cns4w( 0), cns4w( 1), &tmp[0] );   // eight steps, constants 0..15
+    STEP_PART4W( &x[0], cns4w( 2), cns4w( 3), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w( 4), cns4w( 5), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w( 6), cns4w( 7), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w( 8), cns4w( 9), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w(10), cns4w(11), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w(12), cns4w(13), &tmp[0] );
+    STEP_PART4W( &x[0], cns4w(14), cns4w(15), &tmp[0] );
+
+    MIXTON10244W( x[0], x[1], x[2], x[3],
+                chainv[0], chainv[2], chainv[4],chainv[6],
+                x[4], x[5], x[6], x[7],
+                chainv[1],chainv[3],chainv[5],chainv[7]);   // x[0..7] -> back into chainv[0..7]
+
+    /* Process last 256-bit block: chainv[8..9], eight steps with constants 16..31 */
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29),
+                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31),
+                tmp[0], tmp[1] );
+}
+// Runs two blank (all-zero message) rounds; after each, the byte-swapped XOR of the chainv pairs is written to b (vectors 0-1, then 2-3).
+void finalization512_4way( luffa_4way_context *state, uint32 *b )
+{
+    uint32_t hash[8*4] __attribute((aligned(128)));   // staging area for two __m512i results
+    __m512i* chainv = state->chainv;
+    __m512i t[2];
+    __m512i zero[2];
+    zero[0] = zero[1] = m512_zero;
+    const __m512i shuff_bswap32 = m512_const_64(
+                                  0x3c3d3e3f38393a3b, 0x3435363730313233,
+                                  0x2c2d2e2f28292a2b, 0x2425262720212223,
+                                  0x1c1d1e1f18191a1b, 0x1415161710111213,
+                                  0x0c0d0e0f08090a0b, 0x0405060700010203 );   // byte-shuffle control: reverses the bytes of every 32-bit word
+
+    /*---- blank round with m=0 ----*/
+    rnd512_4way( state, zero );
+
+    t[0] = chainv[0];   // t[0] / t[1] = XOR of the even / odd chainv entries
+    t[1] = chainv[1];
+
+    t[0] = _mm512_xor_si512( t[0], chainv[2] );
+    t[1] = _mm512_xor_si512( t[1], chainv[3] );
+    t[0] = _mm512_xor_si512( t[0], chainv[4] );
+    t[1] = _mm512_xor_si512( t[1], chainv[5] );
+    t[0] = _mm512_xor_si512( t[0], chainv[6] );
+    t[1] = _mm512_xor_si512( t[1], chainv[7] );
+    t[0] = _mm512_xor_si512( t[0], chainv[8] );
+    t[1] = _mm512_xor_si512( t[1], chainv[9] );
+
+    t[0] = _mm512_shuffle_epi32( t[0], 27 );   // reverse the four 32-bit words of each 128-bit lane
+    t[1] = _mm512_shuffle_epi32( t[1], 27 );
+
+    _mm512_store_si512( (__m512i*)&hash[0], t[0] );
+    _mm512_store_si512( (__m512i*)&hash[16], t[1] );
+
+    casti_m512i( b, 0 ) = _mm512_shuffle_epi8(
+                                  casti_m512i( hash, 0 ), shuff_bswap32 );   // presumably ((__m512i*)b)[0] -- confirm casti_m512i
+    casti_m512i( b, 1 ) = _mm512_shuffle_epi8(
+                                  casti_m512i( hash, 1 ), shuff_bswap32 );
+
+    rnd512_4way( state, zero );   // second blank round produces output vectors 2 and 3
+
+    t[0] = chainv[0];
+    t[1] = chainv[1];
+    t[0] = _mm512_xor_si512( t[0], chainv[2] );
+    t[1] = _mm512_xor_si512( t[1], chainv[3] );
+    t[0] = _mm512_xor_si512( t[0], chainv[4] );
+    t[1] = _mm512_xor_si512( t[1], chainv[5] );
+    t[0] = _mm512_xor_si512( t[0], chainv[6] );
+    t[1] = _mm512_xor_si512( t[1], chainv[7] );
+    t[0] = _mm512_xor_si512( t[0], chainv[8] );
+    t[1] = _mm512_xor_si512( t[1], chainv[9] );
+
+    t[0] = _mm512_shuffle_epi32( t[0], 27 );
+    t[1] = _mm512_shuffle_epi32( t[1], 27 );
+
+    _mm512_store_si512( (__m512i*)&hash[0], t[0] );
+    _mm512_store_si512( (__m512i*)&hash[16], t[1] );
+
+    casti_m512i( b, 2 ) = _mm512_shuffle_epi8(
+                                  casti_m512i( hash, 0 ), shuff_bswap32 );
+    casti_m512i( b, 3 ) = _mm512_shuffle_epi8(
+                                  casti_m512i( hash, 1 ), shuff_bswap32 );
+}
+// Prepares a 4-way context: stores hashbitlen, loads chainv[0..9] from the ten 128-bit groups of IV, and zeroes the buffer. Always returns 0.
+int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
+{
+    state->hashbitlen = hashbitlen;
+    __m128i *iv = (__m128i*)IV;   // view the IV table as 128-bit groups
+
+    state->chainv[0] = m512_const1_128( iv[0] );   // presumably the same group in every lane -- confirm m512_const1_128
+    state->chainv[1] = m512_const1_128( iv[1] );
+    state->chainv[2] = m512_const1_128( iv[2] );
+    state->chainv[3] = m512_const1_128( iv[3] );
+    state->chainv[4] = m512_const1_128( iv[4] );
+    state->chainv[5] = m512_const1_128( iv[5] );
+    state->chainv[6] = m512_const1_128( iv[6] );
+    state->chainv[7] = m512_const1_128( iv[7] );
+    state->chainv[8] = m512_const1_128( iv[8] );
+    state->chainv[9] = m512_const1_128( iv[9] );
+
+    ((__m512i*)state->buffer)[0] = m512_zero;
+    ((__m512i*)state->buffer)[1] = m512_zero;
+
+    return 0;
+}
+
+// Absorbs every full block of data (len >> 5 blocks, two vectors each); a trailing partial block is byte-swapped into state->buffer for
+// luffa_4way_close. After this call use only luffa_4way_update or luffa_4way_close, never luffa_4way_update_close. Always returns 0.
+int luffa_4way_update( luffa_4way_context *state, const void *data,
+                       size_t len )
+{
+    const __m512i *vdata = (const __m512i*)data;   // keep the caller's const qualifier
+    __m512i *buffer = (__m512i*)state->buffer;
+    __m512i msg[2];
+    int i;
+    const int blocks = (int)( len >> 5 );   // shift before narrowing, as luffa_4way_update_close does
+    const __m512i shuff_bswap32 = m512_const_64(
+                                   0x3c3d3e3f38393a3b, 0x3435363730313233,
+                                   0x2c2d2e2f28292a2b, 0x2425262720212223,
+                                   0x1c1d1e1f18191a1b, 0x1415161710111213,
+                                   0x0c0d0e0f08090a0b, 0x0405060700010203 );
+
+    state->rembytes = (int)len & 0x1F;   // bytes left over after the full blocks
+
+    // full blocks
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       rnd512_4way( state, msg );
+    }
+
+    // A 16 byte partial block exists for 80 byte len; one whole vector is read, so other remainders are not handled here.
+    // It is stored in the buffer, with the padding word, for transform in close so that midstate works.
+    if ( state->rembytes  )
+    {
+      // remaining data bytes
+      buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
+      buffer[1] = m512_const2_64( 0, 0x0000000080000000 );
+    }
+    return 0;
+}
+// Transforms the final pad block (buffered partial data, or a constant block when none is pending), then writes the digest to hashval. Returns 0.
+int luffa_4way_close( luffa_4way_context *state, void *hashval )
+{
+    __m512i *buffer = (__m512i*)state->buffer;
+    __m512i msg[2];
+
+    // transform pad block
+    if ( state->rembytes )
+      // not empty, data is in buffer
+      rnd512_4way( state, buffer );
+    else
+    {     // empty pad block, constant data
+      msg[0] = m512_const2_64( 0, 0x0000000080000000 );
+      msg[1] = m512_zero;
+      rnd512_4way( state, msg );
+    }
+    finalization512_4way( state, (uint32*)hashval );
+    // NOTE(review): byte offset is 32 here but 64 in luffa_4way_update_close -- confirm which is intended.
+    if ( state->hashbitlen > 512 )
+        finalization512_4way( state, (uint32*)( (char*)hashval + 32 ) );   // byte offset via char*, not void* arithmetic
+    return 0;
+}
+// One-shot variant: absorbs all full blocks of data, transforms the padded final block, and writes the digest to output. Always returns 0.
+int luffa_4way_update_close( luffa_4way_context *state,
+                 void *output, const void *data, size_t inlen )
+{
+// Assumes inlen is a multiple of 16 bytes; good for 64 and 80 byte len.
+    const __m512i *vdata  = (const __m512i*)data;   // keep the caller's const qualifier
+    __m512i msg[2];
+    int i;
+    const int blocks = (int)( inlen >> 5 );
+    const __m512i shuff_bswap32 = m512_const_64(
+                                   0x3c3d3e3f38393a3b, 0x3435363730313233,
+                                   0x2c2d2e2f28292a2b, 0x2425262720212223,
+                                   0x1c1d1e1f18191a1b, 0x1415161710111213,
+                                   0x0c0d0e0f08090a0b, 0x0405060700010203 );
+
+    state->rembytes = inlen & 0x1F;
+
+    // full blocks
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       rnd512_4way( state, msg );
+    }
+
+    // 16 byte partial block exists for 80 byte len
+    if ( state->rembytes  )
+    {
+       // padding of partial block
+       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = m512_const2_64( 0, 0x0000000080000000 );
+       rnd512_4way( state, msg );
+    }
+    else
+    {
+       // empty pad block
+       msg[0] = m512_const2_64( 0, 0x0000000080000000 );
+       msg[1] = m512_zero;
+       rnd512_4way( state, msg );
+    }
+
+    finalization512_4way( state, (uint32*)output );
+    // NOTE(review): byte offset is 64 here but 32 in luffa_4way_close -- confirm which is intended.
+    if ( state->hashbitlen > 512 )
+        finalization512_4way( state, (uint32*)( (char*)output + 64 ) );   // byte offset via char*, not void* arithmetic
+
+    return 0;
+}
+
+#endif // AVX512
+
 #define cns(i)  m256_const1_128( ( (__m128i*)CNS_INIT)[i] )

 #define ADD_CONSTANT(a,b,c0,c1)\
    a = _mm256_xor_si256(a,c0);\
-    b = _mm256_xor_si256(b,c1);\
+    b = _mm256_xor_si256(b,c1);

 #define MULT2( a0, a1, mask ) \
 do { \
@@ -115,7 +677,7 @@ do { \
    s2 = _mm256_load_si256(&r0);\
    q2 = _mm256_load_si256(&p0);\
    s3 = _mm256_load_si256(&r2);\
-    q3 = _mm256_load_si256(&p2);\
+    q3 = _mm256_load_si256(&p2);

 #define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
    s0 = _mm256_load_si256(&r0);\
@@ -174,57 +736,6 @@ do { \
 #define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);

-/* initial values of chaining variables */
-static const uint32 IV[40] __attribute((aligned(32))) = {
-    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
-    0xdef610bb,0xee058139,0x90152df4,0x6e292011,
-    0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
-    0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
-    0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
-    0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
-    0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
-    0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
-    0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
-    0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
-};
-
-/* Round Constants */
-static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
-    0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
-    0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
-    0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
-    0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
-    0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
-    0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
-    0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
-    0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
-    0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
-    0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
-    0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
-    0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
-    0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
-    0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
-    0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
-    0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
-    0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
-    0x00000000,0x00000000,0x00000000,0x5090d577,
-    0x00000000,0x00000000,0x00000000,0xac11d7fa,
-    0x00000000,0x00000000,0x00000000,0x2d1925ab,
-    0x00000000,0x00000000,0x00000000,0x1bcb66f2,
-    0x00000000,0x00000000,0x00000000,0xb46496ac,
-    0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
-    0x00000000,0x00000000,0x00000000,0xd1925ab0,
-    0x00000000,0x00000000,0x00000000,0x78602649,
-    0x00000000,0x00000000,0x00000000,0x29131ab6,
-    0x00000000,0x00000000,0x00000000,0x8edae952,
-    0x00000000,0x00000000,0x00000000,0x0fc053c3,
-    0x00000000,0x00000000,0x00000000,0x3b6ba548,
-    0x00000000,0x00000000,0x00000000,0x3f014f0c,
-    0x00000000,0x00000000,0x00000000,0xedae9520,
-    0x00000000,0x00000000,0x00000000,0xfc053c31
-};
-
-

 /***************************************************/
 /* Round function         */
@@ -331,14 +842,10 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )

    MULT2( msg0, msg1, MASK );

-    chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3],  1 ),
-                                 _mm256_srli_epi32( chainv[3], 31 ) );
-    chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5],  2 ),
-                                 _mm256_srli_epi32( chainv[5], 30 ) );
-    chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7],  3 ),
-                                 _mm256_srli_epi32( chainv[7], 29 ) );
-    chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9],  4 ),
-                                 _mm256_srli_epi32( chainv[9], 28 ) );
+    chainv[3] = mm256_rol_32( chainv[3], 1 );
+    chainv[5] = mm256_rol_32( chainv[5], 2 );
+    chainv[7] = mm256_rol_32( chainv[7], 3 );
+    chainv[9] = mm256_rol_32( chainv[9], 4 );

    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
                x[0], x[1], x[2], x[3],
@@ -385,13 +892,15 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )

 void finalization512_2way( luffa_2way_context *state, uint32 *b )
 {
-    uint32 hash[8] __attribute((aligned(64)));
+    uint32 hash[8*2] __attribute((aligned(64)));
    __m256i* chainv = state->chainv;
    __m256i t[2];
    __m256i zero[2];
    zero[0] = zero[1] = m256_zero;
-    const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
-                                                  0x0405060700010203 );
+    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
+                                                 0x1415161710111213,
+                                                 0x0c0d0e0f08090a0b,
+                                                 0x0405060700010203 );
    /*---- blank round with m=0 ----*/
    rnd512_2way( state, zero );

@@ -475,8 +984,10 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
    __m256i msg[2];
    int i;
    int blocks = (int)len >> 5;
-    const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
-                                                  0x0405060700010203 );
+    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
+                                                 0x1415161710111213,
+                                                 0x0c0d0e0f08090a0b,
+                                                 0x0405060700010203 );
    state-> rembytes = (int)len & 0x1F;

    // full blocks
@@ -528,8 +1039,10 @@ int luffa_2way_update_close( luffa_2way_context *state,
    __m256i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
-    const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
-                                                  0x0405060700010203 );
+    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
+                                                 0x1415161710111213,
+                                                 0x0c0d0e0f08090a0b,
+                                                 0x0405060700010203 );

    state->rembytes = inlen & 0x1F;

@@ -558,6 +1071,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
    }

    finalization512_2way( state, (uint32*)output );
+
    if ( state->hashbitlen > 512 )
        finalization512_2way( state, (uint32*)( output+32 ) );

--- a/algo/luffa/luffa-hash-2way.h
+++ b/algo/luffa/luffa-hash-2way.h
@@ -51,12 +51,30 @@
 #define LIMIT_512 128
 /*********************************/

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 typedef struct {
-    uint32 buffer[8*2] __attribute((aligned(64)));
-    __m256i chainv[10] __attribute((aligned(32)));   /* Chaining values */
+    uint32 buffer[8*4];
+    __m512i chainv[10];   /* Chaining values */
    int hashbitlen;
    int rembytes;
-} luffa_2way_context;
+} luffa_4way_context __attribute((aligned(128)));
+
+int luffa_4way_init( luffa_4way_context *state, int hashbitlen );
+int luffa_4way_update( luffa_4way_context *state, const void *data,
+                       size_t len );
+int luffa_4way_close( luffa_4way_context *state, void *hashval );
+int luffa_4way_update_close( luffa_4way_context *state, void *output,
+                                   const void *data, size_t inlen );
+
+#endif
+
+typedef struct {
+    uint32 buffer[8*2];
+    __m256i chainv[10];   /* Chaining values */
+    int hashbitlen;
+    int rembytes;
+} luffa_2way_context __attribute((aligned(128)));

 int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
 int luffa_2way_update( luffa_2way_context *state, const void *data,
--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -542,8 +542,10 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    __m256i* chainv = (__m256i*)state->chainv;
    __m256i  t;
    const __m128i zero = m128_zero;
-    const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
-                                                  0x0405060700010203 );
+    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
+                                                 0x1415161710111213,
+                                                 0x0c0d0e0f08090a0b,
+                                                 0x0405060700010203 );

    rnd512( state, zero, zero );

--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -44,8 +44,13 @@ bool lyra2rev3_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   int size = ROW_LEN_BYTES * 4; // nRows;

-   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
+#if defined(LYRA2REV3_16WAY)
+//   l2v3_wholeMatrix = _mm_malloc( 2*size, 128 );
+   l2v3_wholeMatrix = _mm_malloc( 2*size, 64 );
+   init_lyra2rev3_16way_ctx();;
+#else
   l2v3_wholeMatrix = _mm_malloc( size, 64 );
 #if defined (LYRA2REV3_8WAY)
   init_lyra2rev3_8way_ctx();;
@@ -53,13 +58,17 @@ bool lyra2rev3_thread_init()
   init_lyra2rev3_4way_ctx();;
 #else
   init_lyra2rev3_ctx();
+#endif
 #endif
   return l2v3_wholeMatrix;
 }

 bool register_lyra2rev3_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV3_8WAY)
+#if defined(LYRA2REV3_16WAY)
+  gate->scanhash  = (void*)&scanhash_lyra2rev3_16way;
+  gate->hash      = (void*)&lyra2rev3_16way_hash;
+#elif defined (LYRA2REV3_8WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_8way;
  gate->hash      = (void*)&lyra2rev3_8way_hash;
 #elif defined (LYRA2REV3_4WAY)
@@ -69,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev3;
  gate->hash      = (void*)&lyra2rev3_hash;
 #endif
-  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
  gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
  opt_target_factor = 256.0;
  return true;
@@ -85,10 +94,14 @@ bool lyra2rev2_thread_init()
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
+#if defined (LYRA2REV2_8WAY)
+   l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 );   // 2 way
+   init_lyra2rev2_8way_ctx();;
+#elif defined (LYRA2REV2_4WAY)
   l2v2_wholeMatrix = _mm_malloc( size, 64 );
-#if defined (LYRA2REV2_4WAY)
   init_lyra2rev2_4way_ctx();;
 #else
+   l2v2_wholeMatrix = _mm_malloc( size, 64 );
   init_lyra2rev2_ctx();
 #endif
   return l2v2_wholeMatrix;
@@ -96,14 +109,17 @@ bool lyra2rev2_thread_init()

 bool register_lyra2rev2_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV2_4WAY)
+#if defined (LYRA2REV2_8WAY)
+  gate->scanhash  = (void*)&scanhash_lyra2rev2_8way;
+  gate->hash      = (void*)&lyra2rev2_8way_hash;
+#elif defined (LYRA2REV2_4WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev2_4way;
  gate->hash      = (void*)&lyra2rev2_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_lyra2rev2;
  gate->hash      = (void*)&lyra2rev2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  opt_target_factor = 256.0;
  return true;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -5,18 +5,27 @@
 #include <stdint.h>
 #include "lyra2.h"

-#if defined(__AVX2__)
-  #define LYRA2REV3_8WAY
-#endif

-#if defined(__SSE2__)
-  #define LYRA2REV3_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define LYRA2REV3_16WAY 1
+#elif defined(__AVX2__)
+  #define LYRA2REV3_8WAY 1
+#elif defined(__SSE2__)
+  #define LYRA2REV3_4WAY 1
 #endif

 extern __thread uint64_t* l2v3_wholeMatrix;

 bool register_lyra2rev3_algo( algo_gate_t* gate );
-#if defined(LYRA2REV3_8WAY)
+
+#if defined(LYRA2REV3_16WAY)
+
+void lyra2rev3_16way_hash( void *state, const void *input );
+int scanhash_lyra2rev3_16way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr );
+bool init_lyra2rev3_16way_ctx();
+
+#elif defined(LYRA2REV3_8WAY)

 void lyra2rev3_8way_hash( void *state, const void *input );
 int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
@@ -41,15 +50,24 @@ bool init_lyra2rev3_ctx();

 //////////////////////////////////

-#if defined(__AVX2__)
-  #define LYRA2REV2_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define LYRA2REV2_8WAY 1
+#elif defined(__AVX2__)
+  #define LYRA2REV2_4WAY 1
 #endif

 extern __thread uint64_t* l2v2_wholeMatrix;

 bool register_lyra2rev2_algo( algo_gate_t* gate );

-#if defined(LYRA2REV2_4WAY)
+#if defined(LYRA2REV2_8WAY)
+
+void lyra2rev2_8way_hash( void *state, const void *input );
+int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr );
+bool init_lyra2rev2_8way_ctx();
+
+#elif defined(LYRA2REV2_4WAY)

 void lyra2rev2_4way_hash( void *state, const void *input );
 int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
--- a/algo/lyra2/lyra2-hash-2way.c
+++ b/algo/lyra2/lyra2-hash-2way.c
@@ -0,0 +1,695 @@
+/**
+ * Implementation of the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <mm_malloc.h>
+#include "compat.h"
+#include "lyra2.h"
+#include "sponge.h"
+
+//  LYRA2RE 8 cols 8 rows used by lyra2re, allium, phi2, x22i, x25x.
+//
+//  LYRA2REV2 4 cols 4 rows used by lyra2rev2.
+//
+//  LYRA2REV3 4 cols 4 rows with an extra twist in calculating
+//  rowa in the wandering phase. Used by lyra2rev3.
+// 
+//  LYRA2Z various cols & rows and supports 80 input. Used by lyra2z,
+//  lyra2z330, lyra2h, 
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+/**
+ * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
+ * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
+ * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
+ * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
+ * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
+ *
+ * @param K The derived key to be output by the algorithm
+ * @param kLen Desired key length
+ * @param pwd User password
+ * @param pwdlen Password length
+ * @param salt Salt
+ * @param saltlen Salt length
+ * @param timeCost Parameter to determine the processing time (T)
+ * @param nRows Number of rows of the memory matrix (R)
+ * @param nCols Number of columns of the memory matrix (C)
+ *
+ * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
+ */
+
+// For lyra2rev3: convert a simple word offset into an index into interleaved
+// data. Good for state and 4 row matrix. Only the low 4 bits of the offset
+// select the group of 4 words; consecutive groups are placed 8 words apart.
+// index = ( int( ( off mod 16 ) / 4 ) * 8 ) + ( off mod 4 )
+// The argument is evaluated twice: pass an expression without side effects.
+#define offset_to_index( o ) \
+   ( ( ( (uint64_t)( (o) & 0xf) / 4 ) * 8 ) + ( (o) % 4 ) )
+// 2 way Lyra2REv2. There is no salt argument: the input is copied in twice,
+// the second copy standing in for the salt. Always returns 0.
+int LYRA2REV2_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
+             const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
+             const uint64_t nRows, const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[32];
+   int64_t row = 2;
+   int64_t prev = 1;
+   int64_t rowa0 = 0;   // row* picked from state[0]
+   int64_t rowa1 = 0;   // row* picked from state[4]
+   int64_t tau; 
+   int64_t step = 1;
+   int64_t window = 2;
+   int64_t gap = 1;
+   //====================================================================/
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+   uint64_t *ptrWord = wholeMatrix;
+
+   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   uint64_t *ptr = wholeMatrix;
+   uint64_t *pw = (uint64_t*)pwd;
+
+   memcpy( ptr, pw, 2*pwdlen ); // password, 2*pwdlen bytes are read from pwd
+   ptr += pwdlen>>2;            // 2*pwdlen bytes == pwdlen/4 uint64_t
+   memcpy( ptr, pw, 2*pwdlen ); // salt, the same bytes as the password
+   ptr += pwdlen>>2;
+
+   // now build the rest interleaving on the fly.
+   // Each value is stored twice, 4 words apart.
+   ptr[0] = ptr[ 4] = kLen;
+   ptr[1] = ptr[ 5] = pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
+   ptr[3] = ptr[ 7] = timeCost;
+   ptr[8] = ptr[12] = nRows;
+   ptr[9] = ptr[13] = nCols;
+   ptr[10] = ptr[14] = 0x80;                 // first byte of padding
+   ptr[11] = ptr[15] = 0x0100000000000000;   // last byte of padding
+   // NOTE(review): no buffer clearing here, layout looks sized for pwdlen 32.
+   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
+
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
+
+   reducedDuplexRow1_2way( state, &wholeMatrix[0],
+                           &wholeMatrix[ 2 * ROW_LEN_INT64 ],  nCols );
+   //======================== Setup Phase ==============================//
+   do
+   {
+     //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+     // Rows are addressed at twice the 1 way stride (2 * row * ROW_LEN_INT64).
+     reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
+                                        &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
+                                        &wholeMatrix[ 2* row*ROW_LEN_INT64],
+                                        nCols );
+     // row* is picked deterministically during setup, only rowa0 is used.
+     rowa0 = (rowa0 + step) & (window - 1);
+
+     prev = row;
+     row++;
+     // All rows in the window were visited: widen it and flip the gap.
+     if ( rowa0 == 0 )
+     {
+        step = window + gap;
+        window *= 2; 
+        gap = -gap;
+     }
+   } while ( row < nRows );
+
+   //===================== Wandering Phase =============================//
+   row = 0;
+   for ( tau = 1; tau <= timeCost; tau++ )
+   {
+      step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
+      do
+      {
+        rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
+        rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
+
+        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
+                                      nCols );
+         prev = row;
+
+         row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+
+      } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
+                            &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
+   //Squeezes the key
+   squeeze_2way( state, K, (unsigned int) kLen );
+
+   return 0;
+}
+
+// This version is currently only used by REv3 and has some hard coding
+// specific to v3 such as input data size of 32 bytes.
+//
+// Similarly with REv2. The difference with REv3 isn't clear and maybe
+// they can be merged.
+//
+// RE is used by RE, allium. The main difference between RE and REv2
+// is in the matrix size.
+//
+// Z also needs to support 80 byte input as well as 32 byte, and odd
+// matrix sizes like 330 rows. It is used by lyra2z330, lyra2z, lyra2h.
+
+
+/////////////////////////////////////////////////
+
+// 2 way 256 Lyra2REv3.
+// Drops the salt and salt len arguments (the input doubles as the salt) and
+// hard codes some others. Data is interleaved 2x256. Always returns 0.
+
+int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
+                    const void *pwd, uint64_t pwdlen, uint64_t timeCost,
+                    uint64_t nRows, uint64_t nCols )
+
+// hard coded for 32 byte input as well as matrix size.
+// Other required versions include 80 byte input and different block
+// sizes
+
+//int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
+//      const void *pwd, const uint64_t pwdlen, const void *salt,
+//      const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
+//      const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[32];
+   int64_t row = 2; 
+   int64_t prev = 1;
+   int64_t rowa0 = 0;
+   int64_t rowa1 = 0;
+   int64_t tau; 
+   int64_t step = 1;
+   int64_t window = 2;
+   int64_t gap = 1; 
+   uint64_t instance0 = 0;   // v3 twist: chained state offset, first lane
+   uint64_t instance1 = 0;   // same, read relative to state + 4
+   //====================================================================/
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
+
+   uint64_t *ptrWord = wholeMatrix;
+
+//  2 way 256 rewrite. Salt always == password, and data is interleaved,
+//  need to build in parallel as pw is already interleaved.
+
+   
+//  {   password,    (64 or 80 bytes)
+//      salt,        (64 or 80 bytes) =  same as password
+//      Klen,        (u64)  = 32 bytes
+//      pwdlen,      (u64)
+//      saltlen,     (u64)
+//      timecost,    (u64)
+//      nrows,       (u64)
+//      ncols,       (u64)
+//      0x80,        (byte)
+//      { 0 .. 0 },
+//      1            (byte)
+//   }
+   
+// input is usually 32 maybe 64, both are aligned to 256 bit vector.
+// 80 byte input is not aligned complicating matters for lyra2z.
+
+   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+   
+   uint64_t *ptr = wholeMatrix;
+   uint64_t *pw = (uint64_t*)pwd;
+
+   memcpy( ptr, pw, 2*pwdlen ); // password, 2*pwdlen bytes are read from pwd
+   ptr += pwdlen>>2;            // 2*pwdlen bytes == pwdlen/4 uint64_t
+   memcpy( ptr, pw, 2*pwdlen ); // salt, the same bytes as the password
+   ptr += pwdlen>>2;
+ 
+   // now build the rest interleaving on the fly.
+   // Each value is stored twice, 4 words apart.
+   ptr[0] = ptr[ 4] = kLen;
+   ptr[1] = ptr[ 5] = pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
+   ptr[3] = ptr[ 7] = timeCost;
+   ptr[8] = ptr[12] = nRows;
+   ptr[9] = ptr[13] = nCols;
+   ptr[10] = ptr[14] = 0x80;                 // first byte of padding
+   ptr[11] = ptr[15] = 0x0100000000000000;   // last byte of padding
+
+   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
+   // Initializes M[0] and M[1]
+   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
+
+   reducedDuplexRow1_2way( state, &wholeMatrix[0],
+                           &wholeMatrix[2*ROW_LEN_INT64],  nCols );
+   // Setup phase: rows are addressed at twice the 1 way stride.
+   do
+   {
+
+      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
+                                         nCols );
+      // row* is picked deterministically during setup, only rowa0 is used.
+      rowa0 = (rowa0 + step) & (window - 1);
+
+      prev = row;
+      row++;
+
+      if (rowa0 == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+   // Wandering phase
+   row = 0;
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+      step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
+      do
+      {
+        instance0 = state[ offset_to_index( instance0 ) ];
+        instance1 = (&state[4])[ offset_to_index( instance1 ) ];
+        // The word just fetched selects the word that supplies row*.
+        rowa0 = state[ offset_to_index( instance0 )  ]
+                & (unsigned int)(nRows-1);
+        rowa1 = (state+4)[ offset_to_index( instance1 ) ]
+                & (unsigned int)(nRows-1);
+
+        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
+                                      nCols );
+
+        prev = row;
+        row = (row + step) & (unsigned int)(nRows-1); 
+
+       } while ( row != 0 );
+   }
+   // Wrap-up phase: absorb the last rows picked, then squeeze the key.
+   absorbBlock_2way( state, &wholeMatrix[2*rowa0*ROW_LEN_INT64],
+                            &wholeMatrix[2*rowa1*ROW_LEN_INT64] );
+
+   squeeze_2way( state, K, (unsigned int) kLen );
+
+   return 0;
+}
+
+#endif // AVX512
+
+#if 0
+
+//////////////////////////////////////////////////
+int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
+            const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+            const uint64_t timeCost, const uint64_t nRows,
+            const uint64_t nCols )
+{
+    //========================== Basic variables ============================//
+    uint64_t _ALIGN(256) state[16];
+    int64_t row = 2; //index of row to be processed
+    int64_t prev = 1; //index of prev (last row ever computed/modified)
+    int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+    int64_t tau; //Time Loop iterator
+    int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+    int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+//    int64_t i; //auxiliary iteration counter
+    //=======================================================================/
+
+    //======= Initializing the Memory Matrix and pointers to it =============//
+    //Tries to allocate enough space for the whole memory matrix
+
+    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+//    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+//    memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
+    //==== Getting the password + salt + basil padded with 10*1 ============//
+    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+    //but this ensures that the password copied locally will be overwritten as soon as possible
+
+    //First, we clean enough blocks for the password, salt, basil and padding
+    uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
+                       sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+    byte *ptrByte = (byte*) wholeMatrix;
+    memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
+
+    //Prepends the password
+    memcpy(ptrByte, pwd, pwdlen);
+    ptrByte += pwdlen;
+
+    //Concatenates the salt
+    memcpy(ptrByte, salt, saltlen);
+    ptrByte += saltlen;
+    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+    memcpy(ptrByte, &kLen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &saltlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &timeCost, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nRows, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nCols, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+
+    //Now comes the padding
+    *ptrByte = 0x80; //first byte of padding: right after the password
+    ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+    ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+    *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+    //=================== Initializing the Sponge State ====================//
+    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+//        uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
+//        if (state == NULL) {
+//                return -1;
+//        }
+//    initState( state );
+
+    //============================== Setup Phase =============================//
+    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+    uint64_t *ptrWord = wholeMatrix;
+
+    absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
+                           BLOCK_LEN_BLAKE2_SAFE_INT64 );
+/*
+    for ( i = 0; i < nBlocksInput; i++ )
+    {
+      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+      ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
+    }
+*/
+    //Initializes M[0] and M[1]
+        reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
+        reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
+
+        do {
+                //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+                reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
+
+                //updates the value of row* (deterministically picked during Setup))
+                rowa = (rowa + step) & (window - 1);
+                //update prev: it now points to the last row ever computed
+                prev = row;
+                //updates row: goes to the next row to be computed
+                row++;
+
+                //Checks if all rows in the window were visited.
+                if (rowa == 0) {
+                        step = window + gap; //changes the step: approximately doubles its value
+                        window *= 2; //doubles the size of the re-visitation window
+                        gap = -gap; //inverts the modifier to the step
+                }
+
+        } while (row < nRows);
+
+    //======================== Wandering Phase =============================//
+    row = 0; //Resets the visitation to the first row of the memory matrix
+    for ( tau = 1; tau <= timeCost; tau++ )
+    {
+        //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+        step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+        do {
+        //Selects a pseudorandom index row*
+        //----------------------------------------------------------------------
+        //rowa = ((unsigned int)state[0]) & (nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+        rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+        //-----------------------------------------------------------------
+
+        //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+                reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
+
+        //update prev: it now points to the last row ever computed
+        prev = row;
+
+        //updates row: goes to the next row to be computed
+        //---------------------------------------------------------------
+        //row = (row + step) & (nRows-1);       //(USE THIS IF nRows IS A POWER OF 2)
+        row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+        //--------------------------------------------------------------------
+
+      } while (row != 0);
+    }
+
+    //========================= Wrap-up Phase ===============================//
+    //Absorbs the last block of the memory matrix
+    absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+
+    //Squeezes the key
+    squeeze( state, K, kLen );
+
+    return 0;
+}
+
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+// 2 way Lyra2RE. Returns 0 on success, -1 if the matrix cannot be allocated.
+// Lyra2RE doesn't like the new wholeMatrix implementation: it allocates its own.
+int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
+                  const uint64_t pwdlen, const uint64_t timeCost,
+                  const uint64_t nRows, const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[32];   // 2 way state, sized as in REV2/REV3 2WAY
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa0 = 0;   // row* picked from state[0]
+   int64_t rowa1 = 0;   // row* picked from state[4]
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t i; //size of the memory matrix in bytes
+   //====================================================================/
+
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //Tries to allocate enough space for the whole memory matrix
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+
+   i = 2 * (int64_t)ROW_LEN_BYTES * nRows;   // rows are indexed at 2x stride
+   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
+   if (wholeMatrix == NULL)
+      return -1;
+
+#if defined(__AVX2__)
+   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
+#elif defined(__SSE2__)
+   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
+#else
+   memset( wholeMatrix, 0, i );
+#endif
+
+   uint64_t *ptrWord = wholeMatrix;
+   uint64_t *pw = (uint64_t*)pwd;
+
+   //=== Getting the password + salt + basil padded with 10*1 ==========//
+   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+   //but this ensures that the password copied locally will be overwritten as soon as possible
+
+   //First, we clean enough blocks for the password, salt, basil and padding
+   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   uint64_t *ptr = wholeMatrix;
+
+   memcpy( ptr, pw, 2*pwdlen ); // password, 2*pwdlen bytes are read from pwd
+   ptr += pwdlen>>2;            // 2*pwdlen bytes == pwdlen/4 uint64_t
+   memcpy( ptr, pw, 2*pwdlen ); // salt, the same bytes as the password
+   ptr += pwdlen>>2;
+
+   // now build the rest interleaving on the fly.
+   // Each value is stored twice, 4 words apart.
+   ptr[0] = ptr[ 4] = kLen;
+   ptr[1] = ptr[ 5] = pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
+   ptr[3] = ptr[ 7] = timeCost;
+   ptr[8] = ptr[12] = nRows;
+   ptr[9] = ptr[13] = nCols;
+   ptr[10] = ptr[14] = 0x80;                 // first byte of padding
+   ptr[11] = ptr[15] = 0x0100000000000000;   // last byte of padding
+
+   // Legacy 1 way input construction, kept for reference only:
+/*   
+   byte *ptrByte = (byte*) wholeMatrix;
+
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;
+
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;
+
+//   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+//                       - (saltlen + pwdlen) );
+
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = pwdlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = saltlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = timeCost;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nRows;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nCols;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the password
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+   //================= Initializing the Sponge State ====================//
+   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+
+//   initState( state );
+
+   //========================= Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+
+   ptrWord = wholeMatrix;
+
+*/
+
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
+/*
+   for (i = 0; i < nBlocksInput; i++)
+   {
+       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+   }
+*/
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+
+   reducedDuplexRow1_2way( state, &wholeMatrix[0],
+                                  &wholeMatrix[ 2 * ROW_LEN_INT64], nCols );
+
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+      // Rows are addressed at twice the 1 way stride (2 * row * ROW_LEN_INT64).
+      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
+                                         nCols );
+
+      //updates the value of row* (deterministically picked during Setup))
+      rowa0 = (rowa0 + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa0 == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+      step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
+      do
+      {
+        rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
+        rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
+
+        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
+                                      nCols );
+
+           //update prev: it now points to the last row ever computed
+           prev = row;
+
+           //updates row: goes to the next row to be computed
+           //----------------------------------------------------
+           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //----------------------------------------------------
+
+       } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
+                            &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
+   //Squeezes the key
+   squeeze_2way( state, K, (unsigned int) kLen );
+
+   //================== Freeing the memory =============================//
+   _mm_free(wholeMatrix);
+
+   return 0;
+}
+
+#endif
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -327,7 +327,6 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,

   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
                      nCols);
-
   do
   {
      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -60,4 +60,15 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,

 int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+
+int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
+        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
+int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
+        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
+#endif
+
 #endif /* LYRA2_H_ */
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -1,13 +1,150 @@
 #include "lyra2-gate.h"
 #include <memory.h>
-
-#if defined (LYRA2REV2_4WAY)	
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 
+#include "algo/cubehash/cube-hash-2way.h"
+
+#if defined (LYRA2REV2_8WAY)
+
+typedef struct {
+   blake256_8way_context     blake;    // initialised and pre-loaded by scanhash_lyra2rev2_8way
+   keccak256_8way_context    keccak;   // initialised by init_lyra2rev2_8way_ctx
+   cube_4way_context          cube;    // 4-way; the hash runs it on two buffers per stage
+   skein256_8way_context     skein;    // initialised by init_lyra2rev2_8way_ctx
+   bmw256_8way_context          bmw;    // initialised by init_lyra2rev2_8way_ctx
+} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
+// Template copied by each hash call. NOTE(review): this is not __thread, but
+static lyra2v2_8way_ctx_holder l2v2_8way_ctx;  // scanhash rewrites .blake; confirm that is safe.
+
+bool init_lyra2rev2_8way_ctx()
+{  // Fills the shared template; its blake member is set up by scanhash instead.
+   bmw256_8way_init( &l2v2_8way_ctx.bmw );
+   skein256_8way_init( &l2v2_8way_ctx.skein );
+   cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 );
+   keccak256_8way_init( &l2v2_8way_ctx.keccak );
+   return true;   // unconditional
+}
+
+void lyra2rev2_8way_hash( void *state, const void *input )
+{  // 8-lane chain: blake, keccak, cube, lyra2rev2, skein, cube, bmw -> state.
+   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
+   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
+   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (64)));
+   uint32_t hash2[8] __attribute__ ((aligned (64)));
+   uint32_t hash3[8] __attribute__ ((aligned (64)));
+   uint32_t hash4[8] __attribute__ ((aligned (64)));
+   uint32_t hash5[8] __attribute__ ((aligned (64)));
+   uint32_t hash6[8] __attribute__ ((aligned (64)));
+   uint32_t hash7[8] __attribute__ ((aligned (64)));
+   lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
+   // Blake tail: 16 more bytes, starting 64*8 bytes into input (cast: no void* math).
+   blake256_8way_update( &ctx.blake, (const char*)input + (64<<3), 16 );
+   blake256_8way_close( &ctx.blake, vhash );
+   // Re-interleave 8x32 -> 8x64 before keccak.
+   rintrlv_8x32_8x64( vhashA, vhash, 256 );
+
+   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
+   keccak256_8way_close( &ctx.keccak, vhash );
+   // Split into two 4x128 buffers; the cube context is 4-way, so it runs twice.
+   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
+
+   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+
+   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
+   // Lyra2 handles two lanes per call: interleave a pair, hash in place, split.
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+
+   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                hash7, 256 );
+
+   skein256_8way_update( &ctx.skein, vhash, 32 );
+   skein256_8way_close( &ctx.skein, vhash );
+   // Second cube stage: the context was consumed above, so re-init before each use.
+   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
+
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+
+   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
+
+   intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                hash7, 256 );
+   // Final stage writes straight to the caller's buffer.
+   bmw256_8way_update( &ctx.bmw, vhash, 32 );
+   bmw256_8way_close( &ctx.bmw, state );
+}
+
+int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scans nonces 8 at a time starting at work->data[19]; always returns 0.
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<3]);
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   // Clamped: a max_nonce below 8 must not wrap the unsigned subtraction.
+   const uint32_t last_nonce = ( max_nonce > 8 ) ? max_nonce - 8 : 0;
+   uint32_t n = first_nonce;
+   __m256i *noncev = (__m256i*)vdata + 19;   // aligned
+   const int thr_id = mythr->id;
+   // Install the benchmark target first so Htarg is read from the final value.
+   if ( opt_benchmark ) ptarget[7] = 0x0000ff;
+   const uint32_t Htarg = ptarget[7];
+
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
+   // Absorb the first 64 bytes once; the hash function finishes blake per call.
+   blake256_8way_init( &l2v2_8way_ctx.blake );
+   blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
+
+   do
+   {
+      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
+                                                  n+3, n+2, n+1, n ) );
+
+      lyra2rev2_8way_hash( hash, vdata );
+      pdata[19] = n;
+      // Cheap filter on word 7 of each lane before the full target test.
+      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+            pdata[19] = n + lane;
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 8;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (LYRA2REV2_4WAY)

 typedef struct {
   blake256_4way_context     blake;
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -4,8 +4,180 @@
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 
+#include "algo/cubehash/cube-hash-2way.h"

-#if defined (LYRA2REV3_8WAY)
+#if defined (LYRA2REV3_16WAY)
+
+typedef struct {
+   blake256_16way_context     blake;   // pre-loaded with the first 64 bytes by scanhash
+   cube_4way_context          cube;    // 4-way; the hash re-inits it for each later group of 4 lanes
+   bmw256_16way_context       bmw;
+} lyra2v3_16way_ctx_holder;
+// Per-thread template (__thread); lyra2rev3_16way_hash copies it on every call.
+static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
+
+bool init_lyra2rev3_16way_ctx()
+{  // Puts each member of the calling thread's template into its initial state.
+   bmw256_16way_init( &l2v3_16way_ctx.bmw );
+   cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
+   blake256_16way_init( &l2v3_16way_ctx.blake );
+   return true;   // unconditional
+}
+
+void lyra2rev3_16way_hash( void *state, const void *input )
+{  // 16-lane chain: blake, lyra2rev3, cube, lyra2rev3, bmw -> state.
+   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (64)));
+   uint32_t hash2[8] __attribute__ ((aligned (64)));
+   uint32_t hash3[8] __attribute__ ((aligned (64)));
+   uint32_t hash4[8] __attribute__ ((aligned (64)));
+   uint32_t hash5[8] __attribute__ ((aligned (64)));
+   uint32_t hash6[8] __attribute__ ((aligned (64)));
+   uint32_t hash7[8] __attribute__ ((aligned (64)));
+   uint32_t hash8[8] __attribute__ ((aligned (64)));
+   uint32_t hash9[8] __attribute__ ((aligned (64)));
+   uint32_t hash10[8] __attribute__ ((aligned (64)));
+   uint32_t hash11[8] __attribute__ ((aligned (64)));
+   uint32_t hash12[8] __attribute__ ((aligned (64)));
+   uint32_t hash13[8] __attribute__ ((aligned (64)));
+   uint32_t hash14[8] __attribute__ ((aligned (64)));
+   uint32_t hash15[8] __attribute__ ((aligned (64)));
+   lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
+   // Blake tail: 16 more bytes, starting 64*16 bytes into input (cast: no void* math).
+   blake256_16way_update( &ctx.blake, (const char*)input + (64*16), 16 );
+   blake256_16way_close( &ctx.blake, vhash );
+
+   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+           hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
+           vhash, 256 );
+   // Lyra2 handles two lanes per call: interleave a pair, hash in place, split.
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   intrlv_2x256( vhash, hash8, hash9, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash8, hash9, vhash, 256 );
+   intrlv_2x256( vhash, hash10, hash11, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash10, hash11, vhash, 256 );
+   intrlv_2x256( vhash, hash12, hash13, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash12, hash13, vhash, 256 );
+   intrlv_2x256( vhash, hash14, hash15, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash14, hash15, vhash, 256 );
+   // Cube is 4-way: first group uses the copied context, later groups re-init it.
+   intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
+   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
+   intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
+   intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
+   intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );
+   // Second Lyra2 pass over the same eight lane pairs.
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   intrlv_2x256( vhash, hash8, hash9, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash8, hash9, vhash, 256 );
+   intrlv_2x256( vhash, hash10, hash11, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash10, hash11, vhash, 256 );
+   intrlv_2x256( vhash, hash12, hash13, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash12, hash13, vhash, 256 );
+   intrlv_2x256( vhash, hash14, hash15, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash14, hash15, vhash, 256 );
+
+   intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+             hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
+             hash15, 256 );
+   // Final stage writes straight to the caller's buffer.
+   bmw256_16way_update( &ctx.bmw, vhash, 32 );
+   bmw256_16way_close( &ctx.bmw, state );
+}
+
+
+int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scans nonces 16 at a time starting at work->data[19]; always returns 0.
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &hash[7<<4];
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   // Clamped: a max_nonce below 16 must not wrap the unsigned subtraction.
+   const uint32_t last_nonce = ( max_nonce > 16 ) ? max_nonce - 16 : 0;
+   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
+   const int thr_id = mythr->id;
+   // Install the benchmark target first so Htarg is read from the final value.
+   if ( opt_benchmark )  ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   const uint32_t Htarg = ptarget[7];
+   mm512_bswap32_intrlv80_16x32( vdata, pdata );
+   // Absorb the first 64 bytes once; the hash function finishes blake per call.
+   blake256_16way_init( &l2v3_16way_ctx.blake );
+   blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
+
+   do
+   {
+      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
+                                                  n+11, n+10, n+ 9, n+ 8,
+                                                  n+ 7, n+ 6, n+ 5, n+ 4,
+                                                  n+ 3, n+ 2, n+ 1, n ) );
+
+      lyra2rev3_16way_hash( hash, vdata );
+      pdata[19] = n;
+      // Cheap filter on word 7 of each lane before the full target test.
+      for ( int lane = 0; lane < 16; lane++ )
+      if ( unlikely( hash7[lane] <= Htarg ) )
+      {
+         extr_lane_16x32( lane_hash, hash, lane, 256 );
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         {
+             pdata[19] = n + lane;
+             submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 16;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (LYRA2REV3_8WAY)

 typedef struct {
   blake256_8way_context     blake;
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -0,0 +1,357 @@
+/**
+ * A simple implementation of Blake2b's internal permutation
+ * in the form of a sponge.
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//#include "algo-gate.h"
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+#include <immintrin.h>
+#include "sponge.h"
+#include "lyra2.h"
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
+{   // Squeezes sponge output into Out, applying one round after each full block.
+    const int units = len / 32;                  // len expressed in 32-byte units
+    const int tail  = units % BLOCK_LEN_M256I;   // units beyond the last full block
+    __m512i *sponge = (__m512i*)State;
+    __m512i *dst    = (__m512i*)Out;
+    __m512i *const stop = dst + ( units / BLOCK_LEN_M256I ) * BLOCK_LEN_M256I;
+
+    // Full blocks: copy out, then permute the state in place.
+    while ( dst != stop )
+    {
+       memcpy_512( dst, sponge, BLOCK_LEN_M256I );
+       LYRA_ROUND_2WAY_AVX512( sponge[0], sponge[1], sponge[2], sponge[3] );
+       dst += BLOCK_LEN_M256I;
+    }
+    // Partial block, if any.
+    memcpy_512( dst, sponge, tail );
+}
+
+inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
+                                               const uint64_t *In1 ) 
+{   // Absorbs one block into the 2-way sponge: XOR into state0..2, then 12 rounds.
+    register __m512i state0, state1, state2, state3;
+    __m512i in[3];
+    casti_m256i( in, 0 ) = casti_m256i( In0, 0 );   // slots 0, 2, 4 are taken from In0
+    casti_m256i( in, 1 ) = casti_m256i( In1, 1 );   // slots 1, 3, 5 are taken from In1
+    casti_m256i( in, 2 ) = casti_m256i( In0, 2 );
+    casti_m256i( in, 3 ) = casti_m256i( In1, 3 );
+    casti_m256i( in, 4 ) = casti_m256i( In0, 4 );
+    casti_m256i( in, 5 ) = casti_m256i( In1, 5 );
+    // NOTE(review): presumably casti_m256i(p,i) is the i-th __m256i at p - confirm.
+    state0 = _mm512_load_si512( (__m512i*)State     );
+    state1 = _mm512_load_si512( (__m512i*)State + 1 );
+    state2 = _mm512_load_si512( (__m512i*)State + 2 );
+    state3 = _mm512_load_si512( (__m512i*)State + 3 );
+    // Only the first three state vectors receive input; state3 is left as loaded.
+    state0 = _mm512_xor_si512( state0, in[0] );
+    state1 = _mm512_xor_si512( state1, in[1] );
+    state2 = _mm512_xor_si512( state2, in[2] );
+
+    LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
+    // Write all four vectors back to State.
+    _mm512_store_si512( (__m512i*)State,     state0 );
+    _mm512_store_si512( (__m512i*)State + 1, state1 );
+    _mm512_store_si512( (__m512i*)State + 2, state2 );
+    _mm512_store_si512( (__m512i*)State + 3, state3 );
+
+}
+
+inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
+                      const uint64_t nBlocks, const uint64_t block_len )
+{ // Starts from a fresh sponge, absorbs nBlocks blocks from In, stores to State.
+  __m512i *const sponge = (__m512i*)State;
+  // Fresh state: two zero vectors, then two vectors of Blake2b IV constants.
+  register __m512i s1 = m512_zero;
+  register __m512i s0 = s1;
+  register __m512i s2 = m512_const4_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
+                           0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  register __m512i s3 = m512_const4_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
+                           0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+
+  // Per block: XOR two input vectors into s0/s1, run 12 rounds, advance In.
+  for ( int blk = 0; blk < nBlocks; blk++, In += block_len*2 )
+  {
+    const __m512i *src = (const __m512i*)In;
+    s0 = _mm512_xor_si512( s0, src[0] );
+    s1 = _mm512_xor_si512( s1, src[1] );
+
+    LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 );
+  }
+
+  _mm512_store_si512( sponge,     s0 );
+  _mm512_store_si512( sponge + 1, s1 );
+  _mm512_store_si512( sponge + 2, s2 );
+  _mm512_store_si512( sponge + 3, s3 );
+
+}
+
+inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
+                                     uint64_t nCols )
+{   // Fills rowOut from its last column down to column 0 with squeezed state.
+    int i;
+
+    //M[row][C-1-col] = H.reduced_squeeze()
+    // Each column receives state0..2; one round is applied between columns.
+
+    register __m512i state0, state1, state2, state3;
+    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+    // out starts at the last column; the loop below steps it backwards.
+    state0 = _mm512_load_si512( (__m512i*)State     );
+    state1 = _mm512_load_si512( (__m512i*)State + 1 );
+    state2 = _mm512_load_si512( (__m512i*)State + 2 );
+    state3 = _mm512_load_si512( (__m512i*)State + 3 );
+    // Warm-up prefetches at out-0, -2, -3, -5, -6, -8 (in __m512i units).
+    for ( i = 0; i < 9; i += 3)
+    {
+        _mm_prefetch( out - i,     _MM_HINT_T0 );
+        _mm_prefetch( out - i - 2, _MM_HINT_T0 );
+    }
+
+    for ( i = 0; i < nCols; i++ )
+    {
+       _mm_prefetch( out -  9, _MM_HINT_T0 );
+       _mm_prefetch( out - 11, _MM_HINT_T0 );
+       // NOTE(review): offsets 9/11 look carried over from the 1-way code - confirm.
+       out[0] = state0;
+       out[1] = state1;
+       out[2] = state2;
+
+       //Goes to next block (column) that will receive the squeezed data
+       out -= BLOCK_LEN_M256I;
+
+       LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+    }
+    // Persist the updated sponge state.
+    _mm512_store_si512( (__m512i*)State,     state0 );
+    _mm512_store_si512( (__m512i*)State + 1, state1 );
+    _mm512_store_si512( (__m512i*)State + 2, state2 );
+    _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+
+inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
+                 uint64_t *rowOut, uint64_t nCols )
+{   // Duplexes rowIn through the sponge and writes (state XOR input) into rowOut.
+    __m512i *const sponge = (__m512i*)State;
+    __m512i *src = (__m512i*)rowIn;
+    __m512i *dst = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+    register __m512i s0 = _mm512_load_si512( sponge     );
+    register __m512i s1 = _mm512_load_si512( sponge + 1 );
+    register __m512i s2 = _mm512_load_si512( sponge + 2 );
+    register __m512i s3 = _mm512_load_si512( sponge + 3 );
+    int col;
+
+    // rowIn is read from its first column forwards while rowOut is written
+    // from its last column backwards, one block per iteration.
+    for ( col = 0; col < nCols; col++, src += BLOCK_LEN_M256I,
+                                       dst -= BLOCK_LEN_M256I )
+    {
+         // Absorb the input block into the first three state vectors.
+         s0 = _mm512_xor_si512( s0, src[0] );
+         s1 = _mm512_xor_si512( s1, src[1] );
+         s2 = _mm512_xor_si512( s2, src[2] );
+
+         LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 );
+
+         // Output block = new state XOR the same input block (re-read from src).
+         dst[0] = _mm512_xor_si512( s0, src[0] );
+         dst[1] = _mm512_xor_si512( s1, src[1] );
+         dst[2] = _mm512_xor_si512( s2, src[2] );
+    }
+
+    _mm512_store_si512( sponge,     s0 );
+    _mm512_store_si512( sponge + 1, s1 );
+    _mm512_store_si512( sponge + 2, s2 );
+    _mm512_store_si512( sponge + 3, s3 );
+
+}
+
+inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
+                       uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
+{   // Setup-phase duplex: absorbs rowIn + rowInOut, writes rowOut, updates rowInOut.
+    int i;
+    register __m512i state0, state1, state2, state3;
+    __m512i* in    = (__m512i*)rowIn;
+    __m512i* inout = (__m512i*)rowInOut;
+    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+    __m512i  t0, t1, t2;
+    // in and inout advance forwards; out starts at the last column and goes back.
+    state0 = _mm512_load_si512( (__m512i*)State     );
+    state1 = _mm512_load_si512( (__m512i*)State + 1 );
+    state2 = _mm512_load_si512( (__m512i*)State + 2 );
+    state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+    for ( i = 0; i < nCols; i++ )
+    {
+       state0 = _mm512_xor_si512( state0,
+                                  _mm512_add_epi64( in[0], inout[0] ) );
+       state1 = _mm512_xor_si512( state1,
+                                  _mm512_add_epi64( in[1], inout[1] ) );
+       state2 = _mm512_xor_si512( state2,
+                                  _mm512_add_epi64( in[2], inout[2] ) );
+       // Absorbed value is the 64-bit wordwise sum in + inout.
+       LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+       // Output column = new state XOR input column.
+       out[0] = _mm512_xor_si512( state0, in[0] );
+       out[1] = _mm512_xor_si512( state1, in[1] );
+       out[2] = _mm512_xor_si512( state2, in[2] );
+
+       //M[row*][col] = M[row*][col] XOR rotW(rand)
+       t0 = _mm512_permutex_epi64( state0, 0x93 );
+       t1 = _mm512_permutex_epi64( state1, 0x93 );
+       t2 = _mm512_permutex_epi64( state2, 0x93 );
+       // 0x93 rotates the four words of each 256-bit half up by one position.
+       inout[0] = _mm512_xor_si512( inout[0],
+                                 _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
+       inout[1] = _mm512_xor_si512( inout[1],
+                                 _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
+       inout[2] = _mm512_xor_si512( inout[2],
+                                 _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
+       // Mask 0x0303 takes the lowest 64-bit word of each 256-bit half from the
+       // second operand, carrying the wrapped word over from the previous vector.
+       //Inputs: next column (i.e., next block in sequence)
+       in    += BLOCK_LEN_M256I;
+       inout += BLOCK_LEN_M256I;
+       //Output: goes to previous column
+       out   -= BLOCK_LEN_M256I;
+    }
+    // Persist the updated sponge state.
+    _mm512_store_si512( (__m512i*)State,     state0 );
+    _mm512_store_si512( (__m512i*)State + 1, state1 );
+    _mm512_store_si512( (__m512i*)State + 2, state2 );
+    _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+// Workaround for pointer aliasing: a union of pointers (povly) lets the local
+// io[] buffer be accessed as __m512i for the math and as __m256i per lane.
+
+inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
+                            uint64_t *rowInOut0, uint64_t *rowInOut1,
+                            uint64_t *rowOut, uint64_t nCols)
+{  // Duplex step where the in/out row is chosen separately for the two lanes.
+   int i;
+   register __m512i state0, state1, state2, state3;
+   __m512i *in = (__m512i*)rowIn;
+   __m256i *inout0 = (__m256i*)rowInOut0;
+   __m256i *inout1 = (__m256i*)rowInOut1;
+   __m512i *out = (__m512i*)rowOut;
+   __m512i io[3];
+   povly inout;
+   inout.v512 = &io[0];
+    __m512i t0, t1, t2;
+   // inout now points at io[]; v256 and v512 are two views of that one buffer.
+   state0 = _mm512_load_si512( (__m512i*)State     );
+   state1 = _mm512_load_si512( (__m512i*)State + 1 );
+   state2 = _mm512_load_si512( (__m512i*)State + 2 );
+   state3 = _mm512_load_si512( (__m512i*)State + 3 );
+    // Prefetch the start of the three rows that are read below.
+    _mm_prefetch( in,     _MM_HINT_T0 );
+    _mm_prefetch( inout0,     _MM_HINT_T0 );
+    _mm_prefetch( inout1,     _MM_HINT_T0 );
+    _mm_prefetch( in     + 2, _MM_HINT_T0 );
+    _mm_prefetch( inout0 + 2, _MM_HINT_T0 );
+    _mm_prefetch( inout1 + 2, _MM_HINT_T0 );
+    _mm_prefetch( in     + 4, _MM_HINT_T0 );
+    _mm_prefetch( inout0 + 4, _MM_HINT_T0 );
+    _mm_prefetch( inout1 + 4, _MM_HINT_T0 );
+    _mm_prefetch( in     + 6, _MM_HINT_T0 );
+    _mm_prefetch( inout0 + 6, _MM_HINT_T0 );
+    _mm_prefetch( inout1 + 6, _MM_HINT_T0 );
+
+    // Unlike the setup functions, out walks forwards here (starts at rowOut).
+    for ( i = 0; i < nCols; i++ )
+    {
+      // Gather: even 256-bit slots from inout0, odd slots from inout1.
+      //Absorbing "M[prev] [+] M[row*]"
+      inout.v256[0] = inout0[0];
+      inout.v256[1] = inout1[1];
+      inout.v256[2] = inout0[2];
+      inout.v256[3] = inout1[3];
+      inout.v256[4] = inout0[4];
+      inout.v256[5] = inout1[5];
+
+      state0 = _mm512_xor_si512( state0,
+                                 _mm512_add_epi64( in[0], inout.v512[0] ) );
+      state1 = _mm512_xor_si512( state1,
+                                 _mm512_add_epi64( in[1], inout.v512[1] ) );
+      state2 = _mm512_xor_si512( state2,
+                                 _mm512_add_epi64( in[2], inout.v512[2] ) );
+
+
+      //Applies the reduced-round transformation f to the sponge's state
+      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+      //M[rowOut][col] = M[rowOut][col] XOR rand
+      out[0] = _mm512_xor_si512( out[0], state0 );
+      out[1] = _mm512_xor_si512( out[1], state1 );
+      out[2] = _mm512_xor_si512( out[2], state2 );
+      // The comparisons use the row base pointers, which never change in the loop.
+      // if inout is the same row as out it was just overwritten, reload.
+      if ( rowOut == rowInOut0 )
+      {
+         inout.v256[0] = inout0[0];
+         inout.v256[2] = inout0[2];
+         inout.v256[4] = inout0[4];
+      }
+      if ( rowOut == rowInOut1 )
+      {
+         inout.v256[1] = inout1[1];
+         inout.v256[3] = inout1[3];
+         inout.v256[5] = inout1[5];
+      }
+
+      //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+      t0 = _mm512_permutex_epi64( state0, 0x93 );
+      t1 = _mm512_permutex_epi64( state1, 0x93 );
+      t2 = _mm512_permutex_epi64( state2, 0x93 );
+      // 0x93 rotates each 256-bit half by one word; mask 0x0303 carries the wrap.
+      inout.v512[0] = _mm512_xor_si512( inout.v512[0],
+                                   _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
+      inout.v512[1] = _mm512_xor_si512( inout.v512[1],
+                                   _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
+      inout.v512[2] = _mm512_xor_si512( inout.v512[2],
+                                   _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
+      // Scatter the updated slots back to the row each one was gathered from.
+      inout0[0] = inout.v256[0];
+      inout1[1] = inout.v256[1];
+      inout0[2] = inout.v256[2];
+      inout1[3] = inout.v256[3];
+      inout0[4] = inout.v256[4];
+      inout1[5] = inout.v256[5];
+       // inout0/inout1 are __m256i pointers, hence twice the step of in/out.
+       //Goes to next block
+       in     += BLOCK_LEN_M256I;
+       inout0 += BLOCK_LEN_M256I * 2;
+       inout1 += BLOCK_LEN_M256I * 2;
+       out    += BLOCK_LEN_M256I;
+   }
+   // Persist the updated sponge state.
+   _mm512_store_si512( (__m512i*)State,     state0 );
+   _mm512_store_si512( (__m512i*)State + 1, state1 );
+   _mm512_store_si512( (__m512i*)State + 2, state2 );
+   _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+#endif // AVX512
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -375,7 +375,10 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
    {
       _mm_prefetch( out -  9, _MM_HINT_T0 );
       _mm_prefetch( out - 11, _MM_HINT_T0 );
-                   
+
+//printf("S RSR0 col= %d, out= %x\n",i,out);
+
+
       out[0] = state0;
       out[1] = state1;
       out[2] = state2;
@@ -706,11 +709,34 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
       out[1] = _mm256_xor_si256( state1, in[1] );
       out[2] = _mm256_xor_si256( state2, in[2] );

+/*
+printf("s duplexsetup col= %d\n",i); 
+uint64_t * o = (uint64_t*)out;
+printf("S out %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
+printf("S out %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
+printf("S out %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
+printf("S out %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
+printf("S out %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
+printf("S out %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
+*/
+
       //M[row*][col] = M[row*][col] XOR rotW(rand)
       t0 = _mm256_permute4x64_epi64( state0, 0x93 );
       t1 = _mm256_permute4x64_epi64( state1, 0x93 );
       t2 = _mm256_permute4x64_epi64( state2, 0x93 );

+/*
+uint64_t *t = (uint64_t*)&t0;
+printf("S t0 %016lx %016lx %016lx %016lx\n",t[0],t[1],t[2],t[3]);
+
+o = (uint64_t*)inout;
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
+*/       
       inout[0] = _mm256_xor_si256( inout[0],
                                    _mm256_blend_epi32( t0, t2, 0x03 ) );
       inout[1] = _mm256_xor_si256( inout[1],
@@ -718,7 +744,17 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
       inout[2] = _mm256_xor_si256( inout[2],
                                    _mm256_blend_epi32( t2, t1, 0x03 ) );

-       //Inputs: next column (i.e., next block in sequence)
+/*
+o = (uint64_t*)inout;
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
+*/
+
+//Inputs: next column (i.e., next block in sequence)
       in    += BLOCK_LEN_M256I;
       inout += BLOCK_LEN_M256I;
       //Output: goes to previous column
@@ -949,6 +985,22 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
      _mm_prefetch( inout +  9, _MM_HINT_T0 );
      _mm_prefetch( inout + 11, _MM_HINT_T0 );

+/*
+uint64_t *io = (uint64_t*)inout;
+uint64_t *ii = (uint64_t*)in;
+
+printf("RDRS1 col= %d\n", i);
+printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
+printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
+printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
+printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
+printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]);
+printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]);
+printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]);
+printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]);
+*/
+
+
      //Absorbing "M[prev] [+] M[row*]"
      state0 = _mm256_xor_si256( state0,
                                     _mm256_add_epi64( in[0], inout[0] ) );
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -52,8 +52,46 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // However, 2 way parallel looks trivial to code for AVX512 except for
 // a data dependency with rowa.

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+// G step on every 64-bit element of a,b,c,d: add, xor, rotate right 32/24/16/63.
+#define G2W_4X64(a,b,c,d) \
+   a = _mm512_add_epi64( a, b ); \
+   d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
+   c = _mm512_add_epi64( c, d ); \
+   b = mm512_ror_64( _mm512_xor_si512( b, c ), 24 ); \
+   a = _mm512_add_epi64( a, b ); \
+   d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
+   c = _mm512_add_epi64( c, d ); \
+   b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
+// One round: G, permute s1/s2/s3 via the mm512_* helpers, G, then the reverse helpers.
+#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   G2W_4X64( s0, s1, s2, s3 ); \
+   s1 = mm512_ror256_64( s1); \
+   s2 = mm512_swap256_128( s2 ); \
+   s3 = mm512_rol256_64( s3 ); \
+   G2W_4X64( s0, s1, s2, s3 ); \
+   s1 = mm512_rol256_64( s1 ); \
+   s2 = mm512_swap256_128( s2 ); \
+   s3 = mm512_ror256_64( s3 );
+// Twelve rounds back to back; each round expansion already ends in ';'.
+#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 )
+
+
+#endif  // AVX512
+
 #if defined __AVX2__
-// only available with avx2

 // process 4 columns in parallel
 // returns void, updates all args
@@ -89,9 +127,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
-   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 )

-#elif defined(__SSE2__)
+#endif
+
+#if defined(__SSE2__)

 // process 2 columns in parallel
 // returns void, all args updated
@@ -108,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_ror1x64_256( s2, s3 ); \
-   mm128_swap128_256( s4, s5 ); \
-   mm128_rol1x64_256( s6, s7 ); \
+   mm128_ror256_64( s2, s3 ); \
+   mm128_swap256_128( s4, s5 ); \
+   mm128_rol256_64( s6, s7 ); \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_rol1x64_256( s2, s3 ); \
-   mm128_swap128_256( s4, s5 ); \
-   mm128_ror1x64_256( s6, s7 );
+   mm128_rol256_64( s2, s3 ); \
+   mm128_swap256_128( s4, s5 ); \
+   mm128_ror256_64( s6, s7 );

 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
@@ -129,7 +169,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
-   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7)


 #endif // AVX2 else SSE2
@@ -161,6 +201,42 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    G(r,7,v[ 3],v[ 4],v[ 9],v[14]);


+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+// Pointer overlay: one stored address viewed as __m512i*, __m256i* or uint64_t*.
+union _povly
+{
+   __m512i *v512;
+   __m256i *v256;
+   uint64_t *u64;
+};
+typedef union _povly povly;
+
+//---- Housekeeping
+void initState_2way( uint64_t State[/*16*/] );
+
+//---- Squeezes
+void squeeze_2way( uint64_t *State, unsigned char *out, unsigned int len );
+void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );
+
+//---- Absorbs
+void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
+                       const uint64_t *In1 );
+void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
+                            const uint64_t nBlocks, const uint64_t block_len );
+
+//---- Duplexes
+void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
+                             uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
+                    uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
+
+void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
+                            uint64_t *rowInOut0, uint64_t *rowInOut1,
+                            uint64_t *rowOut, uint64_t nCols);
+
+#endif
+
+
 //---- Housekeeping
 void initState(uint64_t state[/*16*/]);

@@ -178,20 +254,4 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint6
 void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
 void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);

-//---- Misc
-void printArray(unsigned char *array, unsigned int size, char *name);
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-////TESTS////
-//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
-//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-/////////////
-
-
 #endif /* SPONGE_H_ */
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -3,22 +3,129 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
-#if defined(NIST5_4WAY)
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"

-void nist5hash_4way( void *out, const void *input )
+#if defined(NIST5_8WAY)
+// 8-way NIST5: Blake, Groestl, JH, Keccak, Skein in that order; hashes 80 bytes per lane from input, result goes to out.
+void nist5hash_8way( void *out, const void *input )
 {
+     uint64_t vhash[8*16] __attribute__ ((aligned (128)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     // One context per stage; the single Groestl context is re-initialised before every lane.
+     blake512_8way_context  ctx_blake;
+     hashState_groestl      ctx_groestl;
+     jh512_8way_context     ctx_jh;
+     skein512_8way_context  ctx_skein;
+     keccak512_8way_context ctx_keccak;
+     // Stage 1: Blake over the 80 input bytes, all 8 lanes in one call.
+     blake512_8way_init( &ctx_blake );
+     blake512_8way_update( &ctx_blake, input, 80 );
+     blake512_8way_close( &ctx_blake, vhash );
+     // Groestl is called one lane at a time, so split vhash into per-lane buffers first.
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash, 512 );
+     // Stage 2: Groestl on each lane in turn, hashing in place.
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash0,
+                               (const char*)hash0, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash1,
+                               (const char*)hash1, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash2,
+                               (const char*)hash2, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash3,
+                               (const char*)hash3, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash4,
+                               (const char*)hash4, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash5,
+                               (const char*)hash5, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash6,
+                               (const char*)hash6, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash7,
+                               (const char*)hash7, 512 );
+     // Re-interleave the per-lane results for the remaining 8-way stages.
+     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                         hash7, 512 );
+     // Stage 3: JH over the 64-byte intermediate hash.
+     jh512_8way_init( &ctx_jh );
+     jh512_8way_update( &ctx_jh, vhash, 64 );
+     jh512_8way_close( &ctx_jh, vhash );
+     // Stage 4: Keccak.
+     keccak512_8way_init( &ctx_keccak );
+     keccak512_8way_update( &ctx_keccak, vhash, 64 );
+     keccak512_8way_close( &ctx_keccak, vhash );
+     // Stage 5: Skein, closed directly into out (no de-interleave is done here).
+     skein512_8way_init( &ctx_skein );
+     skein512_8way_update( &ctx_skein, vhash, 64 );
+     skein512_8way_close( &ctx_skein, out );
+}
+// Scan nonces 8 per pass starting at pdata[19]; lanes that pass fulltest are submitted. Sets *hashes_done and returns 0.
+int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{
+     uint32_t hash[16*8] __attribute__ ((aligned (128)));
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+     uint32_t *hash7 = &(hash[49]);   // word compared with Htarg; lane i is at hash7[ i<<1 ]
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     const uint32_t Htarg = ptarget[7];
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+     int thr_id = mythr->id;  // index into work_restart[]
+     // Build the 8-lane input in vdata from pdata (byte-swapped and interleaved, per the helper's name).
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+     // Each pass blends nonces n..n+7 into the nonce vector, hashes all lanes, then filters.
+     do {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                 n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+
+        nist5hash_8way( hash, vdata );
+        // Cheap pre-filter on one 32-bit word per lane before the full target test.
+        for ( int lane = 0; lane < 8; lane++ )
+        if ( hash7[ lane<<1 ] < Htarg )
+        {
+           extr_lane_8x64( lane_hash, hash, lane, 256 );
+           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+           {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+           }
+        }
+        n += 8;
+     } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );   // NOTE(review): max_nonce-8 wraps if max_nonce < 8
+     *hashes_done = n - first_nonce;
+     return 0;
+}
+
+#elif defined(NIST5_4WAY)
+
+void nist5hash_4way( void *out, const void *input )
+{
+     uint64_t vhash[8*4] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
-     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     blake512_4way_context  ctx_blake;
     hashState_groestl      ctx_groestl;
     jh512_4way_context     ctx_jh;
@@ -62,62 +169,39 @@ void nist5hash_4way( void *out, const void *input )
 int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
+     uint32_t vdata[4*24] __attribute__ ((aligned (128)));
     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t *hash7 = &(hash[25]);
     uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     const uint32_t Htarg = ptarget[7];
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-     int thr_id = mythr->id;  // thr_id arg is deprecated
-
-     uint64_t htmax[] = {          0,
-                                 0xF,
-                                0xFF,
-                               0xFFF,
-                              0xFFFF,
-                          0x10000000 };
-
-     uint32_t masks[] = { 0xFFFFFFFF,
-                          0xFFFFFFF0,
-                          0xFFFFFF00,
-                          0xFFFFF000,
-                          0xFFFF0000,
-                                   0 };
+     int thr_id = mythr->id;  

     mm256_bswap32_intrlv80_4x64( vdata, pdata );

-     for ( int m=0; m < 6; m++ )
-     {
-        if (Htarg <= htmax[m])
+     do {
+        *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+               _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+
+        nist5hash_4way( hash, vdata );
+
+        for ( int lane = 0; lane < 4; lane++ )
+        if ( hash7[ lane<<1 ] < Htarg )
        {
-           uint32_t mask = masks[m];
-
-           do {
-              *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-
-              nist5hash_4way( hash, vdata );
-
-              for ( int lane = 0; lane < 4; lane++ )
-              if ( ( hash7[ lane ] & mask ) == 0 )
-              {
-                 extr_lane_4x64( lane_hash, hash, lane, 256 );
-                 if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-                 {
-                    pdata[19] = n + lane;
-                    submit_lane_solution( work, lane_hash, mythr, lane );
-                 }
-              }
-              n += 4;
-           } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
-           break;
+           extr_lane_4x64( lane_hash, hash, lane, 256 );
+           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+           {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+           }
        }
-     }
-     *hashes_done = n - first_nonce + 1;
+        n += 4;
+     } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
     return 0;
 }

--- a/algo/nist5/nist5-gate.c
+++ b/algo/nist5/nist5-gate.c
@@ -2,8 +2,11 @@

 bool register_nist5_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-#if defined (NIST5_4WAY)
+    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+#if defined (NIST5_8WAY)
+    gate->scanhash = (void*)&scanhash_nist5_8way;
+    gate->hash     = (void*)&nist5hash_8way;
+#elif defined (NIST5_4WAY)
    gate->scanhash = (void*)&scanhash_nist5_4way;
    gate->hash     = (void*)&nist5hash_4way;
 #else
--- a/algo/nist5/nist5-gate.h
+++ b/algo/nist5/nist5-gate.h
@@ -1,14 +1,23 @@
 #ifndef __NIST5_GATE_H__
-#define __NIST5_GATE_H__
+#define __NIST5_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define NIST5_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define NIST5_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define NIST5_4WAY 1
 #endif

-#if defined(NIST5_4WAY)
+#if defined(NIST5_8WAY)
+
+void nist5hash_8way( void *state, const void *input );
+
+int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(NIST5_4WAY)

 void nist5hash_4way( void *state, const void *input );

--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
--- a/algo/quark/hmq1725-gate.c
+++ b/algo/quark/hmq1725-gate.c
@@ -2,7 +2,10 @@

 bool register_hmq1725_algo( algo_gate_t* gate )
 {
-#if defined(HMQ1725_4WAY)
+#if defined(HMQ1725_8WAY)
+  gate->scanhash  = (void*)&scanhash_hmq1725_8way;
+  gate->hash      = (void*)&hmq1725_8way_hash;
+#elif defined(HMQ1725_4WAY)
  gate->scanhash  = (void*)&scanhash_hmq1725_4way;
  gate->hash      = (void*)&hmq1725_4way_hash;
 #else
@@ -10,7 +13,7 @@ bool register_hmq1725_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_hmq1725;
  gate->hash      = (void*)&hmq1725hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  opt_target_factor = 65536.0;
  return true;
 };
--- a/algo/quark/hmq1725-gate.h
+++ b/algo/quark/hmq1725-gate.h
@@ -4,13 +4,21 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-//  #define HMQ1725_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define HMQ1725_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define HMQ1725_4WAY 1
 #endif

 bool register_hmq1725_algo( algo_gate_t* gate );

-#if defined(HMQ1725_4WAY)
+#if defined(HMQ1725_8WAY)
+
+void hmq1725_8way_hash( void *state, const void *input );
+int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(HMQ1725_4WAY)

 void hmq1725_4way_hash( void *state, const void *input );
 int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -333,6 +333,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFFFFF)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -346,6 +347,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFFFF0)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -359,6 +361,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFFF00)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -372,6 +375,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFF000)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -386,6 +390,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFF0000)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -399,6 +404,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			hmq1725hash(hash64, endiandata);
 			if (fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -1,12 +1,8 @@
 #include "cpuminer-config.h"
 #include "quark-gate.h"
-
-#if defined (QUARK_4WAY)
-
 #include <stdio.h>
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
@@ -14,6 +10,244 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"

+#if defined (QUARK_8WAY)
+// Contexts for the six hash functions used by 8-way Quark; Groestl is the only one without an 8way context type.
+typedef struct {
+    blake512_8way_context  blake;
+    bmw512_8way_context    bmw;
+    hashState_groestl      groestl;
+    jh512_8way_context     jh;
+    skein512_8way_context  skein;
+    keccak512_8way_context keccak;
+} quark_8way_ctx_holder;
+// Template contexts: filled in by init_quark_8way_ctx(), copied by quark_8way_hash() on every call.
+quark_8way_ctx_holder quark_8way_ctx __attribute__ ((aligned (128)));
+// Initialise every member of the quark_8way_ctx template (Groestl is given the parameter 64).
+void init_quark_8way_ctx()
+{
+     blake512_8way_init( &quark_8way_ctx.blake );
+     bmw512_8way_init( &quark_8way_ctx.bmw );
+     init_groestl( &quark_8way_ctx.groestl, 64 );
+     skein512_8way_init( &quark_8way_ctx.skein );
+     jh512_8way_init( &quark_8way_ctx.jh );
+     keccak512_8way_init( &quark_8way_ctx.keccak );
+}
+// 8-way Quark: nine hash stages, three of them selected per lane by bit 3 (value 8) of the lane's first 64-bit word.
+void quark_8way_hash( void *state, const void *input )
+{
+    uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+    uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
+    uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
+    uint64_t hash0[8] __attribute__ ((aligned (64)));
+    uint64_t hash1[8] __attribute__ ((aligned (64)));
+    uint64_t hash2[8] __attribute__ ((aligned (64)));
+    uint64_t hash3[8] __attribute__ ((aligned (64)));
+    uint64_t hash4[8] __attribute__ ((aligned (64)));
+    uint64_t hash5[8] __attribute__ ((aligned (64)));
+    uint64_t hash6[8] __attribute__ ((aligned (64)));
+    uint64_t hash7[8] __attribute__ ((aligned (64)));
+    __m512i* vh  = (__m512i*)vhash;
+    __m512i* vhA = (__m512i*)vhashA;
+    __m512i* vhB = (__m512i*)vhashB;
+    __mmask8 vh_mask;
+    quark_8way_ctx_holder ctx;
+    const uint32_t mask = 8;
+    const __m512i bit3_mask = m512_const1_64( mask );
+    const __m512i zero = _mm512_setzero_si512();
+    // Start from the pre-initialised template contexts.
+    memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
+    // Blake over the 80-byte input, then BMW over the 64-byte result.
+    blake512_8way_update( &ctx.blake, input, 80 );
+    blake512_8way_close( &ctx.blake, vhash );
+
+    bmw512_8way_update( &ctx.bmw, vhash, 64 );
+    bmw512_8way_close( &ctx.bmw, vhash );
+    // vh_mask bit i is set when ( element i of vh[0] & 8 ) == 0, i.e. bit 3 is clear.
+    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
+                                       zero );
+
+    dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                  vhash, 512 );
+    // Lanes with bit 3 set get Groestl, one lane at a time, in place; the first call uses the copied context as-is.
+    if ( hash0[0] & mask )
+    {
+       update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                               (char*)hash0, 512 );
+    }
+    if ( hash1[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                               (char*)hash1, 512 );
+    }
+    if ( hash2[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                               (char*)hash2, 512 );
+    }
+    if ( hash3[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                               (char*)hash3, 512 );
+    }
+    if ( hash4[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash4,
+                                               (char*)hash4, 512 );
+    }
+    if ( hash5[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash5,
+                                               (char*)hash5, 512 );
+    }
+    if ( hash6[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash6,
+                                               (char*)hash6, 512 );
+    }
+    if ( hash7[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash7,
+                                               (char*)hash7, 512 );
+    }
+
+    intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                         hash7, 512 );
+    // Skein runs on all lanes, into vhashB, if at least one lane has bit 3 clear.
+    if ( vh_mask & 0xff )
+    {
+       skein512_8way_update( &ctx.skein, vhash, 64 );
+       skein512_8way_close( &ctx.skein, vhashB );
+    }
+    // Per-lane merge into vhash; presumably vhB where the mask bit is set, else vhA -- confirm in mm512_blend_hash_8x64.
+    mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
+    // Unconditional Groestl on every lane.
+    dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                  vhash, 512 );
+
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+    intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 512 );
+    // JH on all lanes, then recompute the bit-3 mask from its output.
+    jh512_8way_update( &ctx.jh, vhash, 64 );
+    jh512_8way_close( &ctx.jh, vhash );
+
+    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
+                                       zero );
+    // Second branch: Blake (vhashA) is skipped only if every lane has bit 3 clear; BMW (vhashB) only if none does.
+    if ( ( vh_mask & 0xff ) != 0xff )
+    {
+       blake512_8way_init( &ctx.blake );
+       blake512_8way_update( &ctx.blake, vhash, 64 );
+       blake512_8way_close( &ctx.blake, vhashA );
+    }
+
+    if ( vh_mask & 0xff )
+    {
+       bmw512_8way_init( &ctx.bmw );
+       bmw512_8way_update( &ctx.bmw, vhash, 64 );
+       bmw512_8way_close( &ctx.bmw, vhashB );
+    }
+
+    mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
+    // Keccak with the copied context, then Skein (re-initialised: ctx.skein may already have been used above).
+    keccak512_8way_update( &ctx.keccak, vhash, 64 );
+    keccak512_8way_close( &ctx.keccak, vhash );
+
+    skein512_8way_init( &ctx.skein );
+    skein512_8way_update( &ctx.skein, vhash, 64 );
+    skein512_8way_close( &ctx.skein, vhash );
+    // Third branch, same mask convention: Keccak result goes to vhashA, JH result to vhashB.
+    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
+                                       zero );
+
+    if ( ( vh_mask & 0xff ) != 0xff )
+    {
+       keccak512_8way_init( &ctx.keccak );
+       keccak512_8way_update( &ctx.keccak, vhash, 64 );
+       keccak512_8way_close( &ctx.keccak, vhashA );
+    }
+
+    if ( vh_mask & 0xff )
+    {
+       jh512_8way_init( &ctx.jh );
+       jh512_8way_update( &ctx.jh, vhash, 64 );
+       jh512_8way_close( &ctx.jh, vhashB );
+    }
+
+    // Final blend, directly to state: a set mask bit takes vhB, else vhA. Only 4 vectors (32 bytes per lane) are stored.
+    casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] );
+    casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] );
+    casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] );
+    casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] );
+}
+// Scan nonces 8 per pass starting at pdata[19]; lanes that pass fulltest are submitted. Sets *hashes_done and returns 0.
+int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{
+    uint32_t hash[8*8] __attribute__ ((aligned (128)));
+    uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+    uint32_t *hash7 = &(hash[49]);   // word used by the pre-filter; lane i is at hash7[ i<<1 ]
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint32_t n = pdata[19];
+    const uint32_t first_nonce = pdata[19];
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+    int thr_id = mythr->id;  // index into work_restart[]
+    // Build the 8-lane input in vdata from pdata (byte-swapped and interleaved, per the helper's name).
+    mm512_bswap32_intrlv80_8x64( vdata, pdata );
+    do
+    {
+       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+
+       quark_8way_hash( hash, vdata );
+       pdata[19] = n;   // replaced by n+i below when lane i passes fulltest
+       // Pre-filter: the checked 32-bit word of lane i must have its upper 24 bits clear.
+       for ( int i = 0; i < 8; i++ )
+       if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
+       {
+          extr_lane_8x64( lane_hash, hash, i, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark  )
+          {
+            pdata[19] = n+i;
+            submit_lane_solution( work, lane_hash, mythr, i );
+          }
+       }
+       n += 8;
+    } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );   // NOTE(review): max_nonce-8 wraps if max_nonce < 8
+
+    *hashes_done = n - first_nonce;
+    return 0;
+}
+
+
+#elif defined (QUARK_4WAY)
+
 typedef struct {
    blake512_4way_context  blake;
    bmw512_4way_context    bmw;
@@ -91,7 +325,7 @@ void quark_4way_hash( void *state, const void *input )

    intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

-    if ( mm256_anybits0( vh_mask ) )   
+    if ( mm256_anybits1( vh_mask ) )   
    {
       skein512_4way( &ctx.skein, vhash, 64 );
       skein512_4way_close( &ctx.skein, vhashB );
@@ -117,14 +351,14 @@ void quark_4way_hash( void *state, const void *input )

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

-    if ( mm256_anybits1( vh_mask ) )
+    if ( mm256_anybits0( vh_mask ) )   
    {
       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
       blake512_4way_close( &ctx.blake, vhashA );
    }

-    if ( mm256_anybits0( vh_mask ) )
+    if ( mm256_anybits1( vh_mask ) )
    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way( &ctx.bmw, vhash, 64 );
@@ -142,14 +376,14 @@ void quark_4way_hash( void *state, const void *input )

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

-    if ( mm256_anybits1( vh_mask ) )
+    if ( mm256_anybits0( vh_mask ) )    
    {
       keccak512_4way_init( &ctx.keccak );
       keccak512_4way( &ctx.keccak, vhash, 64 );
       keccak512_4way_close( &ctx.keccak, vhashA );
    }

-    if ( mm256_anybits0( vh_mask ) )
+    if ( mm256_anybits1( vh_mask ) )
    {
       jh512_4way_init( &ctx.jh );
       jh512_4way( &ctx.jh, vhash, 64 );
--- a/algo/quark/quark-gate.c
+++ b/algo/quark/quark-gate.c
@@ -2,7 +2,11 @@

 bool register_quark_algo( algo_gate_t* gate )
 {
-#if defined (QUARK_4WAY)
+#if defined (QUARK_8WAY)
+  init_quark_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_quark_8way;
+  gate->hash      = (void*)&quark_8way_hash;
+#elif defined (QUARK_4WAY)
  init_quark_4way_ctx();
  gate->scanhash  = (void*)&scanhash_quark_4way;
  gate->hash      = (void*)&quark_4way_hash;
@@ -11,7 +15,7 @@ bool register_quark_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_quark;
  gate->hash      = (void*)&quark_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/quark/quark-gate.h
+++ b/algo/quark/quark-gate.h
@@ -4,13 +4,22 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define QUARK_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define QUARK_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define QUARK_4WAY 1
 #endif

 bool register_quark_algo( algo_gate_t* gate );

-#if defined(QUARK_4WAY)
+#if defined(QUARK_8WAY)
+
+void quark_8way_hash( void *state, const void *input );
+int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_quark_8way_ctx();
+
+#elif defined(QUARK_4WAY)

 void quark_4way_hash( void *state, const void *input );
 int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -1,17 +1,134 @@
 #include "qubit-gate.h"
-
-#if defined(QUBIT_2WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/cubehash_sse2.h" 
+#include "algo/cubehash/cube-hash-2way.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined(QUBIT_4WAY)
+
+typedef struct   // one hashing context per stage of the 4-way qubit chain
+{
+    luffa_4way_context      luffa;     // stage 1; initialised and pre-fed in scanhash_qubit_4way
+    cube_4way_context       cube;      // stage 2
+    sph_shavite512_context  shavite;   // stage 3, run once per lane
+    simd_4way_context       simd;      // stage 4
+    simd_2way_context       simd2;     // NOTE(review): initialised in init_qubit_4way_ctx but not used by qubit_4way_hash
+    hashState_echo          echo;      // stage 5, run once per lane
+} qubit_4way_ctx_holder;
+
+qubit_4way_ctx_holder qubit_4way_ctx;   // template that qubit_4way_hash copies at the start of every call
+
+void init_qubit_4way_ctx()   // fills the template context; register_qubit_algo calls this when QUBIT_4WAY is defined
+{
+    cube_4way_init( &qubit_4way_ctx.cube, 512, 16, 32 );
+    sph_shavite512_init(&qubit_4way_ctx.shavite);
+    simd_4way_init( &qubit_4way_ctx.simd, 512 );
+    simd_2way_init( &qubit_4way_ctx.simd2, 512 );
+    init_echo(&qubit_4way_ctx.echo, 512);
+};   // the luffa member is not initialised here; scanhash_qubit_4way does that
+
+void qubit_4way_hash( void *output, const void *input )   // runs luffa, cubehash, shavite, simd, echo on 4 lanes; writes 4 x 32 bytes to output
+{
+     uint32_t vhash[16*4] __attribute__ ((aligned (128)));   // interleaved buffer holding all four 64-byte hashes
+     uint32_t hash0[16] __attribute__ ((aligned (64)));      // de-interleaved per-lane hashes
+     uint32_t hash1[16] __attribute__ ((aligned (64)));
+     uint32_t hash2[16] __attribute__ ((aligned (64)));
+     uint32_t hash3[16] __attribute__ ((aligned (64)));
+     qubit_4way_ctx_holder ctx;
+
+     memcpy( &ctx, &qubit_4way_ctx, sizeof(qubit_4way_ctx) );   // work on a private copy of the template
+
+     luffa_4way_update( &ctx.luffa, input + (64<<2), 16 );   // skip the 64<<2 bytes that scanhash_qubit_4way already fed to luffa
+     luffa_4way_close( &ctx.luffa, vhash );
+     
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );   // in place on the interleaved buffer
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );  // split lanes for the scalar shavite stage
+     
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,   // restore the initial shavite state before the next lane
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );   // re-interleave for the 4-way simd stage
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );  // length argument is 512 here, unlike the 64 passed to cubehash/shavite
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,   // echo runs per lane, in place
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );   // restore the initial echo state before the next lane
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+
+     memcpy( output,    hash0, 32 );   // only the first 32 bytes of each lane's hash are returned
+     memcpy( output+32, hash1, 32 );
+     memcpy( output+64, hash2, 32 );
+     memcpy( output+96, hash3, 32 );
+}
+
+int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,   // tries nonces four at a time; always returns 0, reports hits via submit_lane_solution
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (128)));   // four 8-word results written by qubit_4way_hash
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));  // work data interleaved for 4 lanes
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];   // scanning starts at the nonce stored in word 19
+     const uint32_t first_nonce = pdata[19];
+     uint32_t *noncep = vdata + 64+3;   // 4*16 + 3: lane 0's nonce word; lanes 1-3 follow at a stride of 4 words
+     int thr_id = mythr->id;
+     const uint32_t Htarg = ptarget[7];
+
+     mm512_bswap32_intrlv80_4x128( vdata, pdata );
+     luffa_4way_init( &qubit_4way_ctx.luffa, 512 );   // NOTE(review): writes the file-scope template; confirm this is safe if several threads scan concurrently
+     luffa_4way_update( &qubit_4way_ctx.luffa, vdata, 64 );   // pre-feed the part of the data that does not contain the nonce
+
+     do
+     {
+        be32enc( noncep,    n   );   // lanes get consecutive nonces n .. n+3
+        be32enc( noncep+ 4, n+1 );
+        be32enc( noncep+ 8, n+2 );
+        be32enc( noncep+12, n+3 );
+
+        qubit_4way_hash( hash, vdata );
+        pdata[19] = n;
+
+        for ( int lane = 0; lane < 4; lane++ )
+        if ( ( hash+(lane<<3) )[7] < Htarg )   // cheap check on word 7 before the full comparison
+        if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
+        {
+           pdata[19] = n + lane;   // record the winning nonce before submitting
+           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
+        }
+        n += 4;
+     } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;   // number of nonces tried in this call
+     return 0;
+}
+
+#elif defined(QUBIT_2WAY)
+
 typedef struct
 {
        luffa_2way_context      luffa;
--- a/algo/qubit/qubit-gate.c
+++ b/algo/qubit/qubit-gate.c
@@ -2,7 +2,12 @@

 bool register_qubit_algo( algo_gate_t* gate )
 {
-#if defined (QUBIT_2WAY)
+   
+#if defined (QUBIT_4WAY)
+  init_qubit_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_qubit_4way;
+  gate->hash      = (void*)&qubit_4way_hash;
+#elif defined (QUBIT_2WAY)
  init_qubit_2way_ctx();
  gate->scanhash  = (void*)&scanhash_qubit_2way;
  gate->hash      = (void*)&qubit_2way_hash;
@@ -11,7 +16,7 @@ bool register_qubit_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_qubit;
  gate->hash      = (void*)&qubit_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/qubit/qubit-gate.h
+++ b/algo/qubit/qubit-gate.h
@@ -4,13 +4,23 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define QUBIT_2WAY
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define QUBIT_4WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define QUBIT_2WAY 1
 #endif

 bool register_qubit_algo( algo_gate_t* gate );

-#if defined(QUBIT_2WAY)
+#if defined(QUBIT_4WAY)
+
+void qubit_4way_hash( void *state, const void *input );   // installed as gate->hash by register_qubit_algo when QUBIT_4WAY is defined
+int scanhash_qubit_4way( struct work *work, uint32_t max_nonce,   // installed as gate->scanhash in the same build
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_qubit_4way_ctx();   // called by register_qubit_algo before the two callbacks are installed
+
+#elif defined(QUBIT_2WAY)

 void qubit_2way_hash( void *state, const void *input );
 int scanhash_qubit_2way( struct work *work, uint32_t max_nonce,
--- a/algo/sha/sha-hash-4way.h
+++ b/algo/sha/sha-hash-4way.h
@@ -56,7 +56,7 @@ typedef struct {
   __m128i val[8];
   uint32_t count_high, count_low;
   bool initialized;
-} sha256_4way_context;
+} sha256_4way_context __attribute__ ((aligned (64)));

 void sha256_4way_init( sha256_4way_context *sc );
 void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
@@ -71,7 +71,7 @@ typedef struct {
   __m256i val[8];
   uint32_t count_high, count_low;
   bool initialized;
-} sha256_8way_context;
+} sha256_8way_context __attribute__ ((aligned (128)));

 void sha256_8way_init( sha256_8way_context *sc );
 void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
@@ -86,30 +86,32 @@ typedef struct {
   __m256i val[8];
   uint64_t count;
   bool initialized;
-} sha512_4way_context;
+} sha512_4way_context __attribute__ ((aligned (128)));

 void sha512_4way_init( sha512_4way_context *sc);
-void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
+void sha512_4way_update( sha512_4way_context *sc, const void *data,
+                         size_t len );
+#define sha512_4way sha512_4way_update
 void sha512_4way_close( sha512_4way_context *sc, void *dst );

-// SHA-256 11 way hybrid
-// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// SHA-512 8 way
+
 typedef struct {
-   __m256i  bufx[64>>2];
-   __m256i  valx[8];
-   __m64    bufy[64>>2];
-   __m64    valy[8];
-   uint32_t bufz[64>>2];
-   uint32_t valz[8];
-   uint32_t count_high, count_low;
-} sha256_11way_context;
+   __m512i buf[128>>3];
+   __m512i val[8];
+   uint64_t count;
+   bool initialized;
+} sha512_8way_context __attribute__ ((aligned (128)));

-void sha256_11way_init( sha256_11way_context *ctx );
-void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
-	                 const void *datay, const void *dataz, size_t len );
-void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
-	                 void *dstz  );
+void sha512_8way_init( sha512_8way_context *sc);   // clears the byte count and the initialized flag
+void sha512_8way_update( sha512_8way_context *sc, const void *data,   // buffers input and runs a round per full 128-byte block
+                         size_t len );
+void sha512_8way_close( sha512_8way_context *sc, void *dst );   // pads, appends the bit length, runs the last round, writes the digest to dst

+
+#endif  // AVX512
 #endif  // __AVX2__
 #endif  // __SSE2__
 #endif  // SHA256_4WAY_H__
--- a/algo/sha/sha512-hash-4way.c
+++ b/algo/sha/sha512-hash-4way.c
@@ -36,8 +36,6 @@
 #include <string.h>
 #include "sha-hash-4way.h"

-// SHA-512 4 way 64 bit
-
 /*
 static const sph_u64 H512[8] = {
        SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
@@ -90,6 +88,236 @@ static const sph_u64 K512[80] = {
 	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
 };

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// SHA-512 8 way 64 bit
+
+#define CH8W(X, Y, Z) /* bitwise select: ((Y ^ Z) & X) ^ Z takes Y where X is 1 and Z where X is 0 */ \
+   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) 
+
+#define MAJ8W(X, Y, Z) /* bitwise majority of the three inputs: (X & Y) | ((X | Y) & Z) */ \
+   _mm512_or_si512( _mm512_and_si512( X, Y ), \
+                    _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
+
+#define BSG8W_5_0(x) /* XOR of mm512_ror_64(x) by 28, 34 and 39; applied to A in SHA3_8WAY_STEP */ \
+   _mm512_xor_si512( _mm512_xor_si512( \
+        mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
+
+#define BSG8W_5_1(x) /* XOR of mm512_ror_64(x) by 14, 18 and 41; applied to E in SHA3_8WAY_STEP */ \
+   _mm512_xor_si512( _mm512_xor_si512( \
+        mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
+
+#define SSG8W_5_0(x) /* XOR of mm512_ror_64(x) by 1 and 8 with a logical right shift by 7 */ \
+   _mm512_xor_si512( _mm512_xor_si512( \
+        mm512_ror_64(x,  1), mm512_ror_64(x,  8) ), _mm512_srli_epi64(x, 7) ) 
+
+#define SSG8W_5_1(x) /* XOR of mm512_ror_64(x) by 19 and 61 with a logical right shift by 6 */ \
+   _mm512_xor_si512( _mm512_xor_si512( \
+        mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )
+
+static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )   // returns SSG8W_5_0(w0) + SSG8W_5_1(w1), with the two computations interleaved
+{
+   __m512i w0a, w1a, w0b, w1b;
+   w0a = mm512_ror_64( w0, 1 );
+   w1a = mm512_ror_64( w1,19 );
+   w0b = mm512_ror_64( w0, 8 );
+   w1b = mm512_ror_64( w1,61 );
+   w0a = _mm512_xor_si512( w0a, w0b );
+   w1a = _mm512_xor_si512( w1a, w1b );
+   w0b = _mm512_srli_epi64( w0, 7 );   // third term is a plain shift, not a rotate
+   w1b = _mm512_srli_epi64( w1, 6 );
+   w0a = _mm512_xor_si512( w0a, w0b );
+   w1a = _mm512_xor_si512( w1a, w1b );
+   return _mm512_add_epi64( w0a, w1a );   // 64-bit element-wise sum of the two results
+}
+
+
+#define SSG8W_512x2_0( w0, w1, i ) do /* same rotate/shift mix as SSG8W_5_0 for two words at once: W[i-15] -> w0, W[i-14] -> w1; needs an array W in scope */ \
+{ \
+   __m512i X0a, X1a, X0b, X1b; \
+  X0a = mm512_ror_64( W[i-15], 1 ); \
+  X1a = mm512_ror_64( W[i-14], 1 ); \
+  X0b = mm512_ror_64( W[i-15], 8 ); \
+  X1b = mm512_ror_64( W[i-14], 8 ); \
+  X0a = _mm512_xor_si512( X0a, X0b ); \
+  X1a = _mm512_xor_si512( X1a, X1b ); \
+  X0b = _mm512_srli_epi64( W[i-15], 7 ); \
+  X1b = _mm512_srli_epi64( W[i-14], 7 ); \
+  w0  = _mm512_xor_si512( X0a, X0b ); \
+  w1  = _mm512_xor_si512( X1a, X1b ); \
+} while(0)
+
+#define SSG8W_512x2_1( w0, w1, i ) do /* same rotate/shift mix as SSG8W_5_1 for two words at once: W[i-2] -> w0, W[i-1] -> w1; needs an array W in scope */ \
+{ \
+   __m512i X0a, X1a, X0b, X1b; \
+  X0a = mm512_ror_64( W[i-2],19 ); \
+  X1a = mm512_ror_64( W[i-1],19 ); \
+  X0b = mm512_ror_64( W[i-2],61 ); \
+  X1b = mm512_ror_64( W[i-1],61 ); \
+  X0a = _mm512_xor_si512( X0a, X0b ); \
+  X1a = _mm512_xor_si512( X1a, X1b ); \
+  X0b = _mm512_srli_epi64( W[i-2], 6 ); \
+  X1b = _mm512_srli_epi64( W[i-1], 6 ); \
+  w0  = _mm512_xor_si512( X0a, X0b ); \
+  w1  = _mm512_xor_si512( X1a, X1b ); \
+} while(0)
+
+#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) /* one round of sha512_8way_round (K512 constants, despite the SHA3 prefix); needs W and K512 in scope */ \
+do { \
+  __m512i T1, T2; \
+  __m512i K = _mm512_set1_epi64( K512[ i ] ); /* round constant broadcast to all lanes */ \
+  T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), /* mm512_add4_64 presumably sums its four arguments - confirm */ \
+                                           K, W[i] ) ); \
+  T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
+  D  = _mm512_add_epi64( D, T1 ); /* only D and H are written; callers rotate the argument order instead of moving registers */ \
+  H  = _mm512_add_epi64( T1, T2 ); \
+} while (0)
+
+static void   // compresses one buffered block from `in` into the chaining value r[0..7]
+sha512_8way_round( sha512_8way_context *ctx,  __m512i *in, __m512i r[8] )
+{
+   int i;
+   register __m512i A, B, C, D, E, F, G, H;
+   __m512i W[80];   // message schedule, one vector per 64-bit word
+
+   mm512_block_bswap_64( W  , in );     // W[0..15] come from the input block, byte-swapped
+   mm512_block_bswap_64( W+8, in+8 );   // presumably 8 vectors per call, so two calls cover 16 words - confirm
+
+   for ( i = 16; i < 80; i++ )   // expand the remaining 64 schedule words
+      W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
+                               _mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
+
+   if ( ctx->initialized )   // later blocks: start from the stored chaining value
+   {
+      A = r[0];
+      B = r[1];
+      C = r[2];
+      D = r[3];
+      E = r[4];
+      F = r[5];
+      G = r[6];
+      H = r[7];
+   }
+   else   // first block: r[] has not been written yet, so use the initial constants directly
+   {
+      A = m512_const1_64( 0x6A09E667F3BCC908 );
+      B = m512_const1_64( 0xBB67AE8584CAA73B );
+      C = m512_const1_64( 0x3C6EF372FE94F82B );
+      D = m512_const1_64( 0xA54FF53A5F1D36F1 );
+      E = m512_const1_64( 0x510E527FADE682D1 );
+      F = m512_const1_64( 0x9B05688C2B3E6C1F );
+      G = m512_const1_64( 0x1F83D9ABFB41BD6B );
+      H = m512_const1_64( 0x5BE0CD19137E2179 );
+   }
+
+   for ( i = 0; i < 80; i += 8 )   // 80 rounds, 8 per iteration; each step shifts the register roles by one
+   {
+      SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
+      SHA3_8WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
+      SHA3_8WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
+      SHA3_8WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
+      SHA3_8WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
+      SHA3_8WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
+      SHA3_8WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
+      SHA3_8WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
+   }
+
+   if ( ctx->initialized )   // feed-forward: add the round output to the previous chaining value
+   {
+      r[0] = _mm512_add_epi64( r[0], A );
+      r[1] = _mm512_add_epi64( r[1], B );
+      r[2] = _mm512_add_epi64( r[2], C );
+      r[3] = _mm512_add_epi64( r[3], D );
+      r[4] = _mm512_add_epi64( r[4], E );
+      r[5] = _mm512_add_epi64( r[5], F );
+      r[6] = _mm512_add_epi64( r[6], G );
+      r[7] = _mm512_add_epi64( r[7], H );
+   }
+   else   // first block: feed-forward against the same initial constants, then mark r[] as valid
+   {
+      ctx->initialized = true;
+      r[0] = _mm512_add_epi64( A, m512_const1_64( 0x6A09E667F3BCC908 ) );
+      r[1] = _mm512_add_epi64( B, m512_const1_64( 0xBB67AE8584CAA73B ) );
+      r[2] = _mm512_add_epi64( C, m512_const1_64( 0x3C6EF372FE94F82B ) );
+      r[3] = _mm512_add_epi64( D, m512_const1_64( 0xA54FF53A5F1D36F1 ) );
+      r[4] = _mm512_add_epi64( E, m512_const1_64( 0x510E527FADE682D1 ) );
+      r[5] = _mm512_add_epi64( F, m512_const1_64( 0x9B05688C2B3E6C1F ) );
+      r[6] = _mm512_add_epi64( G, m512_const1_64( 0x1F83D9ABFB41BD6B ) );
+      r[7] = _mm512_add_epi64( H, m512_const1_64( 0x5BE0CD19137E2179 ) );
+   }
+}
+
+void sha512_8way_init( sha512_8way_context *sc )   // prepares sc for a new message; buf and val are left untouched
+{
+   sc->initialized = false;   // makes the first sha512_8way_round start from the built-in constants
+   sc->count = 0;             // bytes absorbed so far
+}
+
+void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len )   // absorbs len bytes of data, running a round whenever the buffer fills
+{
+   __m512i *vdata = (__m512i*)data;   // input is consumed one __m512i per 8 bytes of len
+   size_t ptr;
+   const int buf_size = 128;
+
+   ptr = (unsigned)sc->count & (buf_size - 1U);   // byte offset inside the current 128-byte block
+   while ( len > 0 )
+   {
+      size_t clen;
+      clen = buf_size - ptr;   // room left in the buffer, capped to the remaining input
+      if ( clen > len )
+         clen = len;
+      memcpy_512( sc->buf + (ptr>>3), vdata, clen>>3 );   // >>3 turns bytes into vector counts; a len that is not a multiple of 8 would be truncated here
+      vdata = vdata + (clen>>3);
+      ptr += clen;
+      len -= clen;
+      if ( ptr == buf_size )   // buffer full: compress it and start a new block
+      {
+         sha512_8way_round( sc, sc->buf, sc->val );
+         ptr = 0;
+      }
+      sc->count += clen;
+   }
+}
+
+void sha512_8way_close( sha512_8way_context *sc, void *dst )   // finishes the message: padding, length, last round, then writes the byte-swapped state to dst
+{
+    unsigned ptr;
+    const int buf_size = 128;
+    const int pad = buf_size - 16;   // offset of the 16-byte length field at the end of the block
+    const __m512i shuff_bswap64 = m512_const_64(   // shuffle control that reverses the bytes of every 64-bit element
+                                    0x38393a3b3c3d3e3f, 0x3031323334353637,
+                                    0x28292a2b2c2d2e2f, 0x2021222324252627,
+                                    0x18191a1b1c1d1e1f, 0x1011121314151617,
+                                    0x08090a0b0c0d0e0f, 0x0001020304050607 );
+
+    ptr = (unsigned)sc->count & (buf_size - 1U);
+    sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );   // padding marker placed in the word after the data
+    ptr += 8;
+    if ( ptr > pad )   // no room left for the length field: flush this block and use an extra one
+    {
+         memset_zero_512( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
+         sha512_8way_round( sc, sc->buf, sc->val );
+         memset_zero_512( sc->buf, pad >> 3 );
+    }
+    else
+         memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );   // zero up to the length field
+
+    sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(   // high word of the bit length (count is in bytes); pre-swapped because the round swaps the whole block
+                       _mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
+    sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(   // low word of the bit length
+                       _mm512_set1_epi64( sc->count <<  3 ), shuff_bswap64 );
+    sha512_8way_round( sc, sc->buf, sc->val );
+
+    mm512_block_bswap_64( dst, sc->val );   // output is the byte-swapped chaining value
+}
+
+
+#endif   // AVX512
+
+// SHA-512 4 way 64 bit
+
+
 #define CH(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 

@@ -254,7 +482,7 @@ void sha512_4way_init( sha512_4way_context *sc )
   sc->count = 0;
 }

-void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
+void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   size_t ptr;
@@ -285,8 +513,10 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
    unsigned ptr;
    const int buf_size = 128;
    const int pad = buf_size - 16;
-    const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f,
-                                                  0x0001020304050607 );
+    const __m256i shuff_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f,
+                                                 0x1011121314151617,
+                                                 0x08090a0b0c0d0e0f,
+                                                 0x0001020304050607 );

    ptr = (unsigned)sc->count & (buf_size - 1U);
    sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -33,7 +33,7 @@
 #include <stddef.h>
 #include <string.h>

-#ifdef __AVX2__
+#ifdef __SSE4_1__

 #include "shabal-hash-4way.h"
 #ifdef __cplusplus
@@ -58,6 +58,599 @@ extern "C"{
 #define O2    9
 #define O3    6

+
+#if defined(__AVX2__)
+
+#define DECL_STATE8   /* declares the local working set used by the other *8 macros: 12 A, 16 B, 16 C and 16 M vectors plus the two counter halves */ \
+   __m256i A00, A01, A02, A03, A04, A05, A06, A07, \
+           A08, A09, A0A, A0B; \
+   __m256i B0, B1, B2, B3, B4, B5, B6, B7, \
+           B8, B9, BA, BB, BC, BD, BE, BF; \
+   __m256i C0, C1, C2, C3, C4, C5, C6, C7, \
+           C8, C9, CA, CB, CC, CD, CE, CF; \
+   __m256i M0, M1, M2, M3, M4, M5, M6, M7, \
+           M8, M9, MA, MB, MC, MD, ME, MF; \
+   sph_u32 Wlow, Whigh;
+
+#define READ_STATE8(state) do /* loads the DECL_STATE8 locals: from (state) if state_loaded is set, otherwise from the constants below */ \
+{ \
+   if ( (state)->state_loaded ) \
+   { \
+      A00 = (state)->A[0]; \
+      A01 = (state)->A[1]; \
+      A02 = (state)->A[2]; \
+      A03 = (state)->A[3]; \
+      A04 = (state)->A[4]; \
+      A05 = (state)->A[5]; \
+      A06 = (state)->A[6]; \
+      A07 = (state)->A[7]; \
+      A08 = (state)->A[8]; \
+      A09 = (state)->A[9]; \
+      A0A = (state)->A[10]; \
+      A0B = (state)->A[11]; \
+      B0 = (state)->B[0]; \
+      B1 = (state)->B[1]; \
+      B2 = (state)->B[2]; \
+      B3 = (state)->B[3]; \
+      B4 = (state)->B[4]; \
+      B5 = (state)->B[5]; \
+      B6 = (state)->B[6]; \
+      B7 = (state)->B[7]; \
+      B8 = (state)->B[8]; \
+      B9 = (state)->B[9]; \
+      BA = (state)->B[10]; \
+      BB = (state)->B[11]; \
+      BC = (state)->B[12]; \
+      BD = (state)->B[13]; \
+      BE = (state)->B[14]; \
+      BF = (state)->B[15]; \
+      C0 = (state)->C[0]; \
+      C1 = (state)->C[1]; \
+      C2 = (state)->C[2]; \
+      C3 = (state)->C[3]; \
+      C4 = (state)->C[4]; \
+      C5 = (state)->C[5]; \
+      C6 = (state)->C[6]; \
+      C7 = (state)->C[7]; \
+      C8 = (state)->C[8]; \
+      C9 = (state)->C[9]; \
+      CA = (state)->C[10]; \
+      CB = (state)->C[11]; \
+      CC = (state)->C[12]; \
+      CD = (state)->C[13]; \
+      CE = (state)->C[14]; \
+      CF = (state)->C[15]; \
+   } \
+   else /* first use after shabal_8way_init with size 512: take the initial values from immediates */ \
+   { \
+       (state)->state_loaded = true; \
+       A00 = m256_const1_64( 0x20728DFD20728DFD ); /* each 64-bit immediate repeats one 32-bit value twice */ \
+       A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
+       A02 = m256_const1_64( 0xE782B699E782B699 ); \
+       A03 = m256_const1_64( 0x5530463255304632 ); \
+       A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
+       A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
+       A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
+       A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
+       A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
+       A09 = m256_const1_64( 0x8BD144108BD14410 ); \
+       A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
+       A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
+       B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
+       B1 = m256_const1_64( 0x07B385F307B385F3 ); \
+       B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
+       B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \
+       B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \
+       B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \
+       B6 = m256_const1_64( 0x73B9D31473B9D314 ); \
+       B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \
+       B8 = m256_const1_64( 0x48910A5A48910A5A ); \
+       B9 = m256_const1_64( 0x893B22DB893B22DB ); \
+       BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \
+       BB = m256_const1_64( 0xBBC4324EBBC4324E ); \
+       BC = m256_const1_64( 0x72D2F24072D2F240 ); \
+       BD = m256_const1_64( 0x75941D9975941D99 ); \
+       BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \
+       BF = m256_const1_64( 0xA1A7502BA1A7502B ); \
+       C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \
+       C1 = m256_const1_64( 0x58BAD75058BAD750 ); \
+       C2 = m256_const1_64( 0x56028CB256028CB2 ); \
+       C3 = m256_const1_64( 0x8134F3598134F359 ); \
+       C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \
+       C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \
+       C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \
+       C7 = m256_const1_64( 0x0405278004052780 ); \
+       C8 = m256_const1_64( 0x7F07D7877F07D787 ); \
+       C9 = m256_const1_64( 0x5194358F5194358F ); \
+       CA = m256_const1_64( 0x3C60D6653C60D665 ); \
+       CB = m256_const1_64( 0xBE97D79ABE97D79A ); \
+       CC = m256_const1_64( 0x950C3434950C3434 ); \
+       CD = m256_const1_64( 0xAED9A06DAED9A06D ); \
+       CE = m256_const1_64( 0x2537DC8D2537DC8D ); \
+       CF = m256_const1_64( 0x7CDB59697CDB5969 ); \
+   } \
+   Wlow = (state)->Wlow; /* the counter halves are always read from (state), in both branches */ \
+   Whigh = (state)->Whigh; \
+} while (0)
+
+#define WRITE_STATE8(state)   do { /* stores the A, B, C locals and both counter halves back into (state); the M locals are not saved */ \
+      (state)->A[0] = A00; \
+      (state)->A[1] = A01; \
+      (state)->A[2] = A02; \
+      (state)->A[3] = A03; \
+      (state)->A[4] = A04; \
+      (state)->A[5] = A05; \
+      (state)->A[6] = A06; \
+      (state)->A[7] = A07; \
+      (state)->A[8] = A08; \
+      (state)->A[9] = A09; \
+      (state)->A[10] = A0A; \
+      (state)->A[11] = A0B; \
+      (state)->B[0] = B0; \
+      (state)->B[1] = B1; \
+      (state)->B[2] = B2; \
+      (state)->B[3] = B3; \
+      (state)->B[4] = B4; \
+      (state)->B[5] = B5; \
+      (state)->B[6] = B6; \
+      (state)->B[7] = B7; \
+      (state)->B[8] = B8; \
+      (state)->B[9] = B9; \
+      (state)->B[10] = BA; \
+      (state)->B[11] = BB; \
+      (state)->B[12] = BC; \
+      (state)->B[13] = BD; \
+      (state)->B[14] = BE; \
+      (state)->B[15] = BF; \
+      (state)->C[0] = C0; \
+      (state)->C[1] = C1; \
+      (state)->C[2] = C2; \
+      (state)->C[3] = C3; \
+      (state)->C[4] = C4; \
+      (state)->C[5] = C5; \
+      (state)->C[6] = C6; \
+      (state)->C[7] = C7; \
+      (state)->C[8] = C8; \
+      (state)->C[9] = C9; \
+      (state)->C[10] = CA; \
+      (state)->C[11] = CB; \
+      (state)->C[12] = CC; \
+      (state)->C[13] = CD; \
+      (state)->C[14] = CE; \
+      (state)->C[15] = CF; \
+      (state)->Wlow = Wlow; \
+      (state)->Whigh = Whigh; \
+   } while (0)
+
+#define DECODE_BLOCK8 /* copies the 16 vectors of the current message block into M0..MF; expects a variable named buf in scope */ \
+do { \
+   M0 = buf[ 0]; \
+   M1 = buf[ 1]; \
+   M2 = buf[ 2]; \
+   M3 = buf[ 3]; \
+   M4 = buf[ 4]; \
+   M5 = buf[ 5]; \
+   M6 = buf[ 6]; \
+   M7 = buf[ 7]; \
+   M8 = buf[ 8]; \
+   M9 = buf[ 9]; \
+   MA = buf[10]; \
+   MB = buf[11]; \
+   MC = buf[12]; \
+   MD = buf[13]; \
+   ME = buf[14]; \
+   MF = buf[15]; \
+} while (0)
+
+#define INPUT_BLOCK_ADD8 /* B[k] += M[k] for k = 0..F, as 32-bit element-wise additions */ \
+do { \
+    B0 = _mm256_add_epi32( B0, M0 );\
+    B1 = _mm256_add_epi32( B1, M1 );\
+    B2 = _mm256_add_epi32( B2, M2 );\
+    B3 = _mm256_add_epi32( B3, M3 );\
+    B4 = _mm256_add_epi32( B4, M4 );\
+    B5 = _mm256_add_epi32( B5, M5 );\
+    B6 = _mm256_add_epi32( B6, M6 );\
+    B7 = _mm256_add_epi32( B7, M7 );\
+    B8 = _mm256_add_epi32( B8, M8 );\
+    B9 = _mm256_add_epi32( B9, M9 );\
+    BA = _mm256_add_epi32( BA, MA );\
+    BB = _mm256_add_epi32( BB, MB );\
+    BC = _mm256_add_epi32( BC, MC );\
+    BD = _mm256_add_epi32( BD, MD );\
+    BE = _mm256_add_epi32( BE, ME );\
+    BF = _mm256_add_epi32( BF, MF );\
+} while (0)
+
+#define INPUT_BLOCK_SUB8 /* C[k] -= M[k] for k = 0..F, as 32-bit element-wise subtractions */ \
+do { \
+    C0 = _mm256_sub_epi32( C0, M0 ); \
+    C1 = _mm256_sub_epi32( C1, M1 ); \
+    C2 = _mm256_sub_epi32( C2, M2 ); \
+    C3 = _mm256_sub_epi32( C3, M3 ); \
+    C4 = _mm256_sub_epi32( C4, M4 ); \
+    C5 = _mm256_sub_epi32( C5, M5 ); \
+    C6 = _mm256_sub_epi32( C6, M6 ); \
+    C7 = _mm256_sub_epi32( C7, M7 ); \
+    C8 = _mm256_sub_epi32( C8, M8 ); \
+    C9 = _mm256_sub_epi32( C9, M9 ); \
+    CA = _mm256_sub_epi32( CA, MA ); \
+    CB = _mm256_sub_epi32( CB, MB ); \
+    CC = _mm256_sub_epi32( CC, MC ); \
+    CD = _mm256_sub_epi32( CD, MD ); \
+    CE = _mm256_sub_epi32( CE, ME ); \
+    CF = _mm256_sub_epi32( CF, MF ); \
+} while (0)
+
+#define XOR_W8 /* mixes the block counter into the state: A00 ^= Wlow and A01 ^= Whigh, each broadcast to every 32-bit element */ \
+do { \
+   A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \
+   A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \
+} while (0)
+
+#define SWAP_BC8 /* applies mm256_swap512_256 to every (Bk, Ck) pair; presumably exchanges the two vectors - confirm against the helper */ \
+do { \
+    mm256_swap512_256( B0, C0 ); \
+    mm256_swap512_256( B1, C1 ); \
+    mm256_swap512_256( B2, C2 ); \
+    mm256_swap512_256( B3, C3 ); \
+    mm256_swap512_256( B4, C4 ); \
+    mm256_swap512_256( B5, C5 ); \
+    mm256_swap512_256( B6, C6 ); \
+    mm256_swap512_256( B7, C7 ); \
+    mm256_swap512_256( B8, C8 ); \
+    mm256_swap512_256( B9, C9 ); \
+    mm256_swap512_256( BA, CA ); \
+    mm256_swap512_256( BB, CB ); \
+    mm256_swap512_256( BC, CC ); \
+    mm256_swap512_256( BD, CD ); \
+    mm256_swap512_256( BE, CE ); \
+    mm256_swap512_256( BF, CF ); \
+} while (0)
+
+#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) /* xa0 = xm ^ xb1 ^ (xb2 & ~xb3) ^ 3*(xa0 ^ xc ^ 5*mm256_rol_32(xa1,15)); then xb0 = ~(xa0 ^ mm256_rol_32(xb0,1)) */ \
+do { \
+   xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256(  \
+            _mm256_andnot_si256( xb3, xb2 ), \
+            _mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \
+               _mm256_mullo_epi32(  mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \
+                   ) ), _mm256_set1_epi32(3UL) ) ) ) ); \
+   xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); /* uses the xa0 value just written */ \
+} while (0)
+
+#define PERM_STEP_0_8   do { /* first of three passes: 16 PERM_ELT8 calls over B0..BF, with the A index starting at A00 and wrapping after A0B */ \
+      PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \
+      PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \
+      PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \
+      PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \
+      PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \
+      PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \
+      PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \
+      PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \
+      PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \
+      PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \
+      PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \
+      PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \
+      PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \
+      PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \
+      PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \
+      PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \
+   } while (0)
+
+#define PERM_STEP_1_8   do { /* second pass: same B, C, M arguments as PERM_STEP_0_8, with the A index starting at A04 */ \
+      PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \
+      PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \
+      PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \
+      PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \
+      PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \
+      PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \
+      PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \
+      PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \
+      PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \
+      PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \
+      PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \
+      PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \
+      PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \
+      PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \
+      PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \
+      PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \
+   } while (0)
+
+#define PERM_STEP_2_8   do { /* third pass: same B, C, M arguments as PERM_STEP_0_8, with the A index starting at A08 */ \
+      PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \
+      PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \
+      PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \
+      PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \
+      PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \
+      PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \
+      PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \
+      PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \
+      PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \
+      PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \
+      PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \
+      PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \
+      PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \
+      PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \
+      PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \
+      PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \
+   } while (0)
+
+#define APPLY_P8 /* full permutation: mm256_ror_32 every B word by 15, run the three PERM_STEP passes, then add 36 C words into A */ \
+do { \
+    B0 = mm256_ror_32( B0, 15 ); \
+    B1 = mm256_ror_32( B1, 15 ); \
+    B2 = mm256_ror_32( B2, 15 ); \
+    B3 = mm256_ror_32( B3, 15 ); \
+    B4 = mm256_ror_32( B4, 15 ); \
+    B5 = mm256_ror_32( B5, 15 ); \
+    B6 = mm256_ror_32( B6, 15 ); \
+    B7 = mm256_ror_32( B7, 15 ); \
+    B8 = mm256_ror_32( B8, 15 ); \
+    B9 = mm256_ror_32( B9, 15 ); \
+    BA = mm256_ror_32( BA, 15 ); \
+    BB = mm256_ror_32( BB, 15 ); \
+    BC = mm256_ror_32( BC, 15 ); \
+    BD = mm256_ror_32( BD, 15 ); \
+    BE = mm256_ror_32( BE, 15 ); \
+    BF = mm256_ror_32( BF, 15 ); \
+    PERM_STEP_0_8; \
+    PERM_STEP_1_8; \
+    PERM_STEP_2_8; \
+    A0B = _mm256_add_epi32( A0B, C6 ); /* from here: A index counts down from A0B (wrapping), C index counts down from C6 (wrapping) */ \
+    A0A = _mm256_add_epi32( A0A, C5 ); \
+    A09 = _mm256_add_epi32( A09, C4 ); \
+    A08 = _mm256_add_epi32( A08, C3 ); \
+    A07 = _mm256_add_epi32( A07, C2 ); \
+    A06 = _mm256_add_epi32( A06, C1 ); \
+    A05 = _mm256_add_epi32( A05, C0 ); \
+    A04 = _mm256_add_epi32( A04, CF ); \
+    A03 = _mm256_add_epi32( A03, CE ); \
+    A02 = _mm256_add_epi32( A02, CD ); \
+    A01 = _mm256_add_epi32( A01, CC ); \
+    A00 = _mm256_add_epi32( A00, CB ); \
+    A0B = _mm256_add_epi32( A0B, CA ); \
+    A0A = _mm256_add_epi32( A0A, C9 ); \
+    A09 = _mm256_add_epi32( A09, C8 ); \
+    A08 = _mm256_add_epi32( A08, C7 ); \
+    A07 = _mm256_add_epi32( A07, C6 ); \
+    A06 = _mm256_add_epi32( A06, C5 ); \
+    A05 = _mm256_add_epi32( A05, C4 ); \
+    A04 = _mm256_add_epi32( A04, C3 ); \
+    A03 = _mm256_add_epi32( A03, C2 ); \
+    A02 = _mm256_add_epi32( A02, C1 ); \
+    A01 = _mm256_add_epi32( A01, C0 ); \
+    A00 = _mm256_add_epi32( A00, CF ); \
+    A0B = _mm256_add_epi32( A0B, CE ); \
+    A0A = _mm256_add_epi32( A0A, CD ); \
+    A09 = _mm256_add_epi32( A09, CC ); \
+    A08 = _mm256_add_epi32( A08, CB ); \
+    A07 = _mm256_add_epi32( A07, CA ); \
+    A06 = _mm256_add_epi32( A06, C9 ); \
+    A05 = _mm256_add_epi32( A05, C8 ); \
+    A04 = _mm256_add_epi32( A04, C7 ); \
+    A03 = _mm256_add_epi32( A03, C6 ); \
+    A02 = _mm256_add_epi32( A02, C5 ); \
+    A01 = _mm256_add_epi32( A01, C4 ); \
+    A00 = _mm256_add_epi32( A00, C3 ); \
+} while (0)
+
+#define INCR_W8   do { /* increments the counter held in Wlow/Whigh: bump Wlow and carry into Whigh when it wraps to 0 (T32 presumably truncates to 32 bits - confirm) */ \
+      if ((Wlow = T32(Wlow + 1)) == 0) \
+         Whigh = T32(Whigh + 1); \
+   } while (0)
+
+static void
+shabal_8way_init( void *cc, unsigned size )
+{  // Reset an 8-way Shabal context; size picks the 512 path or the preloaded path.
+   shabal_8way_context *sc = (shabal_8way_context*)cc;
+
+   if ( size == 512 )
+   { // 512: copy immediate constants directly to working registers later.
+       sc->state_loaded = false;   // A/B/C are not written on this path
+   }
+   else
+   {  // Any other size (shabal256_8way_init passes 256). Original note: no users.
+       sc->state_loaded = true;    // A/B/C below are the initial state; each literal repeats one 32-bit word
+       sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 );
+       sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 );
+       sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC );
+       sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 );
+       sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 );
+       sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 );
+       sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA );
+       sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C );
+       sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 );
+       sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC );
+       sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B );
+       sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A );
+
+       sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE );
+       sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 );
+       sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F );
+       sc->B[ 3] = m256_const1_64( 0x9301515F9301515F );
+       sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA );
+       sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 );
+       sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 );
+       sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 );
+       sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 );
+       sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 );
+       sc->B[10] = m256_const1_64( 0xBE216306BE216306 );
+       sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 );
+       sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B );
+       sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 );
+       sc->B[14] = m256_const1_64( 0x30924DD430924DD4 );
+       sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 );
+
+       sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 );
+       sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA );
+       sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 );
+       sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 );
+       sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE );
+       sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 );
+       sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 );
+       sc->C[ 7] = m256_const1_64( 0xED614433ED614433 );
+       sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 );
+       sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA );
+       sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B );
+       sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F );
+       sc->C[12] = m256_const1_64( 0xBC968828BC968828 );
+       sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 );
+       sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 );
+       sc->C[15] = m256_const1_64( 0x9B491C609B491C60 );
+   }
+    sc->Wlow = 1;    // block counter starts at 1 (low word), 0 (high word)
+    sc->Whigh = 0;
+    sc->ptr = 0;     // nothing buffered yet
+}
+
+static void
+shabal_8way_core( void *cc, const unsigned char *data, size_t len )
+{  // Absorb len bytes per lane from data, which is read as a sequence of __m256i vectors.
+   shabal_8way_context *sc = (shabal_8way_context*)cc;
+    __m256i *buf;
+    __m256i *vdata = (__m256i*)data;
+   const int buf_size = 64;   // block size in bytes per lane; buf holds 16 vectors, indexed by ptr>>2
+   size_t ptr;
+   DECL_STATE8
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+
+   if ( len < (buf_size - ptr ) )   // input does not complete the block: buffer it and return
+   {
+      memcpy_256( buf + (ptr>>2), vdata, len>>2 );   // NOTE(review): >>2 drops remainders; len presumably a multiple of 4
+      ptr += len;
+      sc->ptr = ptr;
+      return;
+   }
+
+   READ_STATE8( sc );
+
+   while ( len > 0 )
+   {
+      size_t clen;
+      clen = buf_size - ptr;   // room left in the block, capped to the remaining input
+      if ( clen > len )
+         clen = len;
+      memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
+
+      ptr += clen;
+      vdata += clen>>2;   // advance by one vector per 4 input bytes
+      len -= clen;
+      if ( ptr == buf_size )   // block is full: run the block sequence and bump the counter
+      {
+         DECODE_BLOCK8;
+         INPUT_BLOCK_ADD8;
+         XOR_W8;
+         APPLY_P8;
+         INPUT_BLOCK_SUB8;
+         SWAP_BC8;
+         INCR_W8;
+         ptr = 0;
+      }
+   }
+   WRITE_STATE8(sc);
+   sc->ptr = ptr;   // bytes left over in buf for the next call
+}
+
+static void
+shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
+                   unsigned size_words )
+{  // Pad and process the final block, then store the digest vectors to dst.
+   shabal_8way_context *sc = (shabal_8way_context*)cc;
+    __m256i *buf;
+   const int buf_size = 64;   // block size in bytes per lane
+   size_t ptr;
+   int i;
+   unsigned z, zz;
+   DECL_STATE8
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+   z = 0x80 >> n;                 // marker bit placed right after the n extra bits
+   zz = ((ub & -z) | z) & 0xFF;   // keep ub's bits above the marker, set the marker, clear lower bits
+   buf[ptr>>2] = _mm256_set1_epi32( zz );   // same padding word in every lane
+   memset_zero_256( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );   // presumably zeroes the rest of the block
+   READ_STATE8(sc);   // local copy only: nothing below writes the state back to sc
+   DECODE_BLOCK8;
+   INPUT_BLOCK_ADD8;
+   XOR_W8;
+   APPLY_P8;
+
+   for ( i = 0; i < 3; i ++ )   // three more rounds; INCR_W8 is not called, so W stays the same
+   {
+      SWAP_BC8;
+      XOR_W8;
+      APPLY_P8;
+   }
+
+   __m256i *d = (__m256i*)dst;   // output is written one vector per B word
+   if ( size_words == 16 )   // 512: all sixteen B words
+   {
+      d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
+      d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
+      d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
+      d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
+   }
+   else    // 256: only the last eight B words (B8..BF)
+   {
+      d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
+      d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
+   }
+}
+
+void
+shabal256_8way_init( void *cc )
+{  // 256-bit variant: size 256 takes the preloaded-state branch of shabal_8way_init.
+   shabal_8way_init(cc, 256);
+}
+
+void
+shabal256_8way_update( void *cc, const void *data, size_t len )
+{  // Absorb input; forwards unchanged to the shared shabal_8way_core.
+   shabal_8way_core( cc, data, len );
+}
+
+void
+shabal256_8way_close( void *cc, void *dst )
+{  // Finalize with no extra bits (ub = 0, n = 0); 8 output words per lane.
+   shabal_8way_close(cc, 0, 0, dst, 8);
+}
+
+void
+shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                  void *dst )
+{  // Finalize with n extra bits taken from ub; 8 output words per lane.
+   shabal_8way_close(cc, ub, n, dst, 8);
+}
+
+void
+shabal512_8way_init(void *cc)
+{  // 512-bit variant: size 512 takes the deferred-constants branch of shabal_8way_init.
+   shabal_8way_init(cc, 512);
+}
+
+void
+shabal512_8way_update(void *cc, const void *data, size_t len)
+{  // Absorb input; forwards unchanged to the shared shabal_8way_core.
+   shabal_8way_core(cc, data, len);
+}
+
+void
+shabal512_8way_close(void *cc, void *dst)
+{  // Finalize with no extra bits (ub = 0, n = 0); 16 output words per lane.
+   shabal_8way_close(cc, 0, 0, dst, 16);
+}
+
+void
+shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{  // Finalize with n extra bits taken from ub; 16 output words per lane.
+   shabal_8way_close(cc, ub, n, dst, 16);
+}
+
+
+#endif  // AVX2
+
 /*
 * We copy the state into local variables, so that the compiler knows
 * that it can optimize them at will.
@@ -290,6 +883,8 @@ do { \
   A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
   A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
 } while (0)
+
+
 /*
 #define SWAP(v1, v2)   do { \
 		sph_u32 tmp = (v1); \
@@ -297,26 +892,39 @@ do { \
 		(v2) = tmp; \
 	} while (0)
 */
+
 #define SWAP_BC \
 do { \
-    mm128_swap128_256( B0, C0 ); \
-    mm128_swap128_256( B1, C1 ); \
-    mm128_swap128_256( B2, C2 ); \
-    mm128_swap128_256( B3, C3 ); \
-    mm128_swap128_256( B4, C4 ); \
-    mm128_swap128_256( B5, C5 ); \
-    mm128_swap128_256( B6, C6 ); \
-    mm128_swap128_256( B7, C7 ); \
-    mm128_swap128_256( B8, C8 ); \
-    mm128_swap128_256( B9, C9 ); \
-    mm128_swap128_256( BA, CA ); \
-    mm128_swap128_256( BB, CB ); \
-    mm128_swap128_256( BC, CC ); \
-    mm128_swap128_256( BD, CD ); \
-    mm128_swap128_256( BE, CE ); \
-    mm128_swap128_256( BF, CF ); \
+    mm128_swap256_128( B0, C0 ); \
+    mm128_swap256_128( B1, C1 ); \
+    mm128_swap256_128( B2, C2 ); \
+    mm128_swap256_128( B3, C3 ); \
+    mm128_swap256_128( B4, C4 ); \
+    mm128_swap256_128( B5, C5 ); \
+    mm128_swap256_128( B6, C6 ); \
+    mm128_swap256_128( B7, C7 ); \
+    mm128_swap256_128( B8, C8 ); \
+    mm128_swap256_128( B9, C9 ); \
+    mm128_swap256_128( BA, CA ); \
+    mm128_swap256_128( BB, CB ); \
+    mm128_swap256_128( BC, CC ); \
+    mm128_swap256_128( BD, CD ); \
+    mm128_swap256_128( BE, CE ); \
+    mm128_swap256_128( BF, CF ); \
 } while (0)

+/*
+#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
+do { \
+  __m128i t1 = _mm_mullo_epi32(  mm_rol_32( xa1, 15 ),\
+                                   _mm_set1_epi32(5UL) ) \
+  __m128i t2 = _mm_xor_si128( xa0, xc ); \
+  xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
+  xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
+              _mm_xor_si128( t2, \
+                      _mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
+*/
+
 #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
 do { \
   xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128(  \
@@ -706,7 +1314,7 @@ shabal256_4way_init( void *cc )
 }

 void
-shabal256_4way( void *cc, const void *data, size_t len )
+shabal256_4way_update( void *cc, const void *data, size_t len )
 {
 	shabal_4way_core( cc, data, len );
 }
@@ -731,7 +1339,7 @@ shabal512_4way_init(void *cc)
 }

 void
-shabal512_4way(void *cc, const void *data, size_t len)
+shabal512_4way_update(void *cc, const void *data, size_t len)
 {
 	shabal_4way_core(cc, data, len);
 }
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -36,7 +36,7 @@
 #ifndef SHABAL_HASH_4WAY_H__
 #define SHABAL_HASH_4WAY_H__ 1

-#ifdef __AVX2__
+#ifdef __SSE4_1__

 #include <stddef.h>
 #include "algo/sha/sph_types.h"
@@ -50,6 +50,34 @@ extern "C"{

 #define SPH_SIZE_shabal512   512

+#if defined(__AVX2__)
+
+typedef struct {   // 8-way Shabal context: every field vector holds one 32-bit word for each of 8 lanes
+   __m256i buf[16];               // pending input block, one vector per 32-bit word
+   __m256i A[12], B[16], C[16];   // hash state words
+   sph_u32 Whigh, Wlow;           // block counter, high and low words
+   size_t ptr;                    // bytes per lane currently held in buf
+   bool state_loaded;             // set by init: true when A/B/C were preloaded, false otherwise
+} shabal_8way_context __attribute__ ((aligned (64)));
+
+typedef shabal_8way_context shabal256_8way_context;
+typedef shabal_8way_context shabal512_8way_context;
+
+void shabal256_8way_init( void *cc );
+void shabal256_8way_update( void *cc, const void *data, size_t len );
+void shabal256_8way_close( void *cc, void *dst );
+void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                       void *dst );
+
+void shabal512_8way_init( void *cc );
+void shabal512_8way_update( void *cc, const void *data, size_t len );
+void shabal512_8way_close( void *cc, void *dst );
+void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                       void *dst );
+
+
+#endif
+
 typedef struct {
 	__m128i buf[16] __attribute__ ((aligned (64)));
 	__m128i A[12], B[16], C[16];
@@ -62,13 +90,14 @@ typedef shabal_4way_context shabal256_4way_context;
 typedef shabal_4way_context shabal512_4way_context;

 void shabal256_4way_init( void *cc );
-void shabal256_4way( void *cc, const void *data, size_t len );
+void shabal256_4way_update( void *cc, const void *data, size_t len );
 void shabal256_4way_close( void *cc, void *dst );
 void shabal256_4way_addbits_and_close(	void *cc, unsigned ub, unsigned n,
                                       void *dst );

 void shabal512_4way_init( void *cc );
-void shabal512_4way( void *cc, const void *data, size_t len );
+void shabal512_4way_update( void *cc, const void *data, size_t len );
+#define shabal512_4way shabal512_4way_update
 void shabal512_4way_close( void *cc, void *dst );
 void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst );
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -3,6 +3,12 @@

 #include <stdio.h>

+// This implementation is deprecated, superseded by VAES on Icelake,
+// which provides hardware-based 4-way AES.
+// It was created for AVX2 to eliminate interleaving between the
+// preceding and following functions.
+// This code can be removed when current users have reverted to one-way.
+
 #if defined(__AVX2__)


@@ -16,8 +22,8 @@ static const uint32_t IV512[] =


 #define mm256_ror2x256hi_1x32( a, b ) \
-   _mm256_blend_epi32( mm256_ror1x32_128( a ), \
-                       mm256_ror1x32_128( b ), 0x88 )
+   _mm256_blend_epi32( mm256_ror128_32( a ), \
+                       mm256_ror128_32( b ), 0x88 )

 static void
 c512_2way( shavite512_2way_context *ctx, const void *msg )
@@ -61,7 +67,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
   {
      // round 1, 5, 9

-     k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
+     k00 = _mm256_xor_si256( k13, mm256_ror128_32(
                                  mm256_aesenc_2x128( k00, zero ) ) );

     if ( r == 0 )
@@ -71,7 +77,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )

     x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
     k01 = _mm256_xor_si256( k00,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );

     if ( r == 1 )
        k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
@@ -80,25 +86,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )

     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
     k02 = _mm256_xor_si256( k01,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
     k03 = _mm256_xor_si256( k02,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

     p3 = _mm256_xor_si256( p3, x );

     k10 = _mm256_xor_si256( k03,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
     k11 = _mm256_xor_si256( k10,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
     k12 = _mm256_xor_si256( k11,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
     k13 = _mm256_xor_si256( k12,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k13, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) );

     if ( r == 2 )
        k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
@@ -134,31 +140,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )

     // round 3, 7, 11

-     k00 = _mm256_xor_si256( mm256_ror1x32_128(
+     k00 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k00, zero ) ), k13 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
-     k01 = _mm256_xor_si256( mm256_ror1x32_128(
+     k01 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k01, zero ) ), k00 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
-     k02 = _mm256_xor_si256( mm256_ror1x32_128(
+     k02 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k02, zero ) ), k01 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
-     k03 = _mm256_xor_si256( mm256_ror1x32_128(
+     k03 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k03, zero ) ), k02 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

     p1 = _mm256_xor_si256( p1, x );

-     k10 = _mm256_xor_si256( mm256_ror1x32_128(
+     k10 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k10, zero ) ), k03 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
-     k11 = _mm256_xor_si256( mm256_ror1x32_128(
+     k11 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k11, zero ) ), k10 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
-     k12 = _mm256_xor_si256( mm256_ror1x32_128(
+     k12 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k12, zero ) ), k11 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
-     k13 = _mm256_xor_si256( mm256_ror1x32_128(
+     k13 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k13, zero ) ), k12 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

@@ -192,35 +198,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )

   // round 13

-   k00 = _mm256_xor_si256( mm256_ror1x32_128(
+   k00 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k00, zero ) ), k13  );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
-   k01 = _mm256_xor_si256( mm256_ror1x32_128(
+   k01 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k01, zero ) ), k00 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
-   k02 = _mm256_xor_si256( mm256_ror1x32_128(
+   k02 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k02, zero ) ), k01 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
-   k03 = _mm256_xor_si256( mm256_ror1x32_128(
+   k03 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k03, zero ) ), k02 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

   p3 = _mm256_xor_si256( p3, x );

-   k10 = _mm256_xor_si256( mm256_ror1x32_128(
+   k10 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k10, zero ) ), k03 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
-   k11 = _mm256_xor_si256( mm256_ror1x32_128(
+   k11 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k11, zero ) ), k10 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );

-   k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) );
+   k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
   k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
 	       ~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
 	       ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );

   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
-   k13 = _mm256_xor_si256( mm256_ror1x32_128(
+   k13 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k13, zero ) ), k12 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
--- a/algo/simd/simd-hash-2way.h
+++ b/algo/simd/simd-hash-2way.h
@@ -7,15 +7,37 @@

 #include "simd-utils.h"

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 typedef struct {
-  uint32_t A[ 32*2 ] __attribute__((aligned(64)));
-  uint8_t buffer[ 128*2 ] __attribute__((aligned(64)));
+  uint32_t A[ 32*4 ];
+  uint8_t buffer[ 128*4 ];
+  uint64_t count;
+  unsigned int hashbitlen;
+  unsigned int blocksize;
+  unsigned int n_feistels;
+
+} simd_4way_context __attribute__((aligned(128)));
+
+int simd_4way_init( simd_4way_context *state, int hashbitlen );
+int simd_4way_update( simd_4way_context *state, const void *data,
+                      int databitlen );
+int simd_4way_close( simd_4way_context *state, void *hashval );
+int simd_4way_update_close( simd_4way_context *state, void *hashval,
+                            const void *data, int databitlen );
+
+#endif
+
+typedef struct {
+  uint32_t A[ 32*2 ];
+  uint8_t buffer[ 128*2 ];
  uint64_t count;
  unsigned int hashbitlen;
  unsigned int blocksize;
  unsigned int n_feistels;
  
-} simd_2way_context;
+} simd_2way_context __attribute__((aligned(128)));

 int simd_2way_init( simd_2way_context *state, int hashbitlen );
 int simd_2way_update( simd_2way_context *state, const void *data,
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -15,7 +15,7 @@

 void skeinhash_8way( void *state, const void *input )
 {
-     uint64_t vhash64[16*8] __attribute__ ((aligned (128)));
+     uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
     skein512_8way_context ctx_skein;

 //#if defined(__SHA__)
@@ -29,7 +29,7 @@ void skeinhash_8way( void *state, const void *input )
 //     uint32_t hash7[16] __attribute__ ((aligned (64)));
 //     SHA256_CTX           ctx_sha256;
 //#else
-     uint32_t vhash32[32*8] __attribute__ ((aligned (128)));
+     uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
     sha256_8way_context ctx_sha256;
 //#endif

@@ -135,7 +135,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,

 void skeinhash_4way( void *state, const void *input )
 {
-     uint64_t vhash64[16*4] __attribute__ ((aligned (64)));
+     uint64_t vhash64[8*4] __attribute__ ((aligned (128)));
     skein512_4way_context ctx_skein;
 #if defined(__SHA__)
     uint32_t hash0[16] __attribute__ ((aligned (64)));
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -1,11 +1,7 @@
 #include "cpuminer-config.h"
 #include "c11-gate.h"
-
-#if defined (C11_4WAY)
-
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -13,11 +9,236 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined (C11_8WAY)
+
+typedef struct {   // one context per C11 stage, for the 8-lane path
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;   // used one lane at a time
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;     // 4-way context: used for two passes per hash
+    cube_4way_context       cube;      // 4-way context: used for two passes per hash
+    sph_shavite512_context  shavite;   // used one lane at a time
+    simd_4way_context       simd;      // 4-way context: used for two passes per hash
+    hashState_echo          echo;      // used one lane at a time
+} c11_8way_ctx_holder;
+
+c11_8way_ctx_holder c11_8way_ctx;   // initialised template; c11_8way_hash copies it for each call
+
+void init_c11_8way_ctx()
+{  // Initialise every stage context of the global c11_8way_ctx template.
+     blake512_8way_init( &c11_8way_ctx.blake );
+     bmw512_8way_init( &c11_8way_ctx.bmw );
+     init_groestl( &c11_8way_ctx.groestl, 64 );
+     skein512_8way_init( &c11_8way_ctx.skein );
+     jh512_8way_init( &c11_8way_ctx.jh );
+     keccak512_8way_init( &c11_8way_ctx.keccak );
+     luffa_4way_init( &c11_8way_ctx.luffa, 512 );
+     cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 );
+     sph_shavite512_init( &c11_8way_ctx.shavite );
+     simd_4way_init( &c11_8way_ctx.simd, 512 );
+     init_echo( &c11_8way_ctx.echo, 512 );
+}
+
+void c11_8way_hash( void *state, const void *input )
+{  // C11 chain for 8 lanes: 80 input bytes per lane in, 32 bytes per lane written to state.
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));     
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     c11_8way_ctx_holder ctx;
+     memcpy( &ctx, &c11_8way_ctx, sizeof(c11_8way_ctx) );   // start from the initialised template
+
+     // 1 Blake 8way
+     blake512_8way_update( &ctx.blake, input, 80 );
+     blake512_8way_close( &ctx.blake, vhash );
+
+     // 2 Bmw
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serial: split vhash into the eight per-lane buffers hash0..hash7
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     // 3 Groestl, one lane at a time; context restored from the template between lanes
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // 8way: merge the eight per-lane buffers back into vhash
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     // 4 JH
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     // 5 Keccak
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     // 6 Skein
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
+     // 7 Luffa: two 4-way passes (vhash0, then vhash1), re-initialised in between
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+     // 8 Cubehash: two 4-way passes, re-initialised in between
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+
+     // 9 Shavite, one lane at a time; context restored from the template between lanes
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     // 10 Simd: lanes 0-3 then lanes 4-7, re-initialised before the second pass
+     intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
+     intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 512 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 512 );
+
+     // 11 Echo, one lane at a time; context restored from the template between lanes
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     memcpy( state,     hash0, 32 );   // only the first 32 bytes of each lane's hash are returned
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scan nonces 8 per iteration starting at pdata[19]; always returns 0.
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     int thr_id = mythr->id;   
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; with 8x64 interleave, vector 9 is 32-bit words 18-19 of each lane
+     const uint32_t Htarg = ptarget[7];
+
+     max_nonce -= 8;   // stop a batch early; NOTE(review): unsigned, wraps if max_nonce < 8
+
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );   // presumably byte-swaps and interleaves the 80-byte header - helper not shown
+
+     do
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+        _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                          n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );   // nonces n..n+7 sit in the upper half of each 64-bit element
+
+        c11_8way_hash( hash, vdata );
+        pdata[19] = n;
+
+        for ( int i = 0; i < 8; i++ )   // lane i's result is the 8 words at hash + 8*i
+        if ( ( ( hash+(i<<3) )[7] < Htarg )
+             && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n+i;   // nonce that produced lane i
+           submit_lane_solution( work, hash+(i<<3), mythr, i );
+        }
+        n += 8;
+     } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;   // nonces tried in this call
+     return 0;
+}
+     
+#elif defined (C11_4WAY)
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
--- a/algo/x11/c11-gate.c
+++ b/algo/x11/c11-gate.c
@@ -2,7 +2,11 @@

 bool register_c11_algo( algo_gate_t* gate )
 {
-#if defined (C11_4WAY)
+#if defined (C11_8WAY)
+  init_c11_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_c11_8way;
+  gate->hash      = (void*)&c11_8way_hash;
+#elif defined (C11_4WAY)
  init_c11_4way_ctx();
  gate->scanhash  = (void*)&scanhash_c11_4way;
  gate->hash      = (void*)&c11_4way_hash;
@@ -11,7 +15,7 @@ bool register_c11_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_c11;
  gate->hash      = (void*)&c11_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x11/c11-gate.h
+++ b/algo/x11/c11-gate.h
@@ -4,29 +4,36 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define C11_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define C11_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define C11_4WAY 1
 #endif

+
 bool register_c11_algo( algo_gate_t* gate );
+#if defined(C11_8WAY)

-#if defined(C11_4WAY)
+void c11_8way_hash( void *state, const void *input );
+int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_c11_8way_ctx();
+
+#elif defined(C11_4WAY)

 void c11_4way_hash( void *state, const void *input );
-
 int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_c11_4way_ctx();

-#endif
+#else

 void c11_hash( void *state, const void *input );
-
 int scanhash_c11( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_c11_ctx();

 #endif

+#endif
+
--- a/algo/x11/tribus-4way.c
+++ b/algo/x11/tribus-4way.c
@@ -3,22 +3,121 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
-#if defined(TRIBUS_4WAY)
-
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/echo/aes_ni/hash_api.h"

-//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64)));
-static __thread jh512_4way_context ctx_mid;
-/*
-void init_tribus_4way_ctx()
+#if defined(TRIBUS_8WAY)
+
+static __thread jh512_8way_context ctx_mid;   // per-thread JH-512 state after the first 64 header bytes; filled in scanhash_tribus_8way()
+
+void tribus_hash_8way( void *state, const void *input )   // 8-lane tribus hash: JH-512 -> Keccak-512 -> Echo-512
 {
-     init_echo( &tribus_4way_ctx, 512 );
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));   // 8 lanes x 64 bytes, kept 8x64 interleaved
+     uint64_t hash0[8] __attribute__ ((aligned (64)));      // hash0..hash7: one de-interleaved 64-byte hash per lane
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     jh512_8way_context     ctx_jh;
+     keccak512_8way_context ctx_keccak;
+     hashState_echo         ctx_echo;
+
+     memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );        // resume from the saved JH midstate rather than re-hashing the first 64 bytes
+     jh512_8way_update( &ctx_jh, input + (64<<3), 16 );   // start 64*8 bytes into the interleaved input, feed 16 more bytes
+     jh512_8way_close( &ctx_jh, vhash );
+
+     keccak512_8way_init( &ctx_keccak );
+     keccak512_8way_update( &ctx_keccak, vhash, 64 );
+     keccak512_8way_close( &ctx_keccak, vhash );          // output overwrites vhash in place
+
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash, 512 );                          // split the interleaved buffer into 8 per-lane hashes
+
+     // hash echo serially: one lane at a time, in place, with a fresh init per lane
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash0,
+                        (const BitSequence *) hash0, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash1,
+                        (const BitSequence *) hash1, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash2,
+                        (const BitSequence *) hash2, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash3,
+                        (const BitSequence *) hash3, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash4,
+                        (const BitSequence *) hash4, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash5,
+                        (const BitSequence *) hash5, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash6,
+                        (const BitSequence *) hash6, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash7,
+                        (const BitSequence *) hash7, 512 );
+
+     memcpy( state,       hash0, 32 );   // output: first 32 bytes of each lane hash, lane i at state + 32*i
+     memcpy( state+32,    hash1, 32 );
+     memcpy( state+64,    hash2, 32 );
+     memcpy( state+96,    hash3, 32 );
+     memcpy( state+128,   hash4, 32 );
+     memcpy( state+160,   hash5, 32 );
+     memcpy( state+192,   hash6, 32 );
+     memcpy( state+224,   hash7, 32 );
 }
-*/
-void tribus_hash_4way(void *state, const void *input)
+
+int scanhash_tribus_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scans nonces 8 per pass from work->data[19]; submits each lane that passes fulltest; always returns 0.
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));    // 8 lanes x 8 words, lane i at hash + 8*i
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));   // 80-byte header, 8 lanes interleaved
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];                     // top target word, used as a cheap pre-filter
+   uint32_t n = pdata[19];
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; 64-bit word 9 of every lane, i.e. header words 18 and 19
+   int thr_id = mythr->id;  
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );           // fills vdata from pdata (bswap32 + 8x64 interleave, per helper name)
+
+   jh512_8way_init( &ctx_mid );                           // midstate: the first 64 bytes do not depend on the nonce,
+   jh512_8way_update( &ctx_mid, vdata, 64 );              // so hash them once; tribus_hash_8way() copies ctx_mid per call
+
+   do {
+     *noncev = mm512_intrlv_blend_32( mm512_bswap_32(     // nonces n..n+7 go in the odd 32-bit slots (one per lane);
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,   // presumably the blend keeps the even slots of *noncev
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+
+     tribus_hash_8way( hash, vdata );
+     pdata[19] = n;
+
+     for ( int i = 0; i < 8; i++ )
+     if ( (hash+(i<<3))[7] <= Htarg )    // inclusive: a top word equal to the target's must still reach fulltest
+     if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+     {
+          pdata[19] = n+i;               // record the winning lane's nonce before submitting
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
+     }
+     n += 8;
+   } while ( ( n < max_nonce-8 )  && !work_restart[thr_id].restart);   // stop a full pass short of max_nonce, or on restart
+
+   *hashes_done = n - first_nonce;       // number of nonces tried in this call
+   return 0;
+}
+
+#elif defined(TRIBUS_4WAY)
+
+static __thread jh512_4way_context ctx_mid;
+
+void tribus_hash_4way( void *state, const void *input )
 {
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
@@ -30,11 +129,11 @@ void tribus_hash_4way(void *state, const void *input)
     hashState_echo         ctx_echo;

     memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );
-     jh512_4way( &ctx_jh, input + (64<<2), 16 );
+     jh512_4way_update( &ctx_jh, input + (64<<2), 16 );
     jh512_4way_close( &ctx_jh, vhash );

     keccak512_4way_init( &ctx_keccak );
-     keccak512_4way( &ctx_keccak, vhash, 64 );
+     keccak512_4way_update( &ctx_keccak, vhash, 64 );
     keccak512_4way_close( &ctx_keccak, vhash );

     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -60,7 +159,7 @@ void tribus_hash_4way(void *state, const void *input)
 }

 int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
-            uint64_t *hashes_done, struct thr_info *mythr)
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -70,57 +169,32 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   uint32_t n = pdata[19];
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-   uint64_t htmax[] = {          0,
-                               0xF,
-                              0xFF,
-                             0xFFF,
-                            0xFFFF,
-                        0x10000000 };
-
-   uint32_t masks[] = {	0xFFFFFFFF,
-                        0xFFFFFFF0,
-                        0xFFFFFF00,
-                        0xFFFFF000,
-                        0xFFFF0000,
-                                 0 };
+   int thr_id = mythr->id;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );

-   // precalc midstate
-   // doing it one way then then interleaving would be faster but too
-   // complicated tto interleave context.
   jh512_4way_init( &ctx_mid );
-   jh512_4way( &ctx_mid, vdata, 64 );
+   jh512_4way_update( &ctx_mid, vdata, 64 );

-   for ( int m = 0; m < 6; m++ )
-   {
-      if ( Htarg <= htmax[m] )
-      {
-         uint32_t mask = masks[m];
-         do {
-           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+   do {
+     *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

-            tribus_hash_4way( hash, vdata );
+     tribus_hash_4way( hash, vdata );

-            pdata[19] = n;
+     pdata[19] = n;

-            for ( int i = 0; i < 4; i++ )
-            if ( ( !( (hash+(i<<3))[7] & mask ) )
-                 && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-            {
-               pdata[19] = n+i;
-               submit_lane_solution( work, hash+(i<<3), mythr, i );
-            }
-            n += 4;
-         } while ( ( n < max_nonce )  && !work_restart[thr_id].restart);
-         break;
-      }
-   }
+     for ( int i = 0; i < 4; i++ )
+     if ( (hash+(i<<3))[7] < Htarg )
+     if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+     {
+          pdata[19] = n+i;
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
+     }
+     n += 4;
+   } while ( ( n < max_nonce-4 )  && !work_restart[thr_id].restart);

-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/algo/x11/tribus-gate.c
+++ b/algo/x11/tribus-gate.c
@@ -2,9 +2,11 @@

 bool register_tribus_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-#if defined (TRIBUS_4WAY)
-//  init_tribus_4way_ctx();
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+#if defined (TRIBUS_8WAY)
+  gate->scanhash      = (void*)&scanhash_tribus_8way;
+  gate->hash          = (void*)&tribus_hash_8way;
+#elif defined (TRIBUS_4WAY)
  gate->scanhash      = (void*)&scanhash_tribus_4way;
  gate->hash          = (void*)&tribus_hash_4way;
 #else
--- a/algo/x11/tribus-gate.h
+++ b/algo/x11/tribus-gate.h
@@ -1,16 +1,23 @@
 #ifndef TRIBUS_GATE_H__
-#define TRIBUS_GATE_H__
+#define TRIBUS_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define TRIBUS_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define TRIBUS_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define TRIBUS_4WAY 1
 #endif

-#if defined(TRIBUS_4WAY)
+#if defined(TRIBUS_8WAY)

-//void init_tribus_4way_ctx();
+void tribus_hash_8way( void *state, const void *input );
+
+int scanhash_tribus_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(TRIBUS_4WAY)

 void tribus_hash_4way( void *state, const void *input );

--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -1,8 +1,5 @@
 #include "cpuminer-config.h"
 #include "x11-gate.h"
-
-#if defined (X11_4WAY)
-
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/blake-hash-4way.h"
@@ -12,11 +9,235 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined (X11_8WAY)
+
+typedef struct {   // one context per X11 stage, declared in the order x11_8way_hash() runs them
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;   // single-lane state, reused for each of the 8 lanes
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;     // 4-way: used twice per hash, once per half of the lanes
+    cube_4way_context       cube;      // 4-way: used twice per hash
+    sph_shavite512_context  shavite;   // single-lane, reused per lane
+    simd_4way_context       simd;      // 4-way: used twice per hash
+    hashState_echo          echo;      // single-lane, reused per lane
+} x11_8way_ctx_holder;
+
+x11_8way_ctx_holder x11_8way_ctx;   // initialised template; x11_8way_hash() starts each call from a copy of it
+
+void init_x11_8way_ctx()   // fills the global x11_8way_ctx template; called from register_x11_algo() when X11_8WAY is defined
+{
+     blake512_8way_init( &x11_8way_ctx.blake );
+     bmw512_8way_init( &x11_8way_ctx.bmw );
+     init_groestl( &x11_8way_ctx.groestl, 64 );
+     skein512_8way_init( &x11_8way_ctx.skein );
+     jh512_8way_init( &x11_8way_ctx.jh );
+     keccak512_8way_init( &x11_8way_ctx.keccak );
+     luffa_4way_init( &x11_8way_ctx.luffa, 512 );
+     cube_4way_init( &x11_8way_ctx.cube, 512, 16, 32 );   // same arguments as the mid-hash re-init in x11_8way_hash()
+     sph_shavite512_init( &x11_8way_ctx.shavite );
+     simd_4way_init( &x11_8way_ctx.simd, 512 );
+     init_echo( &x11_8way_ctx.echo, 512 );
+}
+
+void x11_8way_hash( void *state, const void *input )   // X11 chain over 8 interleaved lanes; writes 8 x 32 bytes to state
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));   // 8 lanes x 64 bytes, interleaved
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));   // 4x128 interleaved, first output of rintrlv_8x64_4x128
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));   // 4x128 interleaved, second output of rintrlv_8x64_4x128
+     uint64_t hash0[8] __attribute__ ((aligned (64)));      // hash0..hash7: per-lane 64-byte hashes for the serial stages
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+
+     x11_8way_ctx_holder ctx;
+     memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) );   // start from the initialised template contexts
+     blake512_8way_update( &ctx.blake, input, 80 );         // 80 bytes of input per lane
+     blake512_8way_close( &ctx.blake, vhash );
+
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serial: groestl runs one lane at a time, in place; its context is restored from the template between lanes
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // 8way: re-interleave all lanes for skein, jh and keccak
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );      // regroup into two 4x128 buffers for the 4-way stages
+
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     luffa_4way_init( &ctx.luffa, 512 );                    // context was closed above: re-init before the second half
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );              // re-init before the second half
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );   // lanes 0-3 come from vhash0
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );   // lanes 4-7 come from vhash1
+
+     sph_shavite512( &ctx.shavite, hash0, 64 );             // shavite: serial per lane, context restored from the template
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );     // simd: 4-way, lanes 0-3 first
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );     // then lanes 4-7 with a re-initialised context
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,        // echo: serial per lane, in place
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     memcpy( state,     hash0, 32 );   // output: first 32 bytes of each lane hash, lane i at state + 32*i
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scans nonces 8 per pass from work->data[19]; submits each lane that passes fulltest; always returns 0.
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));    // 8 lanes x 8 words, lane i at hash + 8*i
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));   // interleaved header for 8 lanes
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     int thr_id = mythr->id;
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; 64-bit word 9 of every lane, i.e. header words 18 and 19
+     const uint32_t Htarg = ptarget[7];        // top target word, used as a cheap pre-filter
+
+     const uint32_t last_nonce = max_nonce -8; // stop a full pass short of max_nonce
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );   // fills vdata from pdata (bswap32 + 8x64 interleave, per helper name)
+
+     do
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(   // nonces n..n+7 go in the odd 32-bit slots (one per lane);
+         _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,  // presumably the blend keeps the even slots of *noncev
+                           n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+         x11_8way_hash( hash, vdata );
+         pdata[19] = n;
+
+         for ( int i = 0; i < 8; i++ )
+         if ( ( hash+(i<<3) )[7] <= Htarg   // inclusive: a top word equal to the target's must still reach fulltest
+              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+         {
+             pdata[19] = n+i;               // record the winning lane's nonce before submitting
+             submit_lane_solution( work, hash+(i<<3), mythr, i );
+         }
+         n += 8;
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;        // number of nonces tried in this call
+     return 0;
+}
+
+
+#elif defined (X11_4WAY)
+
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
--- a/algo/x11/x11-gate.c
+++ b/algo/x11/x11-gate.c
@@ -1,8 +1,12 @@
 #include "x11-gate.h"

-bool register_x11_algo( algo_gate_t* gate )
+bool register_x11_algo( algo_gate_t *gate )
 {
-#if defined (X11_4WAY)
+#if defined (X11_8WAY)
+  init_x11_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_x11_8way;
+  gate->hash      = (void*)&x11_8way_hash;
+#elif defined (X11_4WAY)
  init_x11_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x11_4way;
  gate->hash      = (void*)&x11_4way_hash;
@@ -11,7 +15,7 @@ bool register_x11_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11;
  gate->hash      = (void*)&x11_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x11/x11-gate.h
+++ b/algo/x11/x11-gate.h
@@ -4,29 +4,35 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X11_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X11_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X11_4WAY 1
 #endif

 bool register_x11_algo( algo_gate_t* gate );
+#if defined(X11_8WAY)

-#if defined(X11_4WAY)
+void x11_8way_hash( void *state, const void *input );
+int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_x11_8way_ctx();
+
+#elif defined(X11_4WAY)

 void x11_4way_hash( void *state, const void *input );
-
 int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11_4way_ctx();

-#endif
+#else

 void x11_hash( void *state, const void *input );
-
 int scanhash_x11( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11_ctx();

 #endif

+#endif
+
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -1,11 +1,7 @@
 #include "cpuminer-config.h"
 #include "x11gost-gate.h"
-
-#if defined (X11GOST_4WAY)
-
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -14,18 +10,269 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined (X11GOST_8WAY)
+
+typedef struct {   // one context per stage, declared in the order x11gost_8way_hash() runs them
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;   // single-lane state, reused for each of the 8 lanes
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;    
+    keccak512_8way_context  keccak;    
+    sph_gost512_context     gost;      // single-lane, reused per lane
+    luffa_4way_context      luffa;     // 4-way: used twice per hash, once per group of 4 lanes
+    cube_4way_context       cube;      // 4-way: used twice per hash
+    sph_shavite512_context  shavite;   // single-lane, reused per lane
+    simd_4way_context       simd;      // 4-way: used twice per hash
+    hashState_echo          echo;      // single-lane, reused per lane
+} x11gost_8way_ctx_holder;
+
+x11gost_8way_ctx_holder x11gost_8way_ctx;   // initialised template; x11gost_8way_hash() starts each call from a copy of it
+
+void init_x11gost_8way_ctx()   // fills the global x11gost_8way_ctx template; called from register_x11gost_algo() when X11GOST_8WAY is defined
+{
+     blake512_8way_init( &x11gost_8way_ctx.blake );
+     bmw512_8way_init( &x11gost_8way_ctx.bmw );
+     init_groestl( &x11gost_8way_ctx.groestl, 64 );
+     skein512_8way_init( &x11gost_8way_ctx.skein );
+     jh512_8way_init( &x11gost_8way_ctx.jh );
+     keccak512_8way_init( &x11gost_8way_ctx.keccak );
+     sph_gost512_init( &x11gost_8way_ctx.gost );
+     luffa_4way_init( &x11gost_8way_ctx.luffa, 512 );
+     cube_4way_init( &x11gost_8way_ctx.cube, 512, 16, 32 );   // same arguments as the mid-hash re-init in x11gost_8way_hash()
+     sph_shavite512_init( &x11gost_8way_ctx.shavite );
+     simd_4way_init( &x11gost_8way_ctx.simd, 512 );
+     init_echo( &x11gost_8way_ctx.echo, 512 );
+}
+
+void x11gost_8way_hash( void *state, const void *input )   // X11 chain with a gost stage after keccak, 8 interleaved lanes; writes 8 x 32 bytes to state
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));   // interleaved buffer, reused for the 8-way and the 4-way stages
+     uint64_t hash0[8] __attribute__ ((aligned (64)));      // hash0..hash7: per-lane 64-byte hashes for the serial stages
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+
+     x11gost_8way_ctx_holder ctx;
+     memcpy( &ctx, &x11gost_8way_ctx, sizeof(x11gost_8way_ctx) );   // start from the initialised template contexts
+
+     blake512_8way_update( &ctx.blake, input, 80 );         // 80 bytes of input per lane
+     blake512_8way_close( &ctx.blake, vhash );
+
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serial: groestl runs one lane at a time, in place; its context is restored from the template between lanes
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl, 
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl, 
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // 8way: re-interleave all lanes for skein, jh and keccak
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     // Serial: gost runs one lane at a time, context restored from the template between lanes
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     sph_gost512( &ctx.gost, hash0, 64 );
+     sph_gost512_close( &ctx.gost, hash0 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash1, 64 );
+     sph_gost512_close( &ctx.gost, hash1 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash2, 64 );
+     sph_gost512_close( &ctx.gost, hash2 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash3, 64 );
+     sph_gost512_close( &ctx.gost, hash3 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash4, 64 );
+     sph_gost512_close( &ctx.gost, hash4 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash5, 64 );
+     sph_gost512_close( &ctx.gost, hash5 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash6, 64 );
+     sph_gost512_close( &ctx.gost, hash6 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash7, 64 );
+     sph_gost512_close( &ctx.gost, hash7 );
+
+
+     // Luffa + Cube: 4-way, lanes 0-3 first, then lanes 4-7 with re-initialised contexts
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     luffa_4way_init( &ctx.luffa, 512 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     sph_shavite512( &ctx.shavite, hash0, 64 );             // shavite: serial per lane, context restored from the template
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );     // simd: 4-way, lanes 0-3 first
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );     // then lanes 4-7 with a re-initialised context
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,        // echo: serial per lane, in place
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     memcpy( state,     hash0, 32 );   // output: first 32 bytes of each lane hash, lane i at state + 32*i
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scans nonces 8 per pass from work->data[19]; submits each lane that passes fulltest; always returns 0.
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));    // 8 lanes x 8 words, lane i at hash + 8*i
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));   // interleaved header for 8 lanes
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     int thr_id = mythr->id; 
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; 64-bit word 9 of every lane, i.e. header words 18 and 19
+     const uint32_t Htarg = ptarget[7];        // top target word, used as a cheap pre-filter
+
+     max_nonce -= 8;                           // stop a full pass short of the caller's limit
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );   // fills vdata from pdata (bswap32 + 8x64 interleave, per helper name)
+
+     do
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(   // nonces n..n+7 go in the odd 32-bit slots (one per lane);
+         _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,  // presumably the blend keeps the even slots of *noncev
+                           n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+         x11gost_8way_hash( hash, vdata );
+         pdata[19] = n;
+
+         for ( int i = 0; i < 8; i++ )
+         if ( ( hash+(i<<3) )[7] <= Htarg   // inclusive: a top word equal to the target's must still reach fulltest
+              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+         {
+             pdata[19] = n+i;               // record the winning lane's nonce before submitting
+             submit_lane_solution( work, hash+(i<<3), mythr, i );
+         }
+         n += 8;
+     } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;        // number of nonces tried in this call
+     return 0;
+}
+
+#elif defined (X11GOST_4WAY)
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
-    jh512_4way_context      jh;    
-    keccak512_4way_context  keccak;    
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
    sph_gost512_context     gost;
    luffa_2way_context      luffa;
    cubehashParam           cube;
@@ -76,10 +323,10 @@ void x11gost_4way_hash( void *state, const void *input )
     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, 
+     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, 
+     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

@@ -175,7 +422,7 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
+     int thr_id = mythr->id;
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
@@ -185,7 +432,7 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,

     mm256_bswap32_intrlv80_4x64( vdata, pdata );

-     for (int m=0; m < 6; m++) 
+     for (int m=0; m < 6; m++)
       if (Htarg <= htmax[m])
       {
         uint32_t mask = masks[m];
--- a/algo/x11/x11gost-gate.c
+++ b/algo/x11/x11gost-gate.c
@@ -2,7 +2,11 @@

 bool register_x11gost_algo( algo_gate_t* gate )
 {
-#if defined (X11GOST_4WAY)
+#if defined (X11GOST_8WAY)
+  init_x11gost_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_x11gost_8way;
+  gate->hash      = (void*)&x11gost_8way_hash;
+#elif defined (X11GOST_4WAY)
  init_x11gost_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x11gost_4way;
  gate->hash      = (void*)&x11gost_4way_hash;
@@ -11,7 +15,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11gost;
  gate->hash      = (void*)&x11gost_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x11/x11gost-gate.h
+++ b/algo/x11/x11gost-gate.h
@@ -4,29 +4,36 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X11GOST_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X11GOST_8WAY 1   // all four AVX-512 feature macros set: select the 8-way code
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X11GOST_4WAY 1   // otherwise AVX2 together with AES: select the 4-way code
 #endif

 bool register_x11gost_algo( algo_gate_t* gate );

-#if defined(X11GOST_4WAY)
+#if defined(X11GOST_8WAY)
+
+void x11gost_8way_hash( void *state, const void *input );   // copies 32 bytes per lane into state
+int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_x11gost_8way_ctx();   // register_x11gost_algo calls this before installing the hooks
+
+#elif defined(X11GOST_4WAY)

 void x11gost_4way_hash( void *state, const void *input );
-
 int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11gost_4way_ctx();

-#endif
+#else

 void x11gost_hash( void *state, const void *input );
-
 int scanhash_x11gost( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11gost_ctx();

 #endif

+#endif
+
--- a/algo/x12/x12-4way.c
+++ b/algo/x12/x12-4way.c
@@ -1,7 +1,4 @@
 #include "x12-gate.h"
-
-#if defined(X12_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -14,11 +11,223 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
-//#include "algo/fugue/sph_fugue.h"
+
+#if defined(X12_8WAY)
+
+
+typedef struct {   // one context per X12 stage; x12_8way_hash works on a copy
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;   // reused lane by lane
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;     // used twice per call, once per 4-lane half
+    cube_4way_context       cube;      // used twice per call, like luffa
+    sph_shavite512_context  shavite;   // reused lane by lane
+    simd_4way_context       simd;      // used twice per call
+    hashState_echo          echo;      // reused lane by lane
+    hamsi512_8way_context   hamsi;
+} x12_8way_ctx_holder;
+
+x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64)));
+
+void init_x12_8way_ctx()
+{   // Initialise every stage context of the global template x12_8way_ctx.
+     blake512_8way_init( &x12_8way_ctx.blake );
+     bmw512_8way_init( &x12_8way_ctx.bmw );
+     init_groestl( &x12_8way_ctx.groestl, 64 );
+     skein512_8way_init( &x12_8way_ctx.skein );
+     jh512_8way_init( &x12_8way_ctx.jh );
+     keccak512_8way_init( &x12_8way_ctx.keccak );
+     luffa_4way_init( &x12_8way_ctx.luffa, 512 );
+     cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 );   // same arguments x12_8way_hash uses to re-init
+     sph_shavite512_init( &x12_8way_ctx.shavite );
+     simd_4way_init( &x12_8way_ctx.simd, 512 );
+     init_echo( &x12_8way_ctx.echo, 512 );
+     hamsi512_8way_init( &x12_8way_ctx.hamsi );
+};
+
+void x12_8way_hash( void *state, const void *input )
+{
+     // Runs the X12 chain on eight lanes: blake, bmw, luffa, cubehash, shavite,
+     // simd, echo, groestl, skein, jh, keccak, hamsi. input feeds blake with
+     // length 80; hamsi writes the final, still interleaved, result to state.
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+
+     x12_8way_ctx_holder ctx;
+     memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) );
+     blake512_8way_update( &ctx.blake, input, 80 );
+     blake512_8way_close( &ctx.blake, vhash );
+
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+     // Split the 8-lane vector into two 4-lane halves for the 4-way stages.
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
+
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+     // Shavite runs one lane at a time on de-interleaved buffers.
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     // Echo, one lane at a time. hash0 goes first: skipping it would leave
+     // lane 0 one stage short, so its result could never be a valid X12 hash.
+     update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1, (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2, (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4, (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5, (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6, (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 );
+
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
+     hamsi512_8way_close( &ctx.hamsi, state );
+}
+
+int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{  // Tests 8 nonces per pass starting at work->data[19]; always returns 0.
+     uint32_t hash[16*8] __attribute__ ((aligned (128)));
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+     uint32_t *hash7 = &(hash[49]);   // lane L is read at hash7[2*L]; presumably word 7 of that lane
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     const uint32_t Htarg = ptarget[7];
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+     int thr_id = mythr->id;
+
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     do {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                 n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+        x12_8way_hash( hash, vdata );
+        // Top-word prefilter uses <= : a strict < drops hashes whose top word
+        // equals the target's and can never pass at all when Htarg is 0.
+        for ( int lane = 0; lane < 8; lane++ )
+        if ( hash7[ lane<<1 ] <= Htarg )
+        {
+           extr_lane_8x64( lane_hash, hash, lane, 256 );   // pull this lane out before the full compare
+           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+           {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+           }
+        }
+        n += 8;
+     } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;   // number of nonces covered by this call
+     return 0;
+}
+
+#elif defined(X12_4WAY)

 typedef struct {
    blake512_4way_context   blake;
@@ -63,45 +272,13 @@ void x12_4way_hash( void *state, const void *input )
     x12_4way_ctx_holder ctx;
     memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );

-     // 1 Blake
     blake512_4way( &ctx.blake, input, 80 );
     blake512_4way_close( &ctx.blake, vhash );

-     // 2 Bmw
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-
-     // Serial
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-     // 3 Groestl
-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-
-     // Parallel 4way 64 bit
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
-
-     // 4 Skein
-     skein512_4way( &ctx.skein, vhash, 64 );
-     skein512_4way_close( &ctx.skein, vhash );
-
-     // 5 JH
-     jh512_4way( &ctx.jh, vhash, 64 );
-     jh512_4way_close( &ctx.jh, vhash );
-
-     // 6 Keccak
-     keccak512_4way( &ctx.keccak, vhash, 64 );
-     keccak512_4way_close( &ctx.keccak, vhash );
-
-     // Serial
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-
-     // 7 Luffa
     intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -110,7 +287,6 @@ void x12_4way_hash( void *state, const void *input )
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     dintrlv_2x128( hash2, hash3, vhash, 512 );

-     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
@@ -119,7 +295,6 @@ void x12_4way_hash( void *state, const void *input )
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );

-     // 9 Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
@@ -135,7 +310,6 @@ void x12_4way_hash( void *state, const void *input )
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );

-     // 10 Simd
     intrlv_2x128( vhash, hash0, hash1, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -144,21 +318,38 @@ void x12_4way_hash( void *state, const void *input )
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_2x128( hash2, hash3, vhash, 512 );

-     // 11 Echo
+     // Echo sits between simd and groestl; X12 needs all twelve stages.
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
     memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
+
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

-     // 12 Hamsi parallel 4way 32 bit
+     // Parallel 4way 64 bit
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_close( &ctx.skein, vhash );
+
+     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_close( &ctx.jh, vhash );
+
+     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_close( &ctx.keccak, vhash );
+
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );

--- a/algo/x12/x12-gate.c
+++ b/algo/x12/x12-gate.c
@@ -2,7 +2,11 @@

 bool register_x12_algo( algo_gate_t* gate )
 {
-#if defined (X12_4WAY)
+#if defined (X12_8WAY)
+  init_x12_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_x12_8way;
+  gate->hash      = (void*)&x12_8way_hash;
+#elif defined (X12_4WAY)
  init_x12_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x12_4way;
  gate->hash      = (void*)&x12_4way_hash;
@@ -11,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x12;
  gate->hash      = (void*)&x12hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x12/x12-gate.h
+++ b/algo/x12/x12-gate.h
@@ -4,29 +4,36 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X12_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X12_8WAY 1   // all four AVX-512 feature macros set: select the 8-way code
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X12_4WAY 1   // otherwise AVX2 together with AES: select the 4-way code
 #endif

 bool register_x12_algo( algo_gate_t* gate );

-#if defined(X12_4WAY)
+#if defined(X12_8WAY)
+
+void x12_8way_hash( void *state, const void *input );   // state receives the hamsi output for all 8 lanes
+int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+void init_x12_8way_ctx();   // register_x12_algo calls this before installing the hooks
+
+#elif defined(X12_4WAY)

 void x12_4way_hash( void *state, const void *input );
-
 int scanhash_x12_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x12_4way_ctx();

-#endif
+#else

 void x12hash( void *state, const void *input );
-
 int scanhash_x12( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x12_ctx();

 #endif

+#endif
+
--- a/algo/x12/x12.c
+++ b/algo/x12/x12.c
@@ -20,35 +20,40 @@
 #include "algo/luffa/luffa_for_sse2.h" 
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
-#include "algo/blake/sse2/blake.c"   
-#include "algo/bmw/sse2/bmw.c"
-#include "algo/keccak/sse2/keccak.c"
-#include "algo/skein/sse2/skein.c"
-#include "algo/jh/sse2/jh_sse2_opt64.h"
 #if defined(__AES__)
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #endif

 typedef struct {
+   sph_blake512_context    blake;     // the five sph contexts below are set up by init_x12_ctx
+   sph_bmw512_context      bmw;
+   sph_skein512_context    skein;
+   sph_jh512_context       jh;
+   sph_keccak512_context   keccak;
 #if defined(__AES__)
-        hashState_groestl       groestl;
-        hashState_echo          echo;
+   hashState_groestl       groestl;   // AES build: hashState_* variants of groestl and echo
+   hashState_echo          echo;
 #else
-        sph_groestl512_context   groestl;
-        sph_echo512_context      echo;
+   sph_groestl512_context   groestl;  // non-AES build: sph variants
+   sph_echo512_context      echo;
 #endif
-        hashState_luffa         luffa;
-        cubehashParam           cubehash;
-        sph_shavite512_context  shavite;
-        hashState_sd            simd;
-        sph_hamsi512_context    hamsi;
+   hashState_luffa          luffa;
+   cubehashParam            cubehash;
+   sph_shavite512_context   shavite;
+   hashState_sd             simd;
+   sph_hamsi512_context     hamsi;
 } x12_ctx_holder;

 x12_ctx_holder x12_ctx;

 void init_x12_ctx()
 {
+        sph_blake512_init( &x12_ctx.blake );
+        sph_bmw512_init( &x12_ctx.bmw );
+        sph_skein512_init( &x12_ctx.skein);
+        sph_jh512_init( &x12_ctx.jh);
+        sph_keccak512_init( &x12_ctx.keccak);
 #if defined(__AES__)
        init_echo( &x12_ctx.echo, 512 );
        init_groestl (&x12_ctx.groestl, 64 );
@@ -65,102 +70,59 @@ void init_x12_ctx()

 void x12hash(void *output, const void *input)
 {
+   // X12 scalar path: stages ping-pong between hash and hashB; output gets 32 bytes.
 	unsigned char hash[128] __attribute__ ((aligned (32)));
 	#define hashB hash+64
      
-        x12_ctx_holder ctx;
-        memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );
+   x12_ctx_holder ctx;
+   memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );

-        // X11 algos
+   sph_blake512(&ctx.blake, input, 80);
+   sph_blake512_close(&ctx.blake, hash);

-        unsigned char hashbuf[128];
-        size_t hashptr;
-        sph_u64 hashctA;
-        sph_u64 hashctB;
+   sph_bmw512(&ctx.bmw, hash, 64);
+   sph_bmw512_close(&ctx.bmw, hash);

-        //---blake1---
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
+                           (const BitSequence*)hash, 64 );

-        DECL_BLK;
-        BLK_I;
-        BLK_W;
-        BLK_C;
+   cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
+                         (const byte*)hashB, 64 );

-        //---bmw2---
+   sph_shavite512( &ctx.shavite, hash, 64);
+   sph_shavite512_close( &ctx.shavite, hashB);

-        DECL_BMW;
-        BMW_I;
-        BMW_U;
-
-        #define M(x)    sph_dec64le_aligned(data + 8 * (x))
-        #define H(x)    (h[x])
-        #define dH(x)   (dh[x])
-
-        BMW_C;
-
-        #undef M
-        #undef H
-        #undef dH
-        
-        //---groetl----
+   update_final_sd( &ctx.simd, (BitSequence *)hash,
+                    (const BitSequence *)hashB, 512 );

 #if defined(__AES__)
-        update_and_final_groestl( &ctx.groestl, (char*)hash,
-                                  (const char*)hash, 512 );
-#else
-        sph_groestl512 (&ctx.groestl, hash, 64);
-        sph_groestl512_close(&ctx.groestl, hash);
-#endif
-
-        //---skein4---
-
-        DECL_SKN;
-        SKN_I;
-        SKN_U;
-        SKN_C;
-
-        //---jh5------
-
-        DECL_JH;
-        JH_H;
-
-        //---keccak6---
-
-        DECL_KEC;
-        KEC_I;
-        KEC_U;
-        KEC_C;
-
-        //--- luffa7
-        update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
-                                (const BitSequence*)hash, 64 );
-
-        // 8 Cube
-        cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
-                              (const byte*)hashB, 64 );
-
-        // 9 Shavite
-        sph_shavite512( &ctx.shavite, hash, 64);
-        sph_shavite512_close( &ctx.shavite, hashB);
-
-        // 10 Simd
-        update_final_sd( &ctx.simd, (BitSequence *)hash,
-                         (const BitSequence *)hashB, 512 );
-
-        //11---echo---
-
-#if defined(__AES__)
-        update_final_echo ( &ctx.echo, (BitSequence *)hashB,
+   update_final_echo ( &ctx.echo, (BitSequence *)hashB,
                            (const BitSequence *)hash, 512 );
 #else
-        sph_echo512(&ctx.echo, hash, 64);
-        sph_echo512_close(&ctx.echo, hashB);
+   sph_echo512(&ctx.echo, hash, 64);
+   sph_echo512_close(&ctx.echo, hashB);
 #endif

-        // 12 Hamsi
+#if defined(__AES__)
+   update_and_final_groestl( &ctx.groestl, (char*)hash,
+                                  (const char*)hashB, 512 );   // input is echo's output in hashB
+#else
+   sph_groestl512 (&ctx.groestl, hashB, 64);   // input is echo's output in hashB
+   sph_groestl512_close(&ctx.groestl, hash);
+#endif
+   // skein, jh and keccak work in place on hash; hamsi then writes hashB.
+   sph_skein512(&ctx.skein, hash, 64);
+   sph_skein512_close(&ctx.skein, hash);
+
+   sph_jh512(&ctx.jh, hash, 64);
+   sph_jh512_close(&ctx.jh, hash);
+
+   sph_keccak512(&ctx.keccak, hash, 64);
+   sph_keccak512_close(&ctx.keccak, hash);
+
-	sph_hamsi512(&ctx.hamsi, hashB, 64);
-	sph_hamsi512_close(&ctx.hamsi, hash);
+   sph_hamsi512(&ctx.hamsi, hash, 64);
+   sph_hamsi512_close(&ctx.hamsi, hashB);   // final digest lands where the memcpy below reads

-        asm volatile ("emms");
 	memcpy(output, hashB, 32);
 }

--- a/algo/x13/phi1612-4way.c
+++ b/algo/x13/phi1612-4way.c
@@ -1,7 +1,4 @@
 #include "phi1612-gate.h"
-
-#if defined(PHI1612_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,10 +6,193 @@
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined(PHI1612_8WAY)
+
+typedef struct {   // one context per PHI1612 stage; phi1612_8way_hash works on a copy
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    cube_4way_context       cube;    // used twice per call, once per group of 4 lanes
+    sph_fugue512_context    fugue;   // reused lane by lane
+    sph_gost512_context     gost;    // reused lane by lane
+    hashState_echo          echo;    // reused lane by lane
+} phi1612_8way_ctx_holder;
+
+phi1612_8way_ctx_holder phi1612_8way_ctx __attribute__ ((aligned (64)));
+
+void init_phi1612_8way_ctx()
+{   // Initialise every stage context of the global template phi1612_8way_ctx.
+     skein512_8way_init( &phi1612_8way_ctx.skein );
+     jh512_8way_init( &phi1612_8way_ctx.jh );
+     cube_4way_init( &phi1612_8way_ctx.cube, 512, 16, 32 );   // same arguments phi1612_8way_hash uses to re-init
+     sph_fugue512_init( &phi1612_8way_ctx.fugue );
+     sph_gost512_init( &phi1612_8way_ctx.gost );
+     init_echo( &phi1612_8way_ctx.echo, 512 );
+};
+
+void phi1612_8way_hash( void *state, const void *input )
+{   // Chain: skein, jh, cubehash, fugue, gost, echo; state gets 32 bytes per lane.
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     phi1612_8way_ctx_holder ctx;
+     memcpy( &ctx, &phi1612_8way_ctx, sizeof(phi1612_8way_ctx) );   // start from the initialised template
+
+     // Skein, all 8 lanes in one call
+     skein512_8way_update( &ctx.skein, input, 80 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     // JH, then split the result into one buffer per lane
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     // Cubehash, two passes of 4 lanes each with a re-init in between
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     // Fugue, one lane at a time; the context is re-initialised between lanes
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+
+     // Gost, one lane at a time
+     sph_gost512( &ctx.gost, hash0, 64 );
+     sph_gost512_close( &ctx.gost, hash0 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash1, 64 );
+     sph_gost512_close( &ctx.gost, hash1 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash2, 64 );
+     sph_gost512_close( &ctx.gost, hash2 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash3, 64 );
+     sph_gost512_close( &ctx.gost, hash3 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash4, 64 );
+     sph_gost512_close( &ctx.gost, hash4 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash5, 64 );
+     sph_gost512_close( &ctx.gost, hash5 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash6, 64 );
+     sph_gost512_close( &ctx.gost, hash6 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash7, 64 );
+     sph_gost512_close( &ctx.gost, hash7 );
+
+     // Echo, one lane at a time
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     memcpy( state,     hash0, 32 );   // lane i lands at byte offset 32*i
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_phi1612_8way( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr )
+{  // Tests 8 nonces per pass starting at work->data[19]; always returns 0.
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));   // 8 words per lane
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     const uint32_t first_nonce = pdata[19];
+     uint32_t n = first_nonce;
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+     int thr_id = mythr->id;
+
+     if ( opt_benchmark )
+          ( (uint32_t*)ptarget )[7] = 0x0cff;
+     const uint32_t Htarg = ptarget[7];   // read after the benchmark override so the prefilter sees it
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     do {
+           *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                 n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+        phi1612_8way_hash( hash, vdata );
+        pdata[19] = n;
+
+        for ( int i = 0; i < 8; i++ )
+        if ( (hash+(i<<3))[7] <= Htarg )   // cheap top-word filter before the full compare
+        if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n+i;
+           submit_lane_solution( work, hash+(i<<3), mythr, i );
+        }
+        n += 8;
+     } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;   // number of nonces covered by this call
+     return 0;
+}
+
+#elif defined(PHI1612_4WAY)
+
+
 typedef struct {
    skein512_4way_context   skein;
    jh512_4way_context      jh;
--- a/algo/x13/phi1612-gate.c
+++ b/algo/x13/phi1612-gate.c
@@ -2,7 +2,11 @@

 bool register_phi1612_algo( algo_gate_t* gate )
 {
-#if defined(PHI1612_4WAY)
+#if defined(PHI1612_8WAY)
+  init_phi1612_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_phi1612_8way;
+  gate->hash      = (void*)&phi1612_8way_hash;
+#elif defined(PHI1612_4WAY)
  init_phi1612_4way_ctx();
  gate->scanhash  = (void*)&scanhash_phi1612_4way;
  gate->hash      = (void*)&phi1612_4way_hash;
@@ -11,7 +15,7 @@ bool register_phi1612_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_phi1612;
  gate->hash      = (void*)&phi1612_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x13/phi1612-gate.h
+++ b/algo/x13/phi1612-gate.h
@@ -4,29 +4,35 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define PHI1612_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define PHI1612_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define PHI1612_4WAY 1
 #endif

 bool register_phi1612_algo( algo_gate_t* gate );

-#if defined(PHI1612_4WAY)
+#if defined(PHI1612_8WAY)
+
+void phi1612_8way_hash( void *state, const void *input );
+int scanhash_phi1612_8way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );
+void init_phi1612_8way_ctx();
+
+#elif defined(PHI1612_4WAY)

 void phi1612_4way_hash( void *state, const void *input );
-
 int scanhash_phi1612_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_phi1612_4way_ctx();

-#endif
+#else

 void phi1612_hash( void *state, const void *input );
-
 int scanhash_phi1612( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_phi1612_ctx();

 #endif
+#endif

--- a/algo/x13/skunk-4way.c
+++ b/algo/x13/skunk-4way.c
@@ -1,7 +1,4 @@
 #include "skunk-gate.h"
-
-#if defined(SKUNK_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -10,6 +7,146 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
+
+#if defined(SKUNK_8WAY)
+
+typedef struct {   // One context per hash stage used by skunk_8way_hash.
+    skein512_8way_context skein;
+    cube_4way_context     cube;   // 4-way context: the 8 lanes are run as two batches of 4
+    sph_fugue512_context  fugue;   // single-lane context, reused for each lane in turn
+    sph_gost512_context   gost;   // single-lane context, reused for each lane in turn
+} skunk_8way_ctx_holder;
+
+static __thread skunk_8way_ctx_holder skunk_8way_ctx;   // per-thread template set up by skunk_8way_thread_init and copied at the start of each hash
+
+void skunk_8way_hash( void *output, const void *input )   // Skunk chain (Skein, CubeHash, Fugue, GOST) over 8 lanes; lane i's first 32 digest bytes land at output + 32*i.
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));   // interleaved working buffer
+     uint64_t hash0[8] __attribute__ ((aligned (64)));   // hash0..hash7: one 64-byte buffer per lane
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+
+     skunk_8way_ctx_holder ctx __attribute__ ((aligned (64)));
+     memcpy( &ctx, &skunk_8way_ctx, sizeof(skunk_8way_ctx) );   // start from the per-thread pre-initialised contexts
+     skein512_8way_update( &ctx.skein, input, 80 );   // 80 bytes of input per lane
+     skein512_8way_close( &ctx.skein, vhash );
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                        hash7, vhash, 512 );
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );   // CubeHash is 4-way: lanes 0-3 first
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); 
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );   // then lanes 4-7 with a freshly initialised context
+     cube_4way_init( &ctx.cube, 512, 16, 32 );           
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );  
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+     
+     sph_fugue512( &ctx.fugue, hash0, 64 );   // Fugue is single-lane: hash each lane in place, re-initialising between lanes
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_init( &ctx.fugue );   // re-initialise before lane 4 as well, matching every other lane
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+
+     sph_gost512( &ctx.gost, hash0, 64 );   // GOST is single-lane: lanes 0-6 are closed straight into output at a 32-byte stride
+     sph_gost512_close( &ctx.gost, output );   // anything written past a lane's 32-byte slot is overwritten by the next lane's close
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash1, 64 );
+     sph_gost512_close( &ctx.gost, output+ 32 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash2, 64 );
+     sph_gost512_close( &ctx.gost, output+ 64 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash3, 64 );
+     sph_gost512_close( &ctx.gost, output+ 96 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash4, 64 );
+     sph_gost512_close( &ctx.gost, output+128 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash5, 64 );
+     sph_gost512_close( &ctx.gost, output+160 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash6, 64 );
+     sph_gost512_close( &ctx.gost, output+192 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash7, 64 );
+     sph_gost512_close( &ctx.gost, hash7 );   // last lane has no following slot: close into its scratch buffer (presumably a 64-byte digest - confirm)
+     memcpy( output+224, hash7, 32 );   // copy only 32 bytes so no store goes past the 8 x 32-byte output
+}
+
+int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,   // Scans nonces in groups of 8, starting at work->data[19].
+                         uint64_t *hashes_done, struct thr_info *mythr )   // *hashes_done receives the count of nonces tried; always returns 0.
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));   // result: 8 lanes x 8 words, lane i at hash+(i<<3)
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));   // 8-lane input buffer handed to the hash
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; 10th 512-bit vector of vdata, rewritten with the nonces on every pass
+   const uint32_t Htarg = ptarget[7];   // NOTE(review): read before the benchmark override below, so Htarg keeps the original target word
+   int thr_id = mythr->id;  
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);   // this thread's restart flag, polled once per pass
+
+   if ( opt_benchmark )
+      ((uint32_t*)ptarget)[7] = 0x0cff;
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );   // presumably byte-swaps the 80-byte header and interleaves it 8x64 - TODO confirm
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(   // vector built below carries n..n+7 in its odd 32-bit elements
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );   // presumably the blend keeps the other elements of *noncev - confirm
+
+      skunk_8way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )   // cheap check on word 7 of lane i before the full comparison
+      if ( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )   // nothing is submitted in benchmark mode
+      {
+         pdata[19] = n+i;   // record the nonce of the lane that passed
+         submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n +=8;
+   } while ( likely( ( n < max_nonce-8 ) && !(*restart) ) );   // NOTE(review): max_nonce-8 is unsigned and wraps if max_nonce < 8
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+bool skunk_8way_thread_init()   // Initialises the calling thread's skunk_8way_ctx template; always returns true.
+{
+   skein512_8way_init( &skunk_8way_ctx.skein );
+   cube_4way_init( &skunk_8way_ctx.cube, 512, 16, 32 );   // same parameters as the mid-hash re-init in skunk_8way_hash
+   sph_fugue512_init( &skunk_8way_ctx.fugue );
+   sph_gost512_init( &skunk_8way_ctx.gost );
+   return true;
+}
+
+#elif defined(SKUNK_4WAY)

 typedef struct {
    skein512_4way_context skein;
--- a/algo/x13/skunk-gate.c
+++ b/algo/x13/skunk-gate.c
@@ -2,12 +2,15 @@

 bool register_skunk_algo( algo_gate_t* gate )
 {
-   gate->optimizations = SSE2_OPT | AVX2_OPT;
-#if defined (SKUNK_4WAY)
+   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+#if defined (SKUNK_8WAY)
+   gate->miner_thread_init = (void*)&skunk_8way_thread_init;
+   gate->scanhash = (void*)&scanhash_skunk_8way;
+   gate->hash     = (void*)&skunk_8way_hash;
+#elif defined (SKUNK_4WAY)
   gate->miner_thread_init = (void*)&skunk_4way_thread_init;
   gate->scanhash = (void*)&scanhash_skunk_4way;
   gate->hash     = (void*)&skunk_4way_hash;
-//   init_skunk_4way_ctx();
 #else
   gate->miner_thread_init = (void*)&skunk_thread_init;
   gate->scanhash = (void*)&scanhash_skunk;
--- a/algo/x13/skunk-gate.h
+++ b/algo/x13/skunk-gate.h
@@ -4,29 +4,33 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
-  #define SKUNK_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define SKUNK_8WAY 1
+#elif defined(__AVX2__)
+  #define SKUNK_4WAY 1
 #endif

 bool register_skunk_algo( algo_gate_t* gate );

-#if defined(SKUNK_4WAY)
+#if defined(SKUNK_8WAY)
+
+void skunk_8way_hash( void *state, const void *input );
+int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+bool skunk_8way_thread_init();
+
+#elif defined(SKUNK_4WAY)

 void skunk_4way_hash( void *state, const void *input );
-
 int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
-
 bool skunk_4way_thread_init();
-//void init_skunk_4way_ctx();

 #endif

 void skunkhash( void *state, const void *input );
-
 int scanhash_skunk( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
-
 bool skunk_thread_init();

 #endif
--- a/algo/x13/x13-4way.c
+++ b/algo/x13/x13-4way.c
@@ -1,7 +1,4 @@
 #include "x13-gate.h"
-
-#if defined(X13_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -14,12 +11,270 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"

+#if defined(X13_8WAY)
+
+typedef struct {   // One context per X13 stage, in chain order, as used by x13_8way_hash.
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;   // single-lane: reused for each of the 8 lanes
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;   // 4-way: the 8 lanes are run as two batches of 4
+    cube_4way_context       cube;   // 4-way
+    sph_shavite512_context  shavite;   // single-lane
+    simd_4way_context       simd;   // 4-way
+    hashState_echo          echo;   // single-lane
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;   // single-lane
+} x13_8way_ctx_holder;
+
+x13_8way_ctx_holder x13_8way_ctx;   // global template filled by init_x13_8way_ctx and copied at the start of each hash
+
+void init_x13_8way_ctx()   // Initialises every stage context of the global x13_8way_ctx template.
+{
+     blake512_8way_init( &x13_8way_ctx.blake );
+     bmw512_8way_init( &x13_8way_ctx.bmw );
+     init_groestl( &x13_8way_ctx.groestl, 64 );   // 64: presumably the digest size in bytes - TODO confirm
+     skein512_8way_init( &x13_8way_ctx.skein );
+     jh512_8way_init( &x13_8way_ctx.jh );
+     keccak512_8way_init( &x13_8way_ctx.keccak );
+     luffa_4way_init( &x13_8way_ctx.luffa, 512 );
+     cube_4way_init( &x13_8way_ctx.cube, 512, 16, 32 );   // same parameters as the mid-hash re-init in x13_8way_hash
+     sph_shavite512_init( &x13_8way_ctx.shavite );
+     simd_4way_init( &x13_8way_ctx.simd, 512 );
+     init_echo( &x13_8way_ctx.echo, 512 );
+     hamsi512_8way_init( &x13_8way_ctx.hamsi );
+     sph_fugue512_init( &x13_8way_ctx.fugue );
+}
+
+void x13_8way_hash( void *state, const void *input )   // X13 chain over 8 lanes; lane i's first 32 digest bytes are copied to state + 32*i.
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));   // 8x64 interleaved working buffer (also reused for 4x128 batches)
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));   // 4x128 batch for lanes 0-3
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));   // 4x128 batch for lanes 4-7
+     uint64_t hash0[8] __attribute__ ((aligned (64)));   // hash0..hash7: one 64-byte buffer per lane
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+
+     x13_8way_ctx_holder ctx;
+     memcpy( &ctx, &x13_8way_ctx, sizeof(x13_8way_ctx) );   // start from the pre-initialised template
+     blake512_8way_update( &ctx.blake, input, 80 );   // 80 bytes of input per lane
+     blake512_8way_close( &ctx.blake, vhash );
+
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );   // Groestl is single-lane: hash in place, restore the context from the template between lanes
+     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     // Luffa and CubeHash are 4-way: re-interleave straight from vhash into
+     // two 4x128 batches; hash0..hash7 are refilled from them after CubeHash.
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
+
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     luffa_4way_init( &ctx.luffa, 512 );   // fresh context for the second batch
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );   // fresh context for the second batch
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+
+     sph_shavite512( &ctx.shavite, hash0, 64 );   // Shavite is single-lane: hash in place, restore the context from the template between lanes
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );   // SIMD is 4-way: lanes 0-3, then lanes 4-7 with a fresh context
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );   // 512: presumably the length in bits here - TODO confirm
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,   // Echo is single-lane: hash in place, restore the context from the template between lanes
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );
+     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
+     hamsi512_8way_close( &ctx.hamsi, vhash );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                       vhash );
+
+     // 13 Fugue serial
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+     
+     memcpy( state,     hash0, 32 );   // only the first 32 bytes of each lane's result are returned
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+
+int scanhash_x13_8way( struct work *work, uint32_t max_nonce,   // Scans nonces in groups of 8, starting at work->data[19].
+                   uint64_t *hashes_done, struct thr_info *mythr )   // *hashes_done receives the count of nonces tried; always returns 0.
+{
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));   // result: 8 lanes x 8 words, lane i at hash+(i<<3)
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));   // 8-lane input buffer handed to the hash
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     int thr_id = mythr->id;
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; 10th 512-bit vector of vdata, rewritten with the nonces on every pass
+     const uint32_t Htarg = ptarget[7];
+     const uint32_t last_nonce = max_nonce -8;   // NOTE(review): unsigned, wraps if max_nonce < 8
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );   // presumably byte-swaps the 80-byte header and interleaves it 8x64 - TODO confirm
+
+     do
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(   // vector built below carries n..n+7 in its odd 32-bit elements
+         _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                           n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+         x13_8way_hash( hash, vdata );
+         pdata[19] = n;
+
+         for ( int i = 0; i < 8; i++ )
+         if ( ( hash+(i<<3) )[7] <= Htarg   // <= so a lane whose top word equals the target word still reaches fulltest
+              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )   // nothing is submitted in benchmark mode
+         {
+             pdata[19] = n+i;   // record the nonce of the lane that passed
+             submit_lane_solution( work, hash+(i<<3), mythr, i );
+         }
+         n += 8;
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
+     return 0;
+}
+
+
+#elif defined(X13_4WAY)
+
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
--- a/algo/x13/x13-gate.c
+++ b/algo/x13/x13-gate.c
@@ -2,7 +2,11 @@

 bool register_x13_algo( algo_gate_t* gate )
 {
-#if defined (X13_4WAY)
+#if defined (X13_8WAY)
+  init_x13_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_x13_8way;
+  gate->hash      = (void*)&x13_8way_hash;
+#elif defined (X13_4WAY)
  init_x13_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x13_4way;
  gate->hash      = (void*)&x13_4way_hash;
@@ -11,7 +15,7 @@ bool register_x13_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x13;
  gate->hash      = (void*)&x13hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x13/x13-gate.h
+++ b/algo/x13/x13-gate.h
@@ -4,29 +4,35 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X13_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X13_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X13_4WAY 1
 #endif

 bool register_x13_algo( algo_gate_t* gate );

-#if defined(X13_4WAY)
+#if defined(X13_8WAY)
+
+void x13_8way_hash( void *state, const void *input );
+int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+void init_x13_8way_ctx();
+
+#elif defined(X13_4WAY)

 void x13_4way_hash( void *state, const void *input );
-
 int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x13_4way_ctx();

-#endif
+#else

 void x13hash( void *state, const void *input );
-
 int scanhash_x13( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x13_ctx();

 #endif

+#endif
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -1,7 +1,4 @@
 #include "x14-gate.h"
-
-#if defined(X14_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -13,6 +10,7 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
@@ -22,6 +20,263 @@
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"

+#if defined(X14_8WAY)
+
+typedef struct {   // One context per X14 stage, in chain order, as used by x14_8way_hash.
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;   // single-lane: reused for each of the 8 lanes
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;   // 4-way: the 8 lanes are run as two batches of 4
+    cube_4way_context       cube;   // 4-way
+    sph_shavite512_context  shavite;   // single-lane
+    simd_4way_context       simd;   // 4-way
+    hashState_echo          echo;   // single-lane
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;   // single-lane
+    shabal512_8way_context  shabal;
+} x14_8way_ctx_holder;
+
+x14_8way_ctx_holder x14_8way_ctx __attribute__ ((aligned (64)));   // global template filled by init_x14_8way_ctx and copied at the start of each hash
+
+void init_x14_8way_ctx()   // Initialises every stage context of the global x14_8way_ctx template.
+{
+     blake512_8way_init( &x14_8way_ctx.blake );
+     bmw512_8way_init( &x14_8way_ctx.bmw );
+     init_groestl( &x14_8way_ctx.groestl, 64 );   // 64: presumably the digest size in bytes - TODO confirm
+     skein512_8way_init( &x14_8way_ctx.skein );
+     jh512_8way_init( &x14_8way_ctx.jh );
+     keccak512_8way_init( &x14_8way_ctx.keccak );
+     luffa_4way_init( &x14_8way_ctx.luffa, 512 );
+     cube_4way_init( &x14_8way_ctx.cube, 512, 16, 32 );   // same parameters as the mid-hash re-init in x14_8way_hash
+     sph_shavite512_init( &x14_8way_ctx.shavite );
+     simd_4way_init( &x14_8way_ctx.simd, 512 );
+     init_echo( &x14_8way_ctx.echo, 512 );
+     hamsi512_8way_init( &x14_8way_ctx.hamsi );
+     sph_fugue512_init( &x14_8way_ctx.fugue );
+     shabal512_8way_init( &x14_8way_ctx.shabal );
+}
+
+void x14_8way_hash( void *state, const void *input )   // X14 chain over 8 lanes; the final Shabal stage writes its 8x32 interleaved result straight to state.
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));   // interleaved working buffer
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));   // 4x128 batch for lanes 0-3
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));   // 4x128 batch for lanes 4-7
+     uint64_t hash0[8] __attribute__ ((aligned (64)));   // hash0..hash7: one 64-byte buffer per lane
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+
+     x14_8way_ctx_holder ctx;
+     memcpy( &ctx, &x14_8way_ctx, sizeof(x14_8way_ctx) );   // start from the pre-initialised template
+     blake512_8way_update( &ctx.blake, input, 80 );   // 80 bytes of input per lane
+     blake512_8way_close( &ctx.blake, vhash );
+
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );   // Groestl is single-lane: hash in place, restore the context from the template between lanes
+     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );   // Luffa and CubeHash are 4-way: split vhash into two batches of 4 lanes
+
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     luffa_4way_init( &ctx.luffa, 512 );   // fresh context for the second batch
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );   // fresh context for the second batch
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+     
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+
+     sph_shavite512( &ctx.shavite, hash0, 64 );   // Shavite is single-lane: hash in place, restore the context from the template between lanes
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );   // SIMD is 4-way: lanes 0-3, then lanes 4-7 with a fresh context
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );   // 512: presumably the length in bits here - TODO confirm
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,   // Echo is single-lane: hash in place, restore the context from the template between lanes
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );
+     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
+     hamsi512_8way_close( &ctx.hamsi, vhash );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                       vhash );
+
+     // 13 Fugue serial
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+
+     // 14 Shabal, parallel 32 bit
+     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );
+     shabal512_8way_update( &ctx.shabal, vhash, 64 );
+     shabal512_8way_close( &ctx.shabal, state );   // result stays interleaved; scanhash_x14_8way extracts individual lanes from it
+}
+
+
+int scanhash_x14_8way( struct work *work, uint32_t max_nonce,   // Scans nonces in groups of 8, starting at work->data[19].
+                       uint64_t *hashes_done, struct thr_info *mythr )   // *hashes_done receives the count of nonces tried; always returns 0.
+{
+     uint32_t hash[8*16] __attribute__ ((aligned (64)));   // 8x32 interleaved result: word w of lane l at hash[(w<<3)+l]
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));   // 8-lane input buffer handed to the hash
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     const uint32_t last_nonce = max_nonce - 8;   // NOTE(review): unsigned, wraps if max_nonce < 8
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; 10th 512-bit vector of vdata, rewritten with the nonces on every pass
+     const uint32_t Htarg = ptarget[7];
+     int thr_id = mythr->id;
+
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );   // presumably byte-swaps the 80-byte header and interleaves it 8x64 - TODO confirm
+     do
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(   // vector built below carries n..n+7 in its odd 32-bit elements
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+        x14_8way_hash( hash, vdata );
+        pdata[19] = n;
+
+        uint32_t *hash7 = &(hash[7<<3]);   // row holding word 7 of all 8 lanes
+        for ( int lane = 0; lane < 8; lane++ )
+        if ( hash7[ lane ] <= Htarg )   // <= so a lane whose top word equals the target word still reaches fulltest
+        {
+            uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+            extr_lane_8x32( lane_hash, hash, lane, 256 );   // pull this lane's 256-bit hash out of the interleaved buffer
+            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )   // nothing is submitted in benchmark mode
+            {
+                pdata[19] = n + lane;   // record the nonce of the lane that passed
+                submit_lane_solution( work, lane_hash, mythr, lane );
+            }
+         }
+         n += 8;
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
+     return 0;
+}
+
+#elif defined(X14_4WAY)
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
@@ -61,11 +316,11 @@ void init_x14_4way_ctx()

 void x14_4way_hash( void *state, const void *input )
 {
+     uint64_t vhash[8*4] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
-     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     x14_4way_ctx_holder ctx;
     memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );

@@ -184,61 +439,49 @@ void x14_4way_hash( void *state, const void *input )

     // 14 Shabal, parallel 32 bit
     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, state );
 }

 int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*16] __attribute__ ((aligned (64)));
+     uint32_t hash[4*16] __attribute__ ((aligned (128)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
-     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
+     uint32_t n = first_nonce;
+     const uint32_t last_nonce = max_nonce - 4;
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
-     uint64_t htmax[] = {          0,        0xF,       0xFF,
-                               0xFFF,     0xFFFF, 0x10000000  };
-     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
-                          0xFFFFF000, 0xFFFF0000,          0  };
-
+     int thr_id = mythr->id;  
     mm256_bswap32_intrlv80_4x64( vdata, pdata );

-     for ( int m=0; m < 6; m++ )
-       if ( Htarg <= htmax[m] )
+     do
+     {
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+             _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+
+       x14_4way_hash( hash, vdata );
+       pdata[19] = n;
+
+       uint32_t *hash7 = &(hash[7<<2]);
+       for ( int lane = 0; lane < 4; lane++ )
+       if ( hash7[ lane ] < Htarg )
       {
-         uint32_t mask = masks[m];
-         do
-         {
-           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+           uint32_t lane_hash[8];
+           extr_lane_4x32( lane_hash, hash, lane, 256 );

-            x14_4way_hash( hash, vdata );
-            pdata[19] = n;
-
-            uint32_t *hash7 = &(hash[7<<2]);
-
-            for ( int lane = 0; lane < 4; lane++ )
-            if ( ( hash7[ lane ] & mask ) == 0 )
-            {
-               // deinterleave hash for lane
-               uint32_t lane_hash[8];
-               extr_lane_4x32( lane_hash, hash, lane, 256 );
-
-               if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-               {
-                  pdata[19] = n + lane;
-                  submit_lane_solution( work, lane_hash, mythr, lane );
-               }
-            }
-            n += 4;
-         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
-         break;
-       }
-     *hashes_done = n - first_nonce + 1;
+           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+           {
+               pdata[19] = n + lane;
+               submit_lane_solution( work, lane_hash, mythr, lane );
+           }
+        }
+        n += 4;
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
     return 0;
 }

--- a/algo/x14/x14-gate.c
+++ b/algo/x14/x14-gate.c
@@ -2,7 +2,11 @@

 bool register_x14_algo( algo_gate_t* gate )
 {
-#if defined (X14_4WAY)
+#if defined (X14_8WAY)
+  init_x14_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_x14_8way;
+  gate->hash      = (void*)&x14_8way_hash;
+#elif defined (X14_4WAY)
  init_x14_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x14_4way;
  gate->hash      = (void*)&x14_4way_hash;
@@ -11,7 +15,7 @@ bool register_x14_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x14;
  gate->hash      = (void*)&x14hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x14/x14-gate.h
+++ b/algo/x14/x14-gate.h
@@ -4,20 +4,29 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X14_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X14_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X14_4WAY 1
 #endif

 bool register_x14_algo( algo_gate_t* gate );

-#if defined(X14_4WAY)
+#if defined(X14_8WAY)
+
+void x14_8way_hash( void *state, const void *input );
+int scanhash_x14_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+void init_x14_8way_ctx();
+
+#elif defined(X14_4WAY)

 void x14_4way_hash( void *state, const void *input );
 int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x14_4way_ctx();

-#endif
+#else

 void x14hash( void *state, const void *input );
 int scanhash_x14( struct work *work, uint32_t max_nonce,
@@ -26,3 +35,4 @@ void init_x14_ctx();

 #endif

+#endif
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -1,7 +1,4 @@
 #include "x15-gate.h"
-
-#if defined(X15_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -14,6 +11,7 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -23,6 +21,309 @@
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"

+#if defined(X15_8WAY)
+
+
+typedef struct {   // one context per X15 stage, declared in chain order
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;      // single-lane: reused for each of the 8 lanes
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;        // 4-way: run twice (lanes 0-3, then 4-7)
+    cube_4way_context       cube;         // 4-way: run twice
+    sph_shavite512_context  shavite;      // single-lane
+    simd_4way_context       simd;         // 4-way: run twice
+    hashState_echo          echo;         // single-lane
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;        // single-lane
+    shabal512_8way_context  shabal;
+    sph_whirlpool_context   whirlpool;    // single-lane
+} x15_8way_ctx_holder;
+
+x15_8way_ctx_holder x15_8way_ctx __attribute__ ((aligned (64)));   // template filled by init_x15_8way_ctx()
+
+void init_x15_8way_ctx()
+{    // Fill the global template that x15_8way_hash() copies at the start of each call.
+     blake512_8way_init( &x15_8way_ctx.blake );
+     bmw512_8way_init( &x15_8way_ctx.bmw );
+     init_groestl( &x15_8way_ctx.groestl, 64 );
+     skein512_8way_init( &x15_8way_ctx.skein );
+     jh512_8way_init( &x15_8way_ctx.jh );
+     keccak512_8way_init( &x15_8way_ctx.keccak );
+     luffa_4way_init( &x15_8way_ctx.luffa, 512 );
+     cube_4way_init( &x15_8way_ctx.cube, 512, 16, 32 );   // same arguments x15_8way_hash() re-inits with
+     sph_shavite512_init( &x15_8way_ctx.shavite );
+     simd_4way_init( &x15_8way_ctx.simd, 512 );
+     init_echo( &x15_8way_ctx.echo, 512 );
+     hamsi512_8way_init( &x15_8way_ctx.hamsi );
+     sph_fugue512_init( &x15_8way_ctx.fugue );
+     shabal512_8way_init( &x15_8way_ctx.shabal );
+     sph_whirlpool_init( &x15_8way_ctx.whirlpool );
+}
+
+void x15_8way_hash( void *state, const void *input )
+{    // X15 chain on 8 lanes; writes 8 consecutive 32-byte results to state.
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));   // all 8 lanes, interleaved
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));   // lanes 0-3 for the 4-way stages
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));   // lanes 4-7 for the 4-way stages
+     uint64_t hash0[8] __attribute__ ((aligned (64)));      // hash0..hash7: one lane each
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     x15_8way_ctx_holder ctx;
+     memcpy( &ctx, &x15_8way_ctx, sizeof(x15_8way_ctx) );   // start from the initialised template
+
+     // 1 Blake
+     blake512_8way_update( &ctx.blake, input, 80 );
+     blake512_8way_close( &ctx.blake, vhash );
+
+     // 2 Bmw
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                       vhash );
+
+     // 3 Groestl
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );   // fresh state for the next lane
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );   // 4 Skein (lanes re-interleaved first)
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     // 5 JH
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     // 6 Keccak
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                       vhash );   // NOTE(review): hash0..7 are overwritten after Cube before being read - looks redundant
+
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );   // split into two 4-lane halves
+
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );   // 7 Luffa, lanes 0-3
+     luffa_4way_init( &ctx.luffa, 512 );                          // reset before the second half
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );   // lanes 4-7
+
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );     // 8 Cubehash, lanes 0-3
+     cube_4way_init( &ctx.cube, 512, 16, 32 );                    // reset before the second half
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );     // lanes 4-7
+
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+
+     // 9 Shavite
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );   // fresh state for the next lane
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     // 10 Simd
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     simd_4way_init( &ctx.simd, 512 );   // reset before the second half
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     // 11 Echo
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );   // fresh state for the next lane
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+
+     // 12 Hamsi, parallel 8-way 64 bit
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );
+     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
+     hamsi512_8way_close( &ctx.hamsi, vhash );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                       vhash );
+
+     // 13 Fugue
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );   // fresh state for the next lane
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+
+
+     // 14 Shabal, parallel 32 bit
+     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );
+     shabal512_8way_update( &ctx.shabal, vhash, 64 );
+     shabal512_8way_close( &ctx.shabal, vhash );
+     dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                       vhash );
+
+     // 15 Whirlpool
+     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash0 );
+     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
+             sizeof(sph_whirlpool_context) );   // fresh state for the next lane
+     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash1 );
+     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
+             sizeof(sph_whirlpool_context) );
+     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash2 );
+     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
+             sizeof(sph_whirlpool_context) );
+     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash3 );
+     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
+             sizeof(sph_whirlpool_context) );
+     sph_whirlpool( &ctx.whirlpool, hash4, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash4 );
+     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
+             sizeof(sph_whirlpool_context) );
+     sph_whirlpool( &ctx.whirlpool, hash5, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash5 );
+     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
+             sizeof(sph_whirlpool_context) );
+     sph_whirlpool( &ctx.whirlpool, hash6, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash6 );
+     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
+             sizeof(sph_whirlpool_context) );
+     sph_whirlpool( &ctx.whirlpool, hash7, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash7 );
+
+     memcpy( state,    hash0, 32 );   // only the first 32 bytes of each lane are kept,
+     memcpy( state+ 32, hash1, 32 );  // stored lane after lane (not interleaved)
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr )
+{    // Scan nonces 8 at a time from work->data[19]; always returns 0.
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));   // 8 lanes x 8 words, lane after lane
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     const uint32_t first_nonce = pdata[19];
+     uint32_t n = first_nonce;
+     const uint32_t last_nonce = max_nonce - 8;
+     __m512i  *noncev = (__m512i*)vdata + 9;   // vector rewritten with fresh nonces each pass
+     const uint32_t Htarg = ptarget[7];
+     int thr_id = mythr->id;
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     do
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+        x15_8way_hash( hash, vdata );
+        pdata[19] = n;
+
+        for ( int i = 0; i < 8; i++ )
+        if ( ( hash+(i<<3) )[7] < Htarg )   // cheap reject on the top word of lane i
+        if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n+i;
+           submit_lane_solution( work, hash+(i<<3), mythr, i );   // pass lane i's hash, not lane 0's
+        }
+        n += 8;
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;   // nonces tried in this call
+     return 0;
+}
+
+#elif defined(X15_4WAY)
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
@@ -64,11 +365,11 @@ void init_x15_4way_ctx()

 void x15_4way_hash( void *state, const void *input )
 {
+     uint64_t vhash[8*4] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
-     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     x15_4way_ctx_holder ctx;
     memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );

@@ -187,7 +488,7 @@ void x15_4way_hash( void *state, const void *input )

     // 14 Shabal, parallel 32 bit
     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
       
@@ -216,48 +517,37 @@ void x15_4way_hash( void *state, const void *input )
 int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t hash[4*8] __attribute__ ((aligned (128)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
-     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+     uint32_t n = first_nonce;
+     const uint32_t last_nonce = max_nonce - 4;
+     __m256i  *noncev = (__m256i*)vdata + 9;
     const uint32_t Htarg = ptarget[7];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
-     uint64_t htmax[] = {          0,        0xF,       0xFF,
-                               0xFFF,     0xFFFF, 0x10000000  };
-     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
-                          0xFFFFF000, 0xFFFF0000,          0  };
-
-
+     int thr_id = mythr->id;  
     mm256_bswap32_intrlv80_4x64( vdata, pdata );

-     for ( int m=0; m < 6; m++ )
-       if ( Htarg <= htmax[m] )
-       {
-         uint32_t mask = masks[m];
-         do
+     do
+     {
+        *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+
+         x15_4way_hash( hash, vdata );
+         pdata[19] = n;
+
+         for ( int i = 0; i < 4; i++ )
+         if ( ( hash+(i<<3) )[7] < Htarg )
+         if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
         {
-           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+            pdata[19] = n+i;
+            submit_lane_solution( work, hash, mythr, i );
+         }
+         n += 4;
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );

-            x15_4way_hash( hash, vdata );
-            pdata[19] = n;
-
-            for ( int i = 0; i < 4; i++ )
-            if ( ( (hash+(i<<3))[7] & mask ) == 0 )
-            if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-            {
-               pdata[19] = n+i;
-               submit_lane_solution( work, hash, mythr, i );
-            }
-            n += 4;
-         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
-         break;
-       }
-
-     *hashes_done = n - first_nonce + 1;
+     *hashes_done = n - first_nonce;
     return 0;
 }

--- a/algo/x15/x15-gate.c
+++ b/algo/x15/x15-gate.c
@@ -2,7 +2,11 @@

 bool register_x15_algo( algo_gate_t* gate )
 {
-#if defined (X15_4WAY)
+#if defined (X15_8WAY)
+  init_x15_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_x15_8way;
+  gate->hash      = (void*)&x15_8way_hash;
+#elif defined (X15_4WAY)
  init_x15_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x15_4way;
  gate->hash      = (void*)&x15_4way_hash;
@@ -11,7 +15,7 @@ bool register_x15_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x15;
  gate->hash      = (void*)&x15hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x15/x15-gate.h
+++ b/algo/x15/x15-gate.h
@@ -4,20 +4,30 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X15_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X15_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X15_4WAY 1
 #endif

+
 bool register_x15_algo( algo_gate_t* gate );

-#if defined(X15_4WAY)
+#if defined(X15_8WAY)
+
+void x15_8way_hash( void *state, const void *input );
+int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+void init_x15_8way_ctx();
+
+#elif defined(X15_4WAY)

 void x15_4way_hash( void *state, const void *input );
 int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x15_4way_ctx();

-#endif
+#else

 void x15hash( void *state, const void *input );
 int scanhash_x15( struct work *work, uint32_t max_nonce,
@@ -26,3 +36,5 @@ void init_x15_ctx();

 #endif

+#endif
+
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -5,9 +5,6 @@
 * Optimized by JayDDee@github Jan 2018
 */
 #include "x16r-gate.h"
-
-#if defined (X16R_4WAY)
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -20,6 +17,7 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -32,6 +30,392 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

+#if defined (X16R_8WAY)
+// Contexts for every X16R stage of the 8-way code, overlaid in one union so the stages share storage.
+union _x16r_8way_context_overlay
+{
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;
+    cube_4way_context       cube;
+    sph_shavite512_context  shavite;
+    simd_4way_context       simd;
+    hashState_echo          echo;
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;
+    shabal512_8way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_8way_context     sha512;
+} __attribute__ ((aligned (64)));   // 64-byte alignment requested for the whole overlay
+
+typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
+// 8-lane X16R: input is 8x64-interleaved 80-byte data, output receives 8 x 32 bytes (lane k at 32*k).
+void x16r_8way_hash( void* output, const void* input )
+{
+   uint32_t vhash[24*8] __attribute__ ((aligned (128)));   // interleaved scratch
+   uint32_t hash0[24] __attribute__ ((aligned (64)));      // per-lane buffers,
+   uint32_t hash1[24] __attribute__ ((aligned (64)));      // used as both the
+   uint32_t hash2[24] __attribute__ ((aligned (64)));      // input and the output
+   uint32_t hash3[24] __attribute__ ((aligned (64)));      // of every stage
+   uint32_t hash4[24] __attribute__ ((aligned (64)));
+   uint32_t hash5[24] __attribute__ ((aligned (64)));
+   uint32_t hash6[24] __attribute__ ((aligned (64)));
+   uint32_t hash7[24] __attribute__ ((aligned (64)));
+   x16r_8way_context_overlay ctx;
+   void *in0 = (void*) hash0;
+   void *in1 = (void*) hash1;
+   void *in2 = (void*) hash2;
+   void *in3 = (void*) hash3;
+   void *in4 = (void*) hash4;
+   void *in5 = (void*) hash5;
+   void *in6 = (void*) hash6;
+   void *in7 = (void*) hash7;
+   int size = 80;   // bytes fed to the current stage; set to 64 after each stage
+   // Per-lane copies of the input, needed when the first stage is not an 8x64 one.
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 input, 640 );
+
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];   // stage selector, decoded as a hex digit
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         case BLAKE:   // 8x64 stages take the caller's input directly when first
+            blake512_8way_init( &ctx.blake );
+            if ( i == 0 )
+               blake512_8way_update( &ctx.blake, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               blake512_8way_update( &ctx.blake, vhash, size );
+            }
+            blake512_8way_close( &ctx.blake, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case BMW:
+            bmw512_8way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_8way_update( &ctx.bmw, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               bmw512_8way_update( &ctx.bmw, vhash, size );
+            }
+            bmw512_8way_close( &ctx.bmw, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case GROESTL:   // lanes hashed one at a time, length passed as size<<3
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                              (const char*)in0, size<<3 );
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                              (const char*)in1, size<<3 );
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                              (const char*)in2, size<<3 );
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                              (const char*)in3, size<<3 );
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash4,
+                                              (const char*)in4, size<<3 );
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash5,
+                                              (const char*)in5, size<<3 );
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash6,
+                                              (const char*)in6, size<<3 );
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash7,
+                                              (const char*)in7, size<<3 );
+         break;
+         case SKEIN:
+            skein512_8way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_8way_update( &ctx.skein, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               skein512_8way_update( &ctx.skein, vhash, size );
+            }
+            skein512_8way_close( &ctx.skein, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case JH:
+            jh512_8way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_8way_update( &ctx.jh, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               jh512_8way_update( &ctx.jh, vhash, size );
+            }
+            jh512_8way_close( &ctx.jh, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case KECCAK:
+            keccak512_8way_init( &ctx.keccak );
+            if ( i == 0 )
+               keccak512_8way_update( &ctx.keccak, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               keccak512_8way_update( &ctx.keccak, vhash, size );
+            }
+            keccak512_8way_close( &ctx.keccak, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case LUFFA:   // 4x128 interleave: lanes 0-3 first, then lanes 4-7
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case CUBEHASH:   // length must follow size: 80 bytes when first, 64 after
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case SHAVITE:   // lanes hashed one at a time
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in0, size );
+            sph_shavite512_close( &ctx.shavite, hash0 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in1, size );
+            sph_shavite512_close( &ctx.shavite, hash1 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in2, size );
+            sph_shavite512_close( &ctx.shavite, hash2 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in3, size );
+            sph_shavite512_close( &ctx.shavite, hash3 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in4, size );
+            sph_shavite512_close( &ctx.shavite, hash4 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in5, size );
+            sph_shavite512_close( &ctx.shavite, hash5 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in6, size );
+            sph_shavite512_close( &ctx.shavite, hash6 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in7, size );
+            sph_shavite512_close( &ctx.shavite, hash7 );
+         break;
+         case SIMD:   // 4x128 interleave in two passes; length passed as size<<3
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case ECHO:   // lanes hashed one at a time, length passed as size<<3
+            init_echo( &ctx.echo, 512 );
+            update_final_echo ( &ctx.echo, (BitSequence *)hash0,
+                               (const BitSequence*)in0, size<<3 );
+            init_echo( &ctx.echo, 512 );
+            update_final_echo ( &ctx.echo, (BitSequence *)hash1,
+                               (const BitSequence*)in1, size<<3 );
+            init_echo( &ctx.echo, 512 );
+            update_final_echo ( &ctx.echo, (BitSequence *)hash2,
+                               (const BitSequence*)in2, size<<3 );
+            init_echo( &ctx.echo, 512 );
+            update_final_echo ( &ctx.echo, (BitSequence *)hash3,
+                               (const BitSequence*)in3, size<<3 );
+            init_echo( &ctx.echo, 512 );
+            update_final_echo ( &ctx.echo, (BitSequence *)hash4,
+                               (const BitSequence*)in4, size<<3 );
+            init_echo( &ctx.echo, 512 );
+            update_final_echo ( &ctx.echo, (BitSequence *)hash5,
+                               (const BitSequence*)in5, size<<3 );
+            init_echo( &ctx.echo, 512 );
+            update_final_echo ( &ctx.echo, (BitSequence *)hash6,
+                               (const BitSequence*)in6, size<<3 );
+            init_echo( &ctx.echo, 512 );
+            update_final_echo ( &ctx.echo, (BitSequence *)hash7,
+                               (const BitSequence*)in7, size<<3 );
+         break;
+         case HAMSI:   // always re-interleaves from the lane buffers, even when first
+            intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                         size<<3 );
+
+            hamsi512_8way_init( &ctx.hamsi );
+            hamsi512_8way_update( &ctx.hamsi, vhash, size );
+            hamsi512_8way_close( &ctx.hamsi, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case FUGUE:   // lanes hashed one at a time
+            sph_fugue512_init( &ctx.fugue );
+            sph_fugue512( &ctx.fugue, in0, size );
+            sph_fugue512_close( &ctx.fugue, hash0 );
+            sph_fugue512_init( &ctx.fugue );
+            sph_fugue512( &ctx.fugue, in1, size );
+            sph_fugue512_close( &ctx.fugue, hash1 );
+            sph_fugue512_init( &ctx.fugue );
+            sph_fugue512( &ctx.fugue, in2, size );
+            sph_fugue512_close( &ctx.fugue, hash2 );
+            sph_fugue512_init( &ctx.fugue );
+            sph_fugue512( &ctx.fugue, in3, size );
+            sph_fugue512_close( &ctx.fugue, hash3 );
+            sph_fugue512_init( &ctx.fugue );
+            sph_fugue512( &ctx.fugue, in4, size );
+            sph_fugue512_close( &ctx.fugue, hash4 );
+            sph_fugue512_init( &ctx.fugue );
+            sph_fugue512( &ctx.fugue, in5, size );
+            sph_fugue512_close( &ctx.fugue, hash5 );
+            sph_fugue512_init( &ctx.fugue );
+            sph_fugue512( &ctx.fugue, in6, size );
+            sph_fugue512_close( &ctx.fugue, hash6 );
+            sph_fugue512_init( &ctx.fugue );
+            sph_fugue512( &ctx.fugue, in7, size );
+            sph_fugue512_close( &ctx.fugue, hash7 );
+         break;
+         case SHABAL:   // the only stage here that uses the 8x32 interleave
+            intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                         size<<3 );
+            shabal512_8way_init( &ctx.shabal );
+            shabal512_8way_update( &ctx.shabal, vhash, size );
+            shabal512_8way_close( &ctx.shabal, vhash );
+            dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case WHIRLPOOL:   // lanes hashed one at a time
+            sph_whirlpool_init( &ctx.whirlpool );
+            sph_whirlpool( &ctx.whirlpool, in0, size );
+            sph_whirlpool_close( &ctx.whirlpool, hash0 );
+            sph_whirlpool_init( &ctx.whirlpool );
+            sph_whirlpool( &ctx.whirlpool, in1, size );
+            sph_whirlpool_close( &ctx.whirlpool, hash1 );
+            sph_whirlpool_init( &ctx.whirlpool );
+            sph_whirlpool( &ctx.whirlpool, in2, size );
+            sph_whirlpool_close( &ctx.whirlpool, hash2 );
+            sph_whirlpool_init( &ctx.whirlpool );
+            sph_whirlpool( &ctx.whirlpool, in3, size );
+            sph_whirlpool_close( &ctx.whirlpool, hash3 );
+            sph_whirlpool_init( &ctx.whirlpool );
+            sph_whirlpool( &ctx.whirlpool, in4, size );
+            sph_whirlpool_close( &ctx.whirlpool, hash4 );
+            sph_whirlpool_init( &ctx.whirlpool );
+            sph_whirlpool( &ctx.whirlpool, in5, size );
+            sph_whirlpool_close( &ctx.whirlpool, hash5 );
+            sph_whirlpool_init( &ctx.whirlpool );
+            sph_whirlpool( &ctx.whirlpool, in6, size );
+            sph_whirlpool_close( &ctx.whirlpool, hash6 );
+            sph_whirlpool_init( &ctx.whirlpool );
+            sph_whirlpool( &ctx.whirlpool, in7, size );
+            sph_whirlpool_close( &ctx.whirlpool, hash7 );
+         break;
+         case SHA_512:   // always re-interleaves from the lane buffers, even when first
+            intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                         size<<3 );
+            sha512_8way_init( &ctx.sha512 );
+            sha512_8way_update( &ctx.sha512, vhash, size );
+            sha512_8way_close( &ctx.sha512, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+      }
+      size = 64;   // every stage after the first consumes a 64-byte digest
+   }
+   // Return the first 32 bytes of each lane; char* cast avoids void* arithmetic.
+   memcpy( (char*)output,       hash0, 32 );
+   memcpy( (char*)output + 32,  hash1, 32 );
+   memcpy( (char*)output + 64,  hash2, 32 );
+   memcpy( (char*)output + 96,  hash3, 32 );
+   memcpy( (char*)output + 128, hash4, 32 );
+   memcpy( (char*)output + 160, hash5, 32 );
+   memcpy( (char*)output + 192, hash6, 32 );
+   memcpy( (char*)output + 224, hash7, 32 );
+}
+// Scan nonces 8 per pass starting at pdata[19], until last_nonce or a work restart; always returns 0.
+int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));   // lane i is read at hash + 8*i
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));   // interleaved, byte-swapped header
+   uint32_t bedata1[2] __attribute__((aligned(64)));      // byte-swapped pdata[1..2]
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   int thr_id = mythr->id;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;   // NOTE(review): Htarg above was read before this override
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   // Regenerate the hash order only when ntime differs from the per-thread cached s_ntime.
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+   // Each pass blends nonces n..n+7 into vdata, hashes all 8 lanes, then tests each lane.
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+      x16r_8way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )   // cheap filter on word 7 first
+      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
+      {
+         pdata[19] = n+i;   // nonce of the lane being submitted
+         submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+
+   *hashes_done = n - first_nonce;   // number of nonces tried in this call
+   return 0;
+}
+
+
+#elif defined (X16R_4WAY)
+
 union _x16r_4way_context_overlay
 {
    blake512_4way_context   blake;
@@ -50,16 +434,16 @@ union _x16r_4way_context_overlay
    shabal512_4way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
-};
+} __attribute__ ((aligned (64)));
 typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;

 void x16r_4way_hash( void* output, const void* input )
 {
+   uint32_t vhash[24*4] __attribute__ ((aligned (128)));
   uint32_t hash0[24] __attribute__ ((aligned (64)));
   uint32_t hash1[24] __attribute__ ((aligned (64)));
   uint32_t hash2[24] __attribute__ ((aligned (64)));
   uint32_t hash3[24] __attribute__ ((aligned (64)));
-   uint32_t vhash[24*4] __attribute__ ((aligned (64)));
   x16r_4way_context_overlay ctx;
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
@@ -86,7 +470,7 @@ void x16r_4way_hash( void* output, const void* input )
               blake512_4way( &ctx.blake, vhash, size );
            }
            blake512_4way_close( &ctx.blake, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case BMW:
            bmw512_4way_init( &ctx.bmw );
@@ -98,7 +482,7 @@ void x16r_4way_hash( void* output, const void* input )
               bmw512_4way( &ctx.bmw, vhash, size );
            }
            bmw512_4way_close( &ctx.bmw, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case GROESTL:
               init_groestl( &ctx.groestl, 64 );
@@ -124,7 +508,7 @@ void x16r_4way_hash( void* output, const void* input )
               skein512_4way( &ctx.skein, vhash, size );
            }
            skein512_4way_close( &ctx.skein, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case JH:
            jh512_4way_init( &ctx.jh );
@@ -136,7 +520,7 @@ void x16r_4way_hash( void* output, const void* input )
               jh512_4way( &ctx.jh, vhash, size );
            }
            jh512_4way_close( &ctx.jh, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case KECCAK:
            keccak512_4way_init( &ctx.keccak );
@@ -148,17 +532,17 @@ void x16r_4way_hash( void* output, const void* input )
               keccak512_4way( &ctx.keccak, vhash, size );
            }
            keccak512_4way_close( &ctx.keccak, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case LUFFA:
            intrlv_2x128( vhash, in0, in1, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
-            dintrlv_2x128( hash0, hash1, vhash, 512 );
+            dintrlv_2x128_512( hash0, hash1, vhash );
            intrlv_2x128( vhash, in2, in3, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
-            dintrlv_2x128( hash2, hash3, vhash, 512 );
+            dintrlv_2x128_512( hash2, hash3, vhash );
         break;
         case CUBEHASH:
            cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -192,11 +576,11 @@ void x16r_4way_hash( void* output, const void* input )
            intrlv_2x128( vhash, in0, in1, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128( hash0, hash1, vhash, 512 );
+            dintrlv_2x128_512( hash0, hash1, vhash );
            intrlv_2x128( vhash, in2, in3, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128( hash2, hash3, vhash, 512 );
+            dintrlv_2x128_512( hash2, hash3, vhash );
         break;
         case ECHO:
             init_echo( &ctx.echo, 512 );
@@ -217,7 +601,7 @@ void x16r_4way_hash( void* output, const void* input )
             hamsi512_4way_init( &ctx.hamsi );
             hamsi512_4way( &ctx.hamsi, vhash, size );
             hamsi512_4way_close( &ctx.hamsi, vhash );
-             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
@@ -238,7 +622,7 @@ void x16r_4way_hash( void* output, const void* input )
             shabal512_4way_init( &ctx.shabal );
             shabal512_4way( &ctx.shabal, vhash, size );
             shabal512_4way_close( &ctx.shabal, vhash );
-             dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+             dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
@@ -259,7 +643,7 @@ void x16r_4way_hash( void* output, const void* input )
             sha512_4way_init( &ctx.sha512 );
             sha512_4way( &ctx.sha512, vhash, size );
             sha512_4way_close( &ctx.sha512, vhash );
-             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
      }
      size = 64;
@@ -280,6 +664,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
   uint32_t n = first_nonce;
    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   int thr_id = mythr->id;
@@ -317,9 +702,9 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;
-   } while ( likely( ( n < max_nonce ) && !(*restart) ) );
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );

-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -34,14 +34,17 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )

 bool register_x16r_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
+  gate->scanhash  = (void*)&scanhash_x16r_8way;
+  gate->hash      = (void*)&x16r_8way_hash;
+#elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -49,14 +52,17 @@ bool register_x16r_algo( algo_gate_t* gate )

 bool register_x16rv2_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
+  gate->scanhash  = (void*)&scanhash_x16rv2_8way;
+  gate->hash      = (void*)&x16rv2_8way_hash;
+#elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_4way;
  gate->hash      = (void*)&x16rv2_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rv2;
  gate->hash      = (void*)&x16rv2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -64,14 +70,17 @@ bool register_x16rv2_algo( algo_gate_t* gate )

 bool register_x16s_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
+  gate->scanhash  = (void*)&scanhash_x16r_8way;
+  gate->hash      = (void*)&x16r_8way_hash;
+#elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -196,28 +205,34 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )

 bool register_x16rt_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
+  gate->scanhash  = (void*)&scanhash_x16rt_8way;
+  gate->hash      = (void*)&x16rt_8way_hash;
+#elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16rt_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16rt_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  opt_target_factor = 256.0;
  return true;
 };

 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
+  gate->scanhash  = (void*)&scanhash_x16rt_8way;
+  gate->hash      = (void*)&x16rt_8way_hash;
+#elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16rt_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16rt_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  gate->build_extraheader = (void*)&veil_build_extraheader;
  opt_target_factor = 256.0;
  return true;
@@ -231,7 +246,7 @@ bool register_hex_algo( algo_gate_t* gate )
 {
  gate->scanhash        = (void*)&scanhash_hex;
  gate->hash            = (void*)&hex_hash;
-  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  opt_target_factor = 128.0;
  return true;
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -6,8 +6,10 @@
 #include <stdint.h>
 #include <unistd.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X16R_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X16R_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X16R_4WAY 1
 #endif

 enum x16r_Algo {
@@ -44,7 +46,20 @@ bool register_x16rt_algo( algo_gate_t* gate );
 bool register_hex__algo( algo_gate_t* gate );
 bool register_x21s__algo( algo_gate_t* gate );

-#if defined(X16R_4WAY)
+#if defined(X16R_8WAY)
+
+void x16r_8way_hash( void *state, const void *input );
+int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+void x16rv2_8way_hash( void *state, const void *input );
+int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+void x16rt_8way_hash( void *state, const void *input );
+int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(X16R_4WAY)

 void x16r_4way_hash( void *state, const void *input );
 int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
@@ -58,12 +73,7 @@ void x16rt_4way_hash( void *state, const void *input );
 int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

-void x21s_4way_hash( void *state, const void *input );
-int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-bool x21s_4way_thread_init();
-
-#endif
+#else

 void x16r_hash( void *state, const void *input );
 int scanhash_x16r( struct work *work, uint32_t max_nonce,
@@ -77,9 +87,16 @@ void x16rt_hash( void *state, const void *input );
 int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

-void hex_hash( void *state, const void *input );
-int scanhash_hex( struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done, struct thr_info *mythr );
+#endif
+
+#if defined(X16R_4WAY)
+
+void x21s_4way_hash( void *state, const void *input );
+int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+bool x21s_4way_thread_init();
+
+#else

 void x21s_hash( void *state, const void *input );
 int scanhash_x21s( struct work *work, uint32_t max_nonce,
@@ -88,3 +105,9 @@ bool x21s_thread_init();

 #endif

+void hex_hash( void *state, const void *input );
+int scanhash_hex( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -1,7 +1,4 @@
 #include "x16r-gate.h"
-
-#if defined (X16R_4WAY)
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -15,6 +12,7 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -26,6 +24,391 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

+#if defined (X16R_8WAY)
+
+union _x16rt_8way_context_overlay   // every hash context shares one buffer
+{
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;     // used one lane per call
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;       // used twice, 4 lanes per pass
+    cube_4way_context       cube;        // used twice, 4 lanes per pass
+    sph_shavite512_context  shavite;     // used one lane per call
+    simd_4way_context       simd;        // used twice, 4 lanes per pass
+    hashState_echo          echo;        // used one lane per call
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;       // used one lane per call
+    shabal512_8way_context  shabal;      // fed 8x32 interleaved data
+    sph_whirlpool_context   whirlpool;   // used one lane per call
+    sha512_8way_context     sha512;
+} __attribute__ ((aligned (64)));
+
+typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay;
+
+// Hash 8 lanes through the 16-step chain selected by hashOrder.
+// input:  data for 8 lanes, 80 bytes each, 64-bit interleaved.
+// output: eight consecutive 32-byte results, lane 0 first.
+// Each step leaves its per-lane digest in hash0..hash7 for the next step.
+void x16rt_8way_hash( void* output, const void* input )
+{
+   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash0[24] __attribute__ ((aligned (64)));
+   uint32_t hash1[24] __attribute__ ((aligned (64)));
+   uint32_t hash2[24] __attribute__ ((aligned (64)));
+   uint32_t hash3[24] __attribute__ ((aligned (64)));
+   uint32_t hash4[24] __attribute__ ((aligned (64)));
+   uint32_t hash5[24] __attribute__ ((aligned (64)));
+   uint32_t hash6[24] __attribute__ ((aligned (64)));
+   uint32_t hash7[24] __attribute__ ((aligned (64)));
+   x16rt_8way_context_overlay ctx;
+   void *in0 = (void*) hash0;   void *in1 = (void*) hash1;
+   void *in2 = (void*) hash2;   void *in3 = (void*) hash3;
+   void *in4 = (void*) hash4;   void *in5 = (void*) hash5;
+   void *in6 = (void*) hash6;   void *in7 = (void*) hash7;
+   int size = 80;   // bytes per lane; becomes 64 after the first step
+
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 input, 640 );
+
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+      // hashOrder chars '0'-'9' map to 0-9, 'A' and above map to 10 and up.
+      switch ( algo )
+      {
+         case BLAKE:
+            blake512_8way_init( &ctx.blake );
+            if ( i == 0 )   // first step: input is already 8x64 interleaved
+               blake512_8way_update( &ctx.blake, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               blake512_8way_update( &ctx.blake, vhash, size );
+            }
+            blake512_8way_close( &ctx.blake, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case BMW:
+            bmw512_8way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_8way_update( &ctx.bmw, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               bmw512_8way_update( &ctx.bmw, vhash, size );
+            }
+            bmw512_8way_close( &ctx.bmw, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case GROESTL:   // one lane per call
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                 (const char*)in0, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                 (const char*)in1, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                 (const char*)in2, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                 (const char*)in3, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash4,
+                                                 (const char*)in4, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash5,
+                                                 (const char*)in5, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash6,
+                                                 (const char*)in6, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash7,
+                                                 (const char*)in7, size<<3 );
+         break;
+         case SKEIN:
+            skein512_8way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_8way_update( &ctx.skein, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               skein512_8way_update( &ctx.skein, vhash, size );
+            }
+            skein512_8way_close( &ctx.skein, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case JH:
+            jh512_8way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_8way_update( &ctx.jh, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               jh512_8way_update( &ctx.jh, vhash, size );
+            }
+            jh512_8way_close( &ctx.jh, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case KECCAK:
+            keccak512_8way_init( &ctx.keccak );
+            if ( i == 0 )
+               keccak512_8way_update( &ctx.keccak, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               keccak512_8way_update( &ctx.keccak, vhash, size );
+            }
+            keccak512_8way_close( &ctx.keccak, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case LUFFA:   // 4 lanes per pass, 4x128 interleaved
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case CUBEHASH:   // length must be size, not 64: first step is 80 bytes
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case SHAVITE:   // one lane per call
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in0, size );
+            sph_shavite512_close( &ctx.shavite, hash0 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in1, size );
+            sph_shavite512_close( &ctx.shavite, hash1 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in2, size );
+            sph_shavite512_close( &ctx.shavite, hash2 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in3, size );
+            sph_shavite512_close( &ctx.shavite, hash3 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in4, size );
+            sph_shavite512_close( &ctx.shavite, hash4 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in5, size );
+            sph_shavite512_close( &ctx.shavite, hash5 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in6, size );
+            sph_shavite512_close( &ctx.shavite, hash6 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in7, size );
+            sph_shavite512_close( &ctx.shavite, hash7 );
+         break;
+         case SIMD:   // 4 lanes per pass; this API takes the length as size<<3
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case ECHO:   // one lane per call
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
+                                (const BitSequence*)in0, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
+                                (const BitSequence*)in1, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
+                                (const BitSequence*)in2, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
+                                (const BitSequence*)in3, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
+                                (const BitSequence*)in4, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
+                                (const BitSequence*)in5, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
+                                (const BitSequence*)in6, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
+                                (const BitSequence*)in7, size<<3 );
+         break;
+         case HAMSI:
+             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+
+             hamsi512_8way_init( &ctx.hamsi );
+             hamsi512_8way_update( &ctx.hamsi, vhash, size );
+             hamsi512_8way_close( &ctx.hamsi, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+             break;
+         case FUGUE:   // one lane per call
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in0, size );
+             sph_fugue512_close( &ctx.fugue, hash0 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in1, size );
+             sph_fugue512_close( &ctx.fugue, hash1 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in2, size );
+             sph_fugue512_close( &ctx.fugue, hash2 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in3, size );
+             sph_fugue512_close( &ctx.fugue, hash3 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in4, size );
+             sph_fugue512_close( &ctx.fugue, hash4 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in5, size );
+             sph_fugue512_close( &ctx.fugue, hash5 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in6, size );
+             sph_fugue512_close( &ctx.fugue, hash6 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in7, size );
+             sph_fugue512_close( &ctx.fugue, hash7 );
+         break;
+         case SHABAL:   // 8 lanes, 32-bit interleave
+             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                          size<<3 );
+             shabal512_8way_init( &ctx.shabal );
+             shabal512_8way_update( &ctx.shabal, vhash, size );
+             shabal512_8way_close( &ctx.shabal, vhash );
+             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case WHIRLPOOL:   // one lane per call
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in0, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash0 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in1, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash1 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in2, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash2 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in3, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash3 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in4, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash4 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in5, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash5 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in6, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash6 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in7, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash7 );
+         break;
+         case SHA_512:
+             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+             sha512_8way_init( &ctx.sha512 );
+             sha512_8way_update( &ctx.sha512, vhash, size );
+             sha512_8way_close( &ctx.sha512, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+      }
+      size = 64;
+   }
+   // Only the first 32 bytes of each lane's final digest are returned.
+   memcpy( output,     hash0, 32 );
+   memcpy( output+32,  hash1, 32 );
+   memcpy( output+64,  hash2, 32 );
+   memcpy( output+96,  hash3, 32 );
+   memcpy( output+128, hash4, 32 );
+   memcpy( output+160, hash5, 32 );
+   memcpy( output+192, hash6, 32 );
+   memcpy( output+224, hash7, 32 );
+}
+
+// Scan nonces 8 at a time for x16rt. The hash order is recomputed whenever
+// the work's ntime differs from the one cached in s_ntime. Lanes that pass
+// the target test are submitted; *hashes_done gets the count tried. Returns 0.
+int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) timeHash[8*8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   int thr_id = mythr->id;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+   uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16rt_getTimeHash( ntime, &timeHash );
+      x16rt_getAlgoString( &timeHash[0], hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )   // log from thread 0 only
+          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
+                               hashOrder, ntime, timeHash[0] );  // word, not ptr
+   }
+
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+      x16rt_8way_hash( hash, vdata );
+      pdata[19] = n;
+      for ( int i = 0; i < 8; i++ )   // lane i's result starts at hash+(i<<3)
+      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
+      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
+      {
+         pdata[19] = n+i;
+         submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (X16R_4WAY)
+
 union _x16rt_4way_context_overlay
 {
    blake512_4way_context   blake;
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -5,9 +5,6 @@
 * Optimized by JayDDee@github Jan 2018
 */
 #include "x16r-gate.h"
-
-#if defined (X16R_4WAY)
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -21,6 +18,7 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -33,6 +31,477 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

+#if defined (X16R_8WAY)
+
+union _x16rv2_8way_context_overlay   // every hash context shares one buffer
+{
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;     // used one lane per call
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;       // used twice, 4 lanes per pass
+    cube_4way_context       cube;        // used twice, 4 lanes per pass
+    sph_shavite512_context  shavite;     // used one lane per call
+    simd_4way_context       simd;        // used twice, 4 lanes per pass
+    hashState_echo          echo;        // used one lane per call
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;       // used one lane per call
+    shabal512_8way_context  shabal;      // fed 8x32 interleaved data
+    sph_whirlpool_context   whirlpool;   // used one lane per call
+    sha512_8way_context     sha512;
+    sph_tiger_context       tiger;       // run before keccak and luffa steps
+} __attribute__ ((aligned (64)));
+
+typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
+
+void x16rv2_8way_hash( void* output, const void* input )
+{
+   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash0[24] __attribute__ ((aligned (64)));
+   uint32_t hash1[24] __attribute__ ((aligned (64)));
+   uint32_t hash2[24] __attribute__ ((aligned (64)));
+   uint32_t hash3[24] __attribute__ ((aligned (64)));
+   uint32_t hash4[24] __attribute__ ((aligned (64)));
+   uint32_t hash5[24] __attribute__ ((aligned (64)));
+   uint32_t hash6[24] __attribute__ ((aligned (64)));
+   uint32_t hash7[24] __attribute__ ((aligned (64)));
+   x16rv2_8way_context_overlay ctx;
+   void *in0 = (void*) hash0;
+   void *in1 = (void*) hash1;
+   void *in2 = (void*) hash2;
+   void *in3 = (void*) hash3;
+   void *in4 = (void*) hash4;
+   void *in5 = (void*) hash5;
+   void *in6 = (void*) hash6;
+   void *in7 = (void*) hash7;
+   int size = 80;
+
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 input, 640 );
+
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         case BLAKE:
+            blake512_8way_init( &ctx.blake );
+            if ( i == 0 )
+               blake512_8way_update( &ctx.blake, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               blake512_8way_update( &ctx.blake, vhash, size );
+            }
+            blake512_8way_close( &ctx.blake, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case BMW:
+            bmw512_8way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_8way_update( &ctx.bmw, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+            bmw512_8way_update( &ctx.bmw, vhash, size );
+            }
+            bmw512_8way_close( &ctx.bmw, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case GROESTL:
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                 (const char*)in0, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                 (const char*)in1, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                 (const char*)in2, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                 (const char*)in3, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash4,
+                                                 (const char*)in4, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash5,
+                                                 (const char*)in5, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash6,
+                                                 (const char*)in6, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash7,
+                                                 (const char*)in7, size<<3 );
+         break;
+         case SKEIN:
+            skein512_8way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_8way_update( &ctx.skein, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               skein512_8way_update( &ctx.skein, vhash, size );
+            }
+            skein512_8way_close( &ctx.skein, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case JH:
+            jh512_8way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_8way_update( &ctx.jh, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               jh512_8way_update( &ctx.jh, vhash, size );
+            }
+            jh512_8way_close( &ctx.jh, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case KECCAK:
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in0, size );
+             sph_tiger_close( &ctx.tiger, hash0 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in1, size );
+             sph_tiger_close( &ctx.tiger, hash1 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in2, size );
+             sph_tiger_close( &ctx.tiger, hash2 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in3, size );
+             sph_tiger_close( &ctx.tiger, hash3 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in4, size );
+             sph_tiger_close( &ctx.tiger, hash4 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in5, size );
+             sph_tiger_close( &ctx.tiger, hash5 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in6, size );
+             sph_tiger_close( &ctx.tiger, hash6 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in7, size );
+             sph_tiger_close( &ctx.tiger, hash7 );
+
+             for ( int i = (24/4); i < (64/4); i++ )
+                hash0[i] = hash1[i] = hash2[i] = hash3[i] =
+                hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
+
+             intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                          hash6, hash7 );
+             keccak512_8way_init( &ctx.keccak );
+             keccak512_8way_update( &ctx.keccak, vhash, 64 );
+             keccak512_8way_close( &ctx.keccak, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case LUFFA:
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in0, size );
+             sph_tiger_close( &ctx.tiger, hash0 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in1, size );
+             sph_tiger_close( &ctx.tiger, hash1 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in2, size );
+             sph_tiger_close( &ctx.tiger, hash2 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in3, size );
+             sph_tiger_close( &ctx.tiger, hash3 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in4, size );
+             sph_tiger_close( &ctx.tiger, hash4 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in5, size );
+             sph_tiger_close( &ctx.tiger, hash5 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in6, size );
+             sph_tiger_close( &ctx.tiger, hash6 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in7, size );
+             sph_tiger_close( &ctx.tiger, hash7 );
+
+             for ( int i = (24/4); i < (64/4); i++ )
+                hash0[i] = hash1[i] = hash2[i] = hash3[i] = 
+                hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
+
+            intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3);
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7);
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case CUBEHASH:
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case SHAVITE:
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in0, size );
+            sph_shavite512_close( &ctx.shavite, hash0 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in1, size );
+            sph_shavite512_close( &ctx.shavite, hash1 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in2, size );
+            sph_shavite512_close( &ctx.shavite, hash2 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in3, size );
+            sph_shavite512_close( &ctx.shavite, hash3 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in4, size );
+            sph_shavite512_close( &ctx.shavite, hash4 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in5, size );
+            sph_shavite512_close( &ctx.shavite, hash5 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in6, size );
+            sph_shavite512_close( &ctx.shavite, hash6 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in7, size );
+            sph_shavite512_close( &ctx.shavite, hash7 );
+         break;
+         case SIMD:
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case ECHO:
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
+                                (const BitSequence*)in0, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
+                                (const BitSequence*)in1, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
+                                (const BitSequence*)in2, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
+                                (const BitSequence*)in3, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
+                                (const BitSequence*)in4, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
+                                (const BitSequence*)in5, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
+                                (const BitSequence*)in6, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
+                                (const BitSequence*)in7, size<<3 );
+         break;
+         case HAMSI:
+             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+
+             hamsi512_8way_init( &ctx.hamsi );
+             hamsi512_8way_update( &ctx.hamsi, vhash, size );
+             hamsi512_8way_close( &ctx.hamsi, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+             break;
+         case FUGUE:
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in0, size );
+             sph_fugue512_close( &ctx.fugue, hash0 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in1, size );
+             sph_fugue512_close( &ctx.fugue, hash1 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in2, size );
+             sph_fugue512_close( &ctx.fugue, hash2 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in3, size );
+             sph_fugue512_close( &ctx.fugue, hash3 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in4, size );
+             sph_fugue512_close( &ctx.fugue, hash4 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in5, size );
+             sph_fugue512_close( &ctx.fugue, hash5 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in6, size );
+             sph_fugue512_close( &ctx.fugue, hash6 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in7, size );
+             sph_fugue512_close( &ctx.fugue, hash7 );
+         break;
+         case SHABAL:
+             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                          size<<3 );
+             shabal512_8way_init( &ctx.shabal );
+             shabal512_8way_update( &ctx.shabal, vhash, size );
+             shabal512_8way_close( &ctx.shabal, vhash );
+             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case WHIRLPOOL:
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in0, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash0 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in1, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash1 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in2, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash2 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in3, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash3 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in4, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash4 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in5, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash5 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in6, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash6 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in7, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash7 );
+         break;
+         case SHA_512:
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in0, size );
+             sph_tiger_close( &ctx.tiger, hash0 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in1, size );
+             sph_tiger_close( &ctx.tiger, hash1 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in2, size );
+             sph_tiger_close( &ctx.tiger, hash2 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in3, size );
+             sph_tiger_close( &ctx.tiger, hash3 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in4, size );
+             sph_tiger_close( &ctx.tiger, hash4 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in5, size );
+             sph_tiger_close( &ctx.tiger, hash5 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in6, size );
+             sph_tiger_close( &ctx.tiger, hash6 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in7, size );
+             sph_tiger_close( &ctx.tiger, hash7 );
+
+             for ( int i = (24/4); i < (64/4); i++ )
+                hash0[i] = hash1[i] = hash2[i] = hash3[i] =
+                hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
+
+             intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                          hash6, hash7 );
+             sha512_8way_init( &ctx.sha512 );
+             sha512_8way_update( &ctx.sha512, vhash, 64 );
+             sha512_8way_close( &ctx.sha512, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+      }
+      size = 64;
+   }
+
+   memcpy( output,     hash0, 32 );
+   memcpy( output+32,  hash1, 32 );
+   memcpy( output+64,  hash2, 32 );
+   memcpy( output+96,  hash3, 32 );
+   memcpy( output+128, hash4, 32 );
+   memcpy( output+160, hash5, 32 );
+   memcpy( output+192, hash6, 32 );
+   memcpy( output+224, hash7, 32 );
+}
+
+// Scan nonces 8 per pass from work->data[19]; returns 0, count in *hashes_done.
+int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   uint32_t n = first_nonce;
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   int thr_id = mythr->id;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   // Set the benchmark target before caching Htarg so the pre-filter sees it.
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+   const uint32_t Htarg = ptarget[7];
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   // Rebuild the hash order string only when pdata[17] (ntime) changes.
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+      x16rv2_8way_hash( hash, vdata );
+      pdata[19] = n;
+      // Each lane's 256-bit result occupies 8 consecutive words of hash.
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
+      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
+      {
+         pdata[19] = n+i;
+         submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+
+#elif defined (X16R_4WAY)
+
+
+
 union _x16rv2_4way_context_overlay
 {
    blake512_4way_context   blake;
--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
--- a/algo/x17/sonoa-gate.c
+++ b/algo/x17/sonoa-gate.c
@@ -2,8 +2,10 @@

 bool register_sonoa_algo( algo_gate_t* gate )
 {
-#if defined (SONOA_4WAY)
-//  init_sonoa_4way_ctx();
+#if defined (SONOA_8WAY)   // set in sonoa-gate.h when the AVX512 macros are defined
+  gate->scanhash  = (void*)&scanhash_sonoa_8way;
+  gate->hash      = (void*)&sonoa_8way_hash;
+#elif defined (SONOA_4WAY)   // set in sonoa-gate.h for AVX2 with AES
  gate->scanhash  = (void*)&scanhash_sonoa_4way;
  gate->hash      = (void*)&sonoa_4way_hash;
 #else
@@ -11,7 +13,7 @@ bool register_sonoa_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_sonoa;
  gate->hash      = (void*)&sonoa_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;   // 8-way path added
  return true;
 };

--- a/algo/x17/sonoa-gate.h
+++ b/algo/x17/sonoa-gate.h
@@ -4,29 +4,33 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define SONOA_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define SONOA_8WAY 1   // all four AVX512 subsets above are required
+#elif defined(__AVX2__) && defined(__AES__)
+  #define SONOA_4WAY 1   // only chosen when the 8-way test above fails
 #endif

 bool register_sonoa_algo( algo_gate_t* gate );

-#if defined(SONOA_4WAY)
+#if defined(SONOA_8WAY)   // exactly one of the three prototype sets below is declared
+
+void sonoa_8way_hash( void *state, const void *input );
+int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(SONOA_4WAY)

 void sonoa_4way_hash( void *state, const void *input );
-
 int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

-//void init_sonoa_4way_ctx();
-
-#endif
+#else   // neither SONOA_8WAY nor SONOA_4WAY

 void sonoa_hash( void *state, const void *input );
-
 int scanhash_sonoa( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_sonoa_ctx();

 #endif

+#endif   // NOTE(review): presumably closes the include guard opened above this hunk - confirm
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -1,7 +1,4 @@
 #include "x17-gate.h"
-
-#if defined(X17_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -14,6 +11,7 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
+#include "algo/shavite/sph_shavite.h"
 #include "algo/shavite/shavite-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -24,6 +22,309 @@
 #include "algo/haval/haval-hash-4way.h"
 #include "algo/sha/sha-hash-4way.h"

+#if defined(X17_8WAY)
+// Scratch contexts for x17_8way_hash; being a union, all members share one storage area.
+union _x17_8way_context_overlay
+{
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;    // used once per lane
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;      // used twice, 4 lanes each
+    cube_4way_context       cube;       // used twice, 4 lanes each
+    sph_shavite512_context  shavite;    // used once per lane
+    simd_4way_context       simd;       // used twice, 4 lanes each
+    hashState_echo          echo;       // used once per lane
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;      // used once per lane
+    shabal512_8way_context  shabal;
+    sph_whirlpool_context   whirlpool;  // used once per lane
+    sha512_8way_context     sha512;
+    haval256_5_8way_context haval;
+} __attribute__ ((aligned (64)));
+typedef union _x17_8way_context_overlay x17_8way_context_overlay;
+// Hash 8 lanes through the 17 x17 stages; state receives 8x32 interleaved 256-bit results.
+void x17_8way_hash( void *state, const void *input )
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t vhash0[8*8] __attribute__ ((aligned (64)));
+     uint64_t vhash1[8*8] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     x17_8way_context_overlay ctx;
+
+     // 1 Blake parallel 8 way 64 bit, 80 bytes of input per lane
+     blake512_8way_init( &ctx.blake );
+     blake512_8way_update( &ctx.blake, input, 80 );
+     blake512_8way_close( &ctx.blake, vhash );
+
+     // 2 Bmw parallel 8 way 64 bit
+     bmw512_8way_init( &ctx.bmw );
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serialize: split the interleaved buffer into one buffer per lane
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     // 3 Groestl serial, in place on each lane
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // Parallelize: re-interleave the eight lanes
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     // 4 Skein parallel 8 way 64 bit
+     skein512_8way_init( &ctx.skein );
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     // 5 JH parallel 8 way 64 bit
+     jh512_8way_init( &ctx.jh );
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     // 6 Keccak parallel 8 way 64 bit
+     keccak512_8way_init( &ctx.keccak );
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
+
+     // 7 Luffa 4 way, one pass per half (vhash0, then vhash1)
+     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     // 8 Cubehash 4 way, one pass per half
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+
+     // 9 Shavite serial, in place on each lane
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     // 10 Simd 4 way, lanes 0-3 then lanes 4-7, reusing vhash for both
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+
+     // 11 Echo serial, in place on each lane
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                            (const BitSequence *) hash0, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                            (const BitSequence *) hash1, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                            (const BitSequence *) hash2, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                            (const BitSequence *) hash3, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                            (const BitSequence *) hash4, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                            (const BitSequence *) hash5, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                            (const BitSequence *) hash6, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                            (const BitSequence *) hash7, 512 );
+
+     // 12 Hamsi parallel 8 way 64 bit
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );
+
+     hamsi512_8way_init( &ctx.hamsi );
+     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
+     hamsi512_8way_close( &ctx.hamsi, vhash );
+
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                       vhash );
+
+     // 13 Fugue serial, in place on each lane
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+
+     // 14 Shabal parallel 8 way 32 bit
+     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );
+
+     shabal512_8way_init( &ctx.shabal );
+     shabal512_8way_update( &ctx.shabal, vhash, 64 );
+     shabal512_8way_close( &ctx.shabal, vhash );
+
+     dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                       vhash );
+
+     // 15 Whirlpool serial, in place on each lane
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash0 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash1 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash2 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash3 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash4, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash4 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash5, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash5 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash6, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash6 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash7, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash7 );
+
+     // 16 SHA512 parallel 8 way 64 bit
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );
+
+     sha512_8way_init( &ctx.sha512 );
+     sha512_8way_update( &ctx.sha512, vhash, 64 );
+     sha512_8way_close( &ctx.sha512, vhash );
+
+     // 17 Haval parallel 8 way 32 bit, closed directly into state
+     rintrlv_8x64_8x32( vhash0, vhash,  512 );
+
+     haval256_5_8way_init( &ctx.haval );
+     haval256_5_8way_update( &ctx.haval, vhash0, 64 );
+     haval256_5_8way_close( &ctx.haval, state );
+}
+
+// Scan nonces eight at a time with x17_8way_hash, starting at work->data[19].
+// Always returns 0; the count of nonces covered is stored in *hashes_done.
+int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vhash_out[8*16] __attribute__ ((aligned (128)));
+   uint32_t vheader[24*8] __attribute__ ((aligned (64)));
+   uint32_t candidate[8] __attribute__ ((aligned (64)));
+   uint32_t *const header = work->data;
+   const uint32_t *const target = work->target;
+   const uint32_t *const word7 = vhash_out + ( 7 << 3 );
+   __m512i *const nonce_vec = (__m512i*)vheader + 9;   // aligned
+   const uint32_t start = header[19];
+   const uint32_t stop = max_nonce - 8;
+   const uint32_t cutoff = target[7];
+   const int tid = mythr->id;
+   uint32_t nonce = start;
+
+   mm512_bswap32_intrlv80_8x64( vheader, header );
+   for ( ;; )
+   {
+      *nonce_vec = mm512_intrlv_blend_32( mm512_bswap_32( _mm512_set_epi32(
+            nonce+7, 0, nonce+6, 0, nonce+5, 0, nonce+4, 0,
+            nonce+3, 0, nonce+2, 0, nonce+1, 0, nonce,   0 ) ), *nonce_vec );
+      x17_8way_hash( vhash_out, vheader );
+      for ( int l = 0; l < 8; l++ )
+      {
+         // Quick reject on the lane's word 7 before the full comparison.
+         if ( likely( word7[ l ] > cutoff ) ) continue;
+         extr_lane_8x32( candidate, vhash_out, l, 256 );
+         if ( !fulltest( candidate, target ) || opt_benchmark ) continue;
+         header[19] = nonce + l;
+         submit_lane_solution( work, candidate, mythr, l );
+      }
+      nonce += 8;
+      if ( unlikely( nonce >= stop || work_restart[tid].restart ) ) break;
+   }
+   *hashes_done = nonce - start;
+   return 0;
+}
+
+#elif defined(X17_4WAY)
+
 union _x17_4way_context_overlay
 {
    blake512_4way_context   blake;
@@ -127,6 +428,7 @@ void x17_4way_hash( void *state, const void *input )
     dintrlv_2x128_512( hash0, hash1, vhashA );
     dintrlv_2x128_512( hash2, hash3, vhashB );

+
     // 11 Echo serial
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
--- a/algo/x17/x17-gate.c
+++ b/algo/x17/x17-gate.c
@@ -2,14 +2,17 @@

 bool register_x17_algo( algo_gate_t* gate )
 {
-#if defined (X17_4WAY)
+#if defined (X17_8WAY)   // checked first, so the 8-way pair wins over 4-way and the plain fallback
+  gate->scanhash  = (void*)&scanhash_x17_8way;
+  gate->hash      = (void*)&x17_8way_hash;
+#elif defined (X17_4WAY)
  gate->scanhash  = (void*)&scanhash_x17_4way;
  gate->hash      = (void*)&x17_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x17;
  gate->hash      = (void*)&x17_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	c65b0ff7a6	v3.10.5	2019-12-21 13:19:29 -05:00
Jay D Dee	a17ff6f189	v3.10.2	2019-12-09 15:59:02 -05:00
Jay D Dee	73430b13b1	v3.10.1	2019-12-05 19:09:23 -05:00