From c65b0ff7a6d982b5faca465e01f27ac4af4901e2 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Sat, 21 Dec 2019 13:19:29 -0500 Subject: [PATCH] v3.10.5 --- INSTALL_LINUX | 71 +- Makefile.am | 2 + RELEASE_NOTES | 18 +- algo/argon2/argon2d/blake2/blamka-round-opt.h | 8 +- algo/blake/blake-hash-4way.h | 17 +- algo/blake/blake256-hash-4way.c | 14 +- algo/bmw/bmw256-hash-4way.c | 52 + algo/cubehash/cube-hash-2way.c | 32 +- algo/cubehash/cubehash_sse2.c | 8 +- algo/hamsi/hamsi-hash-4way.c | 389 ++++- algo/hamsi/hamsi-hash-4way.h | 25 +- algo/haval/haval-8way-helper.c | 115 ++ algo/haval/haval-hash-4way.c | 297 +++- algo/haval/haval-hash-4way.h | 24 +- algo/lyra2/lyra2-gate.c | 28 +- algo/lyra2/lyra2-gate.h | 36 +- algo/lyra2/lyra2-hash-2way.c | 482 +++--- algo/lyra2/lyra2.c | 1 - algo/lyra2/lyra2.h | 11 + algo/lyra2/lyra2rev2-4way.c | 143 +- algo/lyra2/lyra2rev3-4way.c | 174 ++- algo/lyra2/sponge-2way.c | 178 ++- algo/lyra2/sponge.c | 56 +- algo/lyra2/sponge.h | 54 +- algo/quark/hmq1725-4way.c | 1081 +++++++++++-- algo/quark/hmq1725-gate.c | 7 +- algo/quark/hmq1725-gate.h | 14 +- algo/quark/hmq1725.c | 6 + algo/quark/quark-4way.c | 14 - algo/qubit/qubit-2way.c | 1 - algo/sha/sha-hash-4way.h | 40 +- algo/sha/sha512-hash-4way.c | 234 ++- algo/shabal/shabal-hash-4way.c | 646 +++++++- algo/shabal/shabal-hash-4way.h | 35 +- algo/shavite/shavite-hash-2way.c | 58 +- algo/x11/c11-4way.c | 23 +- algo/x11/x11-4way.c | 22 +- algo/x12/x12-4way.c | 282 +++- algo/x12/x12-gate.c | 8 +- algo/x12/x12-gate.h | 23 +- algo/x12/x12.c | 146 +- algo/x13/x13-4way.c | 261 +++- algo/x13/x13-gate.c | 8 +- algo/x13/x13-gate.h | 22 +- algo/x14/x14-4way.c | 329 +++- algo/x14/x14-gate.c | 8 +- algo/x14/x14-gate.h | 18 +- algo/x15/x15-4way.c | 366 ++++- algo/x15/x15-gate.c | 8 +- algo/x15/x15-gate.h | 20 +- algo/x16/x16r-4way.c | 423 +++++- algo/x16/x16r-gate.c | 37 +- algo/x16/x16r-gate.h | 47 +- algo/x16/x16rt-4way.c | 389 ++++- algo/x16/x16rv2-4way.c | 475 +++++- algo/x17/sonoa-4way.c | 1335 ++++++++++++++++- algo/x17/sonoa-gate.c | 8 +- algo/x17/sonoa-gate.h | 22 +- algo/x17/x17-4way.c | 307 +++- algo/x17/x17-gate.c | 7 +- algo/x17/x17-gate.h | 13 +- algo/x17/xevan-4way.c | 513 ++++++- algo/x17/xevan-gate.c | 6 +- algo/x17/xevan-gate.h | 17 +- build-allarch.sh | 2 + configure | 20 +- configure.ac | 2 +- cpu-miner.c | 50 +- simd-utils/intrlv.h | 452 +++++- simd-utils/simd-128.h | 90 +- simd-utils/simd-256.h | 116 +- simd-utils/simd-512.h | 180 +-- 72 files changed, 9090 insertions(+), 1336 deletions(-) create mode 100644 algo/haval/haval-8way-helper.c diff --git a/INSTALL_LINUX b/INSTALL_LINUX index e2a0953..a88f888 100644 --- a/INSTALL_LINUX +++ b/INSTALL_LINUX @@ -1,12 +1,14 @@ -Requirements: +1. Requirements: +--------------- Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not supported. 64 bit Linux operating system. Apple is not supported. -Building on linux prerequisites: +2. Building on linux prerequisites: +----------------------------------- It is assumed users know how to install packages on their system and be able to compile standard source packages. This is basic Linux and @@ -20,41 +22,74 @@ http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu Install any additional dependencies needed by cpuminer-opt. The list below are some of the ones that may not be in the default install and need to -be installed manually. There may be others, read the error messages they -will give a clue as to the missing package. +be installed manually. 
There may be others; read the compiler error messages,
+they will give a clue as to the missing package.
 
 The following command should install everything you need on Debian based
 distributions such as Ubuntu. Fedora and other distributions may have similar
-but different package names.
+but different package names. 
 
-sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev
+$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
 
 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
-openssl 1.1.0e or higher. Add one of the following, depending on the
-compiler version, to CFLAGS:
-"-march=native" or "-march=znver1" or "-msha".
+openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
+support depending on your CPU and compiler version:
+
+"-march=native" is always the best choice.
+
+"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
+
+"-msha" to add SHA to other tuning options.
 
 Additional instructions for static compilalation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
 Static builds should only considered in a homogeneous HW and SW environment.
 Local builds will always have the best performance and compatibility.
 
-Extract cpuminer source.
+3. Download cpuminer-opt
+------------------------
 
-tar xvzf cpuminer-opt-x.y.z.tar.gz
-cd cpuminer-opt-x.y.z
+Download the source code for the latest release from the official repository.
 
-Run ./build.sh to build on Linux or execute the following commands.
+https://github.com/JayDDee/cpuminer-opt/releases
 
-./autogen.sh
-CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
-make
+Extract the source code.
 
-Start mining.
+$ tar xvzf cpuminer-opt-x.y.z.tar.gz
+
+
+Alternatively it can be cloned from git.
+
+$ git clone https://github.com/JayDDee/cpuminer-opt.git
+
+4. Build cpuminer-opt
+---------------------
+
+It is recommended to build with default options, as this will usually
+produce the best results.
+
+$ ./build.sh
+
+or
+
+$ ./autogen.sh
+$ CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
+$ make -j n
+
+where n is the number of threads.
+
+5. Start mining.
+----------------
+
+$ ./cpuminer -a algo -o url -u username -p password
 
-./cpuminer -a algo -o url -u username -p password
 Windows
+-------
+
+See also INSTALL_WINDOWS.
+
+The following procedure is obsolete and uses an old compiler.
 
 Precompiled Windows binaries are built on a Linux host using Mingw
 with a more recent compiler than the following Windows hosted procedure.
diff --git a/Makefile.am b/Makefile.am
index a2ba0fc..ee8990d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -124,6 +124,8 @@ cpuminer_SOURCES = \
   algo/luffa/luffa-hash-2way.c \
   algo/lyra2/lyra2.c \
   algo/lyra2/sponge.c \
+  algo/lyra2/sponge-2way.c \
+  algo/lyra2/lyra2-hash-2way.c \
   algo/lyra2/lyra2-gate.c \
   algo/lyra2/lyra2rev2.c \
   algo/lyra2/lyra2rev2-4way.c \
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index e3c857b..8caedc5 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,6 +1,8 @@
 cpuminer-opt is a console program run from the command line using the
 keyboard, not the mouse.
 
+See also README.md for a list of supported algorithms.
+
 Security warning
 ----------------
 
@@ -31,7 +33,21 @@ not supported. FreeBSD YMMV.
 Change Log
 ----------
 
-v3.10.2
+v3.10.5
+
+AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2.
+Faster hmq1725 AVX2.
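+
+The new AVX512 code paths listed above are selected at compile time; a
+minimal sketch of the feature guard used throughout the modified sources
+(the comments are illustrative placeholders, not new code in this patch):
+
+  #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+     // 16-way / 8-way 512-bit implementation
+  #elif defined(__AVX2__)
+     // 8-way / 4-way 256-bit implementation
+  #else
+     // SSE2 or scalar fallback
+  #endif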
+ +v3.10.4 + +AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil). + +v3.10.3 + +AVX512 for x12, x13, x14, x15. +Fixed x12 AVX2 invalid shares. + +v.10.2 AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib). Fixed c11 AVX2 invalid shares. diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h index 37d99d8..8156331 100644 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h @@ -184,10 +184,10 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { #include -#define rotr32 mm256_swap32_64 -#define rotr24 mm256_ror3x8_64 -#define rotr16 mm256_ror1x16_64 -#define rotr63( x ) mm256_rol_64( x, 1 ) +#define rotr32( x ) mm256_ror_64( x, 32 ) +#define rotr24( x ) mm256_ror_64( x, 24 ) +#define rotr16( x ) mm256_ror_64( x, 16 ) +#define rotr63( x ) mm256_rol_64( x, 1 ) //#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)) //#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index 2cf9a47..9f389f6 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -70,19 +70,22 @@ typedef struct { // Default 14 rounds typedef blake_4way_small_context blake256_4way_context; void blake256_4way_init(void *ctx); -void blake256_4way(void *ctx, const void *data, size_t len); +void blake256_4way_update(void *ctx, const void *data, size_t len); +#define blake256_4way blake256_4way_update void blake256_4way_close(void *ctx, void *dst); // 14 rounds, blake, decred typedef blake_4way_small_context blake256r14_4way_context; void blake256r14_4way_init(void *cc); -void blake256r14_4way(void *cc, const void *data, size_t len); +void blake256r14_4way_update(void *cc, const void *data, size_t len); +#define blake256r14_4way blake256r14_4way_update void blake256r14_4way_close(void *cc, void *dst); // 8 rounds, blakecoin, vanilla typedef blake_4way_small_context blake256r8_4way_context; void blake256r8_4way_init(void *cc); -void blake256r8_4way(void *cc, const void *data, size_t len); +void blake256r8_4way_update(void *cc, const void *data, size_t len); +#define blake256r8_4way blake256r8_4way_update void blake256r8_4way_close(void *cc, void *dst); #ifdef __AVX2__ @@ -100,19 +103,21 @@ typedef struct { // Default 14 rounds typedef blake_8way_small_context blake256_8way_context; void blake256_8way_init(void *cc); -void blake256_8way(void *cc, const void *data, size_t len); +void blake256_8way_update(void *cc, const void *data, size_t len); +#define blake256_8way blake256_8way_update void blake256_8way_close(void *cc, void *dst); // 14 rounds, blake, decred typedef blake_8way_small_context blake256r14_8way_context; void blake256r14_8way_init(void *cc); -void blake256r14_8way(void *cc, const void *data, size_t len); +void blake256r14_8way_update(void *cc, const void *data, size_t len); void blake256r14_8way_close(void *cc, void *dst); // 8 rounds, blakecoin, vanilla typedef blake_8way_small_context blake256r8_8way_context; void blake256r8_8way_init(void *cc); -void blake256r8_8way(void *cc, const void *data, size_t len); +void blake256r8_8way_update(void *cc, const void *data, size_t len); +#define blake256r8_8way blake256r8_8way_update void blake256r8_8way_close(void *cc, void *dst); // Blake-512 4 way diff --git a/algo/blake/blake256-hash-4way.c 
b/algo/blake/blake256-hash-4way.c index 87592bc..f958659 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -634,7 +634,7 @@ do { \ m256_const1_64( 0x082EFA98082EFA98 ) ); \ VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \ m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \ - shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \ + shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \ 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \ M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \ @@ -1184,7 +1184,7 @@ blake256_16way_update(void *cc, const void *data, size_t len) } void -blake256_16way_close_update(void *cc, void *dst) +blake256_16way_close(void *cc, void *dst) { blake32_16way_close(cc, 0, 0, dst, 8); } @@ -1259,7 +1259,7 @@ blake256_8way_init(void *cc) } void -blake256_8way(void *cc, const void *data, size_t len) +blake256_8way_update(void *cc, const void *data, size_t len) { blake32_8way(cc, data, len); } @@ -1279,7 +1279,7 @@ void blake256r14_4way_init(void *cc) } void -blake256r14_4way(void *cc, const void *data, size_t len) +blake256r14_4way_update(void *cc, const void *data, size_t len) { blake32_4way(cc, data, len); } @@ -1298,7 +1298,7 @@ void blake256r14_8way_init(void *cc) } void -blake256r14_8way(void *cc, const void *data, size_t len) +blake256r14_8way_update(void *cc, const void *data, size_t len) { blake32_8way(cc, data, len); } @@ -1318,7 +1318,7 @@ void blake256r8_4way_init(void *cc) } void -blake256r8_4way(void *cc, const void *data, size_t len) +blake256r8_4way_update(void *cc, const void *data, size_t len) { blake32_4way(cc, data, len); } @@ -1337,7 +1337,7 @@ void blake256r8_8way_init(void *cc) } void -blake256r8_8way(void *cc, const void *data, size_t len) +blake256r8_8way_update(void *cc, const void *data, size_t len) { blake32_8way(cc, data, len); } diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index a5a2e77..92e7183 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -874,6 +874,57 @@ void compress_small_8way( const __m256i *M, const __m256i H[16], mm256_xor4( qt[24], qt[25], qt[26], qt[27] ), mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); +#define DH1L( m, sl, sr, a, b, c ) \ + _mm256_add_epi32( \ + _mm256_xor_si256( M[m], \ + _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \ + _mm256_srli_epi32( qt[a], sr ) ) ), \ + _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) ) + +#define DH1R( m, sl, sr, a, b, c ) \ + _mm256_add_epi32( \ + _mm256_xor_si256( M[m], \ + _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \ + _mm256_slli_epi32( qt[a], sr ) ) ), \ + _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) ) + +#define DH2L( m, rl, sl, h, a, b, c ) \ + _mm256_add_epi32( _mm256_add_epi32( \ + mm256_rol_32( dH[h], rl ), \ + _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \ + _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \ + _mm256_xor_si256( qt[b], qt[c] ) ) ); + +#define DH2R( m, rl, sr, h, a, b, c ) \ + _mm256_add_epi32( _mm256_add_epi32( \ + mm256_rol_32( dH[h], rl ), \ + _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \ + _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \ + _mm256_xor_si256( qt[b], qt[c] ) ) ); + + dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ); + dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 ); + dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 ); + dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 ); + dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 ); + dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 ); + dH[ 6] = DH1R( 6, 4, 6, 22, 30, 
6 ); + dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 ); + dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 ); + dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 ); + dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 ); + dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 ); + dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 ); + dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 ); + dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 ); + dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 ); + +#undef DH1L +#undef DH1R +#undef DH2L +#undef DH2R + +/* dH[ 0] = _mm256_add_epi32( _mm256_xor_si256( M[0], _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ), @@ -954,6 +1005,7 @@ void compress_small_8way( const __m256i *M, const __m256i H[16], _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )), _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ), _mm256_xor_si256( qt[22], qt[15] ) ) ); +*/ } static const __m256i final_s8[16] = diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index c9a4012..5a4af53 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -64,10 +64,10 @@ static void transform_4way( cube_4way_context *sp ) x1 = _mm512_xor_si512( x1, x5 ); x2 = _mm512_xor_si512( x2, x6 ); x3 = _mm512_xor_si512( x3, x7 ); - x4 = mm512_swap64_128( x4 ); - x5 = mm512_swap64_128( x5 ); - x6 = mm512_swap64_128( x6 ); - x7 = mm512_swap64_128( x7 ); + x4 = mm512_swap128_64( x4 ); + x5 = mm512_swap128_64( x5 ); + x6 = mm512_swap128_64( x6 ); + x7 = mm512_swap128_64( x7 ); x4 = _mm512_add_epi32( x0, x4 ); x5 = _mm512_add_epi32( x1, x5 ); x6 = _mm512_add_epi32( x2, x6 ); @@ -82,10 +82,10 @@ static void transform_4way( cube_4way_context *sp ) x1 = _mm512_xor_si512( x1, x5 ); x2 = _mm512_xor_si512( x2, x6 ); x3 = _mm512_xor_si512( x3, x7 ); - x4 = mm512_swap32_64( x4 ); - x5 = mm512_swap32_64( x5 ); - x6 = mm512_swap32_64( x6 ); - x7 = mm512_swap32_64( x7 ); + x4 = mm512_swap64_32( x4 ); + x5 = mm512_swap64_32( x5 ); + x6 = mm512_swap64_32( x6 ); + x7 = mm512_swap64_32( x7 ); } _mm512_store_si512( (__m512i*)sp->h, x0 ); @@ -239,10 +239,10 @@ static void transform_2way( cube_2way_context *sp ) x1 = _mm256_xor_si256( x1, x5 ); x2 = _mm256_xor_si256( x2, x6 ); x3 = _mm256_xor_si256( x3, x7 ); - x4 = mm256_swap64_128( x4 ); - x5 = mm256_swap64_128( x5 ); - x6 = mm256_swap64_128( x6 ); - x7 = mm256_swap64_128( x7 ); + x4 = mm256_swap128_64( x4 ); + x5 = mm256_swap128_64( x5 ); + x6 = mm256_swap128_64( x6 ); + x7 = mm256_swap128_64( x7 ); x4 = _mm256_add_epi32( x0, x4 ); x5 = _mm256_add_epi32( x1, x5 ); x6 = _mm256_add_epi32( x2, x6 ); @@ -257,10 +257,10 @@ static void transform_2way( cube_2way_context *sp ) x1 = _mm256_xor_si256( x1, x5 ); x2 = _mm256_xor_si256( x2, x6 ); x3 = _mm256_xor_si256( x3, x7 ); - x4 = mm256_swap32_64( x4 ); - x5 = mm256_swap32_64( x5 ); - x6 = mm256_swap32_64( x6 ); - x7 = mm256_swap32_64( x7 ); + x4 = mm256_swap64_32( x4 ); + x5 = mm256_swap64_32( x5 ); + x6 = mm256_swap64_32( x6 ); + x7 = mm256_swap64_32( x7 ); } _mm256_store_si256( (__m256i*)sp->h, x0 ); diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index 7f6591f..8b9d010 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -39,8 +39,8 @@ static void transform( cubehashParam *sp ) x1 = mm256_rol_32( y0, 7 ); x0 = _mm256_xor_si256( x0, x2 ); x1 = _mm256_xor_si256( x1, x3 ); - x2 = mm256_swap64_128( x2 ); - x3 = mm256_swap64_128( x3 ); + x2 = mm256_swap128_64( x2 ); + x3 = mm256_swap128_64( x3 ); x2 = _mm256_add_epi32( x0, x2 ); x3 = _mm256_add_epi32( x1, x3 ); y0 = mm256_swap_128( x0 ); @@ -49,8 +49,8 @@ static void transform( 
cubehashParam *sp ) x1 = mm256_rol_32( y1, 11 ); x0 = _mm256_xor_si256( x0, x2 ); x1 = _mm256_xor_si256( x1, x3 ); - x2 = mm256_swap32_64( x2 ); - x3 = mm256_swap32_64( x3 ); + x2 = mm256_swap64_32( x2 ); + x3 = mm256_swap64_32( x3 ); } _mm256_store_si256( (__m256i*)sp->x, x0 ); diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 24d8ce8..0a1e6e2 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -528,6 +528,346 @@ static const sph_u32 T512[64][16] = { SPH_C32(0xe7e00a94) } }; +#define s0 m0 +#define s1 c0 +#define s2 m1 +#define s3 c1 +#define s4 c2 +#define s5 m2 +#define s6 c3 +#define s7 m3 +#define s8 m4 +#define s9 c4 +#define sA m5 +#define sB c5 +#define sC c6 +#define sD m6 +#define sE c7 +#define sF m7 + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// Hamsi 8 way + +#define INPUT_BIG8 \ +do { \ + __m512i db = *buf; \ + const uint64_t *tp = (uint64_t*)&T512[0][0]; \ + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \ + for ( int u = 0; u < 64; u++ ) \ + { \ + __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \ + dm = mm512_negate_32( _mm512_or_si512( dm, \ + _mm512_slli_epi64( dm, 32 ) ) ); \ + m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \ + m512_const1_64( tp[0] ) ) ); \ + m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \ + m512_const1_64( tp[1] ) ) ); \ + m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \ + m512_const1_64( tp[2] ) ) ); \ + m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \ + m512_const1_64( tp[3] ) ) ); \ + m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \ + m512_const1_64( tp[4] ) ) ); \ + m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \ + m512_const1_64( tp[5] ) ) ); \ + m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \ + m512_const1_64( tp[6] ) ) ); \ + m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \ + m512_const1_64( tp[7] ) ) ); \ + tp += 8; \ + db = _mm512_srli_epi64( db, 1 ); \ + } \ +} while (0) + +#define SBOX8( a, b, c, d ) \ +do { \ + __m512i t; \ + t = a; \ + a = _mm512_and_si512( a, c ); \ + a = _mm512_xor_si512( a, d ); \ + c = _mm512_xor_si512( c, b ); \ + c = _mm512_xor_si512( c, a ); \ + d = _mm512_or_si512( d, t ); \ + d = _mm512_xor_si512( d, b ); \ + t = _mm512_xor_si512( t, c ); \ + b = d; \ + d = _mm512_or_si512( d, t ); \ + d = _mm512_xor_si512( d, a ); \ + a = _mm512_and_si512( a, b ); \ + t = _mm512_xor_si512( t, a ); \ + b = _mm512_xor_si512( b, d ); \ + b = _mm512_xor_si512( b, t ); \ + a = c; \ + c = b; \ + b = d; \ + d = mm512_not( t ); \ +} while (0) + +#define L8( a, b, c, d ) \ +do { \ + a = mm512_rol_32( a, 13 ); \ + c = mm512_rol_32( c, 3 ); \ + b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \ + d = _mm512_xor_si512( d, _mm512_xor_si512( c, \ + _mm512_slli_epi32( a, 3 ) ) ); \ + b = mm512_rol_32( b, 1 ); \ + d = mm512_rol_32( d, 7 ); \ + a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \ + c = _mm512_xor_si512( c, _mm512_xor_si512( d, \ + _mm512_slli_epi32( b, 7 ) ) ); \ + a = mm512_rol_32( a, 5 ); \ + c = mm512_rol_32( c, 22 ); \ +} while (0) + +#define DECL_STATE_BIG8 \ + __m512i c0, c1, c2, c3, c4, c5, c6, c7; \ + +#define READ_STATE_BIG8(sc) \ +do { \ + c0 = sc->h[0x0]; \ + c1 = sc->h[0x1]; \ + c2 = sc->h[0x2]; \ + c3 = sc->h[0x3]; \ + c4 = sc->h[0x4]; \ + c5 = sc->h[0x5]; \ + c6 = sc->h[0x6]; \ + c7 = sc->h[0x7]; \ +} while (0) + +#define WRITE_STATE_BIG8(sc) \ +do { \ + sc->h[0x0] = c0; \ + sc->h[0x1] = c1; \ + sc->h[0x2] = c2; \ + sc->h[0x3] = c3; \ + sc->h[0x4] = c4; \ + 
sc->h[0x5] = c5; \ + sc->h[0x6] = c6; \ + sc->h[0x7] = c7; \ +} while (0) + + +#define ROUND_BIG8(rc, alpha) \ +do { \ + __m512i t0, t1, t2, t3; \ + s0 = _mm512_xor_si512( s0, m512_const1_64( \ + ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ + s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ + s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ + s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ + s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ + s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ + s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ + s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ + s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ + s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ + sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ + sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ + sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ + sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ + sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ + sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ +\ + SBOX8( s0, s4, s8, sC ); \ + SBOX8( s1, s5, s9, sD ); \ + SBOX8( s2, s6, sA, sE ); \ + SBOX8( s3, s7, sB, sF ); \ +\ + t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \ + _mm512_bslli_epi128( s5, 4 ) ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \ + _mm512_bslli_epi128( sE, 4 ) ); \ + L8( s0, t1, s9, t3 ); \ + s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \ + s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \ + sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \ + sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \ +\ + t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ + _mm512_bslli_epi128( s6, 4 ) ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \ + _mm512_bslli_epi128( sF, 4 ) ); \ + L8( s1, t1, sA, t3 ); \ + s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ + s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \ + sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \ + sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \ +\ + t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \ + _mm512_bslli_epi128( s7, 4 ) ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \ + _mm512_bslli_epi128( sC, 4 ) ); \ + L8( s2, t1, sB, t3 ); \ + s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \ + s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \ + sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \ + sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \ +\ + t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \ + _mm512_bslli_epi128( s4, 4 ) ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \ + _mm512_bslli_epi128( sD, 4 ) ); \ + L8( s3, t1, s8, t3 ); \ + s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \ 
+ s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \ + sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \ + sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \ +\ + t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \ + _mm512_bslli_epi128( sB, 4 ) ); \ + L8( t0, t1, t2, t3 ); \ + s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \ + s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \ + s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \ + s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \ + s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \ + sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \ + s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \ + sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \ +\ + t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ + _mm512_bslli_epi128( sD, 4 ) ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \ + L8( t0, t1, t2, t3 ); \ + s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \ + sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \ + s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ + sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \ + s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \ + sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \ + s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \ + sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \ +} while (0) + +#define P_BIG8 \ +do { \ + ROUND_BIG8(0, alpha_n); \ + ROUND_BIG8(1, alpha_n); \ + ROUND_BIG8(2, alpha_n); \ + ROUND_BIG8(3, alpha_n); \ + ROUND_BIG8(4, alpha_n); \ + ROUND_BIG8(5, alpha_n); \ +} while (0) + +#define PF_BIG8 \ +do { \ + ROUND_BIG8( 0, alpha_f); \ + ROUND_BIG8( 1, alpha_f); \ + ROUND_BIG8( 2, alpha_f); \ + ROUND_BIG8( 3, alpha_f); \ + ROUND_BIG8( 4, alpha_f); \ + ROUND_BIG8( 5, alpha_f); \ + ROUND_BIG8( 6, alpha_f); \ + ROUND_BIG8( 7, alpha_f); \ + ROUND_BIG8( 8, alpha_f); \ + ROUND_BIG8( 9, alpha_f); \ + ROUND_BIG8(10, alpha_f); \ + ROUND_BIG8(11, alpha_f); \ +} while (0) + +#define T_BIG8 \ +do { /* order is important */ \ + c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \ + c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \ + c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \ + c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \ + c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \ + c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \ + c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \ + c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \ +} while (0) + +void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) +{ + DECL_STATE_BIG8 + uint32_t tmp = num << 6; + + sc->count_low = SPH_T32( sc->count_low + tmp ); + sc->count_high += (sph_u32)( (num >> 13) >> 13 ); + if ( sc->count_low < tmp ) + sc->count_high++; + + READ_STATE_BIG8( sc ); + while ( num-- > 0 ) + { + __m512i m0, m1, m2, m3, m4, m5, m6, m7; + + INPUT_BIG8; + P_BIG8; + T_BIG8; + 
buf++; + } + WRITE_STATE_BIG8( sc ); +} + +void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf ) +{ + __m512i m0, m1, m2, m3, m4, m5, m6, m7; + DECL_STATE_BIG8 + READ_STATE_BIG8( sc ); + INPUT_BIG8; + PF_BIG8; + T_BIG8; + WRITE_STATE_BIG8( sc ); +} + + +void hamsi512_8way_init( hamsi_8way_big_context *sc ) +{ + sc->partial_len = 0; + sc->count_high = sc->count_low = 0; + + sc->h[0] = m512_const1_64( 0x6c70617273746565 ); + sc->h[1] = m512_const1_64( 0x656e62656b204172 ); + sc->h[2] = m512_const1_64( 0x302c206272672031 ); + sc->h[3] = m512_const1_64( 0x3434362c75732032 ); + sc->h[4] = m512_const1_64( 0x3030312020422d33 ); + sc->h[5] = m512_const1_64( 0x656e2d484c657576 ); + sc->h[6] = m512_const1_64( 0x6c65652c65766572 ); + sc->h[7] = m512_const1_64( 0x6769756d2042656c ); +} + +void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data, + size_t len ) +{ + __m512i *vdata = (__m512i*)data; + + hamsi_8way_big( sc, vdata, len>>3 ); + vdata += ( (len& ~(size_t)7) >> 3 ); + len &= (size_t)7; + memcpy_512( sc->buf, vdata, len>>3 ); + sc->partial_len = len; +} + +void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) +{ + __m512i pad[1]; + int ch, cl; + + sph_enc32be( &ch, sc->count_high ); + sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch, + cl, ch, cl, ch, cl, ch, cl, ch ); +// pad[0] = m512_const2_32( cl, ch ); + sc->buf[0] = m512_const1_64( 0x80 ); + hamsi_8way_big( sc, sc->buf, 1 ); + hamsi_8way_big_final( sc, pad ); + + mm512_block_bswap_32( (__m512i*)dst, sc->h ); +} + + +#endif // AVX512 + + +// Hamsi 4 way #define INPUT_BIG \ do { \ @@ -627,6 +967,7 @@ do { \ sc->h[0x7] = c7; \ } while (0) +/* #define s0 m0 #define s1 c0 #define s2 m1 @@ -643,42 +984,28 @@ do { \ #define sD m6 #define sE c7 #define sF m7 +*/ #define ROUND_BIG(rc, alpha) \ do { \ __m256i t0, t1, t2, t3; \ s0 = _mm256_xor_si256( s0, m256_const1_64( \ - ( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \ - s1 = _mm256_xor_si256( s1, m256_const1_64( \ - ( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \ - s2 = _mm256_xor_si256( s2, m256_const1_64( \ - ( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \ - s3 = _mm256_xor_si256( s3, m256_const1_64( \ - ( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \ - s4 = _mm256_xor_si256( s4, m256_const1_64( \ - ( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \ - s5 = _mm256_xor_si256( s5, m256_const1_64( \ - ( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \ - s6 = _mm256_xor_si256( s6, m256_const1_64( \ - ( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \ - s7 = _mm256_xor_si256( s7, m256_const1_64( \ - ( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \ - s8 = _mm256_xor_si256( s8, m256_const1_64( \ - ( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \ - s9 = _mm256_xor_si256( s9, m256_const1_64( \ - ( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \ - sA = _mm256_xor_si256( sA, m256_const1_64( \ - ( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \ - sB = _mm256_xor_si256( sB, m256_const1_64( \ - ( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \ - sC = _mm256_xor_si256( sC, m256_const1_64( \ - ( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \ - sD = _mm256_xor_si256( sD, m256_const1_64( \ - ( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \ - sE = _mm256_xor_si256( sE, m256_const1_64( \ - ( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \ - sF = 
_mm256_xor_si256( sF, m256_const1_64( \ - ( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \ + ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ + s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ + s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ + s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ + s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ + s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ + s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ + s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ + s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ + s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ + sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ + sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ + sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ + sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ + sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ + sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ \ SBOX( s0, s4, s8, sC ); \ SBOX( s1, s5, s9, sD ); \ diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index f70f3fe..4e57f10 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -60,9 +60,32 @@ typedef struct { typedef hamsi_4way_big_context hamsi512_4way_context; void hamsi512_4way_init( hamsi512_4way_context *sc ); -void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len ); +void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data, + size_t len ); +#define hamsi512_4way hamsi512_4way_update void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst ); +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +typedef struct { + __m512i h[8]; + __m512i buf[1]; + size_t partial_len; + sph_u32 count_high, count_low; +} hamsi_8way_big_context; + +typedef hamsi_8way_big_context hamsi512_8way_context; + +void hamsi512_8way_init( hamsi512_8way_context *sc ); +void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data, + size_t len ); +void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst ); + + + +#endif + + #ifdef __cplusplus } #endif diff --git a/algo/haval/haval-8way-helper.c b/algo/haval/haval-8way-helper.c new file mode 100644 index 0000000..82187f5 --- /dev/null +++ b/algo/haval/haval-8way-helper.c @@ -0,0 +1,115 @@ +/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */ +/* + * Helper code, included (three times !) by HAVAL implementation. + * + * TODO: try to merge this with md_helper.c. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#undef SPH_XCAT +#define SPH_XCAT(a, b) SPH_XCAT_(a, b) +#undef SPH_XCAT_ +#define SPH_XCAT_(a, b) a ## b + +static void +SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_update) +( haval_8way_context *sc, const void *data, size_t len ) +{ + __m256i *vdata = (__m256i*)data; + unsigned current; + + current = (unsigned)sc->count_low & 127U; + while ( len > 0 ) + { + unsigned clen; + uint32_t clow, clow2; + + clen = 128U - current; + if ( clen > len ) + clen = len; + memcpy_256( sc->buf + (current>>2), vdata, clen>>2 ); + vdata += clen>>2; + current += clen; + len -= clen; + if ( current == 128U ) + { + DSTATE_8W; + IN_PREPARE_8W(sc->buf); + RSTATE_8W; + SPH_XCAT(CORE_8W, PASSES)(INW_8W); + WSTATE_8W; + current = 0; + } + clow = sc->count_low; + clow2 = clow + clen; + sc->count_low = clow2; + if ( clow2 < clow ) + sc->count_high ++; + } +} + +static void +SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc, + void *dst) +{ + unsigned current; + DSTATE_8W; + + current = (unsigned)sc->count_low & 127UL; + + sc->buf[ current>>2 ] = m256_one_32; + current += 4; + RSTATE_8W; + if ( current > 116UL ) + { + memset_zero_256( sc->buf + ( current>>2 ), (128UL-current) >> 2 ); + do + { + IN_PREPARE_8W(sc->buf); + SPH_XCAT(CORE_8W, PASSES)(INW_8W); + } while (0); + current = 0; + } + + uint32_t t1, t2; + memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 ); + t1 = 0x01 | (PASSES << 3); + t2 = sc->olen << 3; + sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) ); + sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 ); + sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3) + | (sc->count_low >> 29) ); + do + { + IN_PREPARE_8W(sc->buf); + SPH_XCAT(CORE_8W, PASSES)(INW_8W); + } while (0); + WSTATE_8W; + haval_8way_out( sc, dst ); +} diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index 35cfd17..02df40f 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -40,7 +40,7 @@ #include #include "haval-hash-4way.h" -// won't compile with sse4.2 +// won't compile with sse4.2, not a problem, it's only used with AVX2 4 way. 
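/* Illustration only: a minimal sketch of how the new 8-way HAVAL-256 API
   declared in haval-hash-4way.h is expected to be driven, mirroring the
   existing 4-way usage. The function and buffer names here are hypothetical;
   input data must already be interleaved 8x32-bit. */
static inline void haval256_8way_example( const void *vdata, void *vhash )
{
   haval256_5_8way_context ctx;
   haval256_5_8way_init( &ctx );
   haval256_5_8way_update( &ctx, vdata, 64 );  // 64 bytes per lane
   haval256_5_8way_close( &ctx, vhash );       // 8 interleaved 256-bit digests
}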
//#if defined (__SSE4_2__) #if defined(__AVX__) @@ -518,6 +518,301 @@ do { \ #define INMSG(i) msg[i] +#if defined(__AVX2__) + +// Haval-256 8 way 32 bit avx2 + +#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \ + _mm256_xor_si256( x0, \ + _mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \ + _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \ + _mm256_and_si256( x3, x6 ) ) ) ) \ + +#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \ + _mm256_xor_si256( \ + _mm256_and_si256( x2, \ + _mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \ + _mm256_xor_si256( _mm256_and_si256( x4, x5 ), \ + _mm256_xor_si256( x6, x0 ) ) ) ), \ + _mm256_xor_si256( \ + _mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \ + _mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \ + +#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \ + _mm256_xor_si256( \ + _mm256_and_si256( x3, \ + _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ + _mm256_xor_si256( x6, x0 ) ) ), \ + _mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \ + _mm256_and_si256( x2, x5 ) ), x0 ) ) + +#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \ + _mm256_xor_si256( \ + _mm256_xor_si256( \ + _mm256_and_si256( x3, \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ + _mm256_or_si256( x4, x6 ) ), x5 ) ), \ + _mm256_and_si256( x4, \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \ + _mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \ + _mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) ) + + +#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \ + _mm256_xor_si256( \ + _mm256_and_si256( x0, \ + mm256_not( _mm256_xor_si256( \ + _mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \ + _mm256_and_si256( x2, x5 ) ), \ + _mm256_and_si256( x3, x6 ) ) ) + +#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \ + F1_8W(x1, x0, x3, x5, x6, x2, x4) +#define FP3_2_8W(x6, x5, x4, x3, x2, x1, x0) \ + F2_8W(x4, x2, x1, x0, x5, x3, x6) +#define FP3_3_8W(x6, x5, x4, x3, x2, x1, x0) \ + F3_8W(x6, x1, x2, x3, x4, x5, x0) + +#define FP4_1_8W(x6, x5, x4, x3, x2, x1, x0) \ + F1_8W(x2, x6, x1, x4, x5, x3, x0) +#define FP4_2_8W(x6, x5, x4, x3, x2, x1, x0) \ + F2_8W(x3, x5, x2, x0, x1, x6, x4) +#define FP4_3_8W(x6, x5, x4, x3, x2, x1, x0) \ + F3_8W(x1, x4, x3, x6, x0, x2, x5) +#define FP4_4_8W(x6, x5, x4, x3, x2, x1, x0) \ + F4_8W(x6, x4, x0, x5, x2, x1, x3) + +#define FP5_1_8W(x6, x5, x4, x3, x2, x1, x0) \ + F1_8W(x3, x4, x1, x0, x5, x2, x6) +#define FP5_2_8W(x6, x5, x4, x3, x2, x1, x0) \ + F2_8W(x6, x2, x1, x0, x3, x4, x5) +#define FP5_3_8W(x6, x5, x4, x3, x2, x1, x0) \ + F3_8W(x2, x6, x0, x4, x3, x1, x5) +#define FP5_4_8W(x6, x5, x4, x3, x2, x1, x0) \ + F4_8W(x1, x5, x3, x2, x0, x4, x6) +#define FP5_5_8W(x6, x5, x4, x3, x2, x1, x0) \ + F5_8W(x2, x5, x0, x6, x4, x3, x1) + +#define STEP_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \ +do { \ + __m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \ + x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \ + mm256_ror_32( x7, 11 ) ), \ + _mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \ +} while (0) + +#define PASS1_8W(n, in) do { \ + unsigned pass_count; \ + for (pass_count = 0; pass_count < 32; pass_count += 8) { \ + STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \ + in(pass_count + 0), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \ + in(pass_count + 1), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \ + in(pass_count + 2), SPH_C32(0x00000000)); \ + STEP_8W(n, 
1, s4, s3, s2, s1, s0, s7, s6, s5, \ + in(pass_count + 3), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \ + in(pass_count + 4), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \ + in(pass_count + 5), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \ + in(pass_count + 6), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \ + in(pass_count + 7), SPH_C32(0x00000000)); \ + } \ + } while (0) + +#define PASSG_8W(p, n, in) do { \ + unsigned pass_count; \ + for (pass_count = 0; pass_count < 32; pass_count += 8) { \ + STEP_8W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \ + in(MP ## p[pass_count + 0]), \ + RK ## p[pass_count + 0]); \ + STEP_8W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \ + in(MP ## p[pass_count + 1]), \ + RK ## p[pass_count + 1]); \ + STEP_8W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \ + in(MP ## p[pass_count + 2]), \ + RK ## p[pass_count + 2]); \ + STEP_8W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \ + in(MP ## p[pass_count + 3]), \ + RK ## p[pass_count + 3]); \ + STEP_8W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \ + in(MP ## p[pass_count + 4]), \ + RK ## p[pass_count + 4]); \ + STEP_8W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \ + in(MP ## p[pass_count + 5]), \ + RK ## p[pass_count + 5]); \ + STEP_8W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \ + in(MP ## p[pass_count + 6]), \ + RK ## p[pass_count + 6]); \ + STEP_8W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \ + in(MP ## p[pass_count + 7]), \ + RK ## p[pass_count + 7]); \ + } \ + } while (0) + +#define PASS2_8W(n, in) PASSG_8W(2, n, in) +#define PASS3_8W(n, in) PASSG_8W(3, n, in) +#define PASS4_8W(n, in) PASSG_8W(4, n, in) +#define PASS5_8W(n, in) PASSG_8W(5, n, in) + +#define SAVE_STATE_8W \ + __m256i u0, u1, u2, u3, u4, u5, u6, u7; \ + do { \ + u0 = s0; \ + u1 = s1; \ + u2 = s2; \ + u3 = s3; \ + u4 = s4; \ + u5 = s5; \ + u6 = s6; \ + u7 = s7; \ + } while (0) + +#define UPDATE_STATE_8W \ +do { \ + s0 = _mm256_add_epi32( s0, u0 ); \ + s1 = _mm256_add_epi32( s1, u1 ); \ + s2 = _mm256_add_epi32( s2, u2 ); \ + s3 = _mm256_add_epi32( s3, u3 ); \ + s4 = _mm256_add_epi32( s4, u4 ); \ + s5 = _mm256_add_epi32( s5, u5 ); \ + s6 = _mm256_add_epi32( s6, u6 ); \ + s7 = _mm256_add_epi32( s7, u7 ); \ +} while (0) + +#define CORE_8W5(in) do { \ + SAVE_STATE_8W; \ + PASS1_8W(5, in); \ + PASS2_8W(5, in); \ + PASS3_8W(5, in); \ + PASS4_8W(5, in); \ + PASS5_8W(5, in); \ + UPDATE_STATE_8W; \ + } while (0) + +#define DSTATE_8W __m256i s0, s1, s2, s3, s4, s5, s6, s7 + +#define RSTATE_8W \ +do { \ + s0 = sc->s0; \ + s1 = sc->s1; \ + s2 = sc->s2; \ + s3 = sc->s3; \ + s4 = sc->s4; \ + s5 = sc->s5; \ + s6 = sc->s6; \ + s7 = sc->s7; \ +} while (0) + +#define WSTATE_8W \ +do { \ + sc->s0 = s0; \ + sc->s1 = s1; \ + sc->s2 = s2; \ + sc->s3 = s3; \ + sc->s4 = s4; \ + sc->s5 = s5; \ + sc->s6 = s6; \ + sc->s7 = s7; \ +} while (0) + +static void +haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes ) +{ + sc->s0 = m256_const1_32( 0x243F6A88UL ); + sc->s1 = m256_const1_32( 0x85A308D3UL ); + sc->s2 = m256_const1_32( 0x13198A2EUL ); + sc->s3 = m256_const1_32( 0x03707344UL ); + sc->s4 = m256_const1_32( 0xA4093822UL ); + sc->s5 = m256_const1_32( 0x299F31D0UL ); + sc->s6 = m256_const1_32( 0x082EFA98UL ); + sc->s7 = m256_const1_32( 0xEC4E6C89UL ); + sc->olen = olen; + sc->passes = passes; + sc->count_high = 0; + sc->count_low = 0; + +} +#define IN_PREPARE_8W(indata) const __m256i *const load_ptr_8w = (indata) + +#define INW_8W(i) load_ptr_8w[ i ] + +static void +haval_8way_out( 
haval_8way_context *sc, void *dst ) +{ + __m256i *buf = (__m256i*)dst; + DSTATE_8W; + RSTATE_8W; + + buf[0] = s0; + buf[1] = s1; + buf[2] = s2; + buf[3] = s3; + buf[4] = s4; + buf[5] = s5; + buf[6] = s6; + buf[7] = s7; +} + +#undef PASSES +#define PASSES 5 +#include "haval-8way-helper.c" + +#define API_8W(xxx, y) \ +void \ +haval ## xxx ## _ ## y ## _8way_init(void *cc) \ +{ \ + haval_8way_init(cc, xxx >> 5, y); \ +} \ + \ +void \ +haval ## xxx ## _ ## y ## _8way_update (void *cc, const void *data, size_t len) \ +{ \ + haval ## y ## _8way_update(cc, data, len); \ +} \ + \ +void \ +haval ## xxx ## _ ## y ## _8way_close(void *cc, void *dst) \ +{ \ + haval ## y ## _8way_close(cc, dst); \ +} \ + +API_8W(256, 5) + +#define RVAL_8W \ +do { \ + s0 = val[0]; \ + s1 = val[1]; \ + s2 = val[2]; \ + s3 = val[3]; \ + s4 = val[4]; \ + s5 = val[5]; \ + s6 = val[6]; \ + s7 = val[7]; \ +} while (0) + +#define WVAL_8W \ +do { \ + val[0] = s0; \ + val[1] = s1; \ + val[2] = s2; \ + val[3] = s3; \ + val[4] = s4; \ + val[5] = s5; \ + val[6] = s6; \ + val[7] = s7; \ +} while (0) + +#define INMSG_8W(i) msg[i] + + + +#endif // AVX2 + #ifdef __cplusplus } #endif diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h index 47338ce..9bd37ba 100644 --- a/algo/haval/haval-hash-4way.h +++ b/algo/haval/haval-hash-4way.h @@ -59,7 +59,7 @@ */ #ifndef HAVAL_HASH_4WAY_H__ -#define HAVAL_HASH_4WAY_H__ +#define HAVAL_HASH_4WAY_H__ 1 #if defined(__AVX__) @@ -84,10 +84,30 @@ typedef haval_4way_context haval256_5_4way_context; void haval256_5_4way_init( void *cc ); -void haval256_5_4way( void *cc, const void *data, size_t len ); +void haval256_5_4way_update( void *cc, const void *data, size_t len ); +#define haval256_5_4way haval256_5_4way_update void haval256_5_4way_close( void *cc, void *dst ); +#if defined(__AVX2__) + +typedef struct { + __m256i buf[32]; + __m256i s0, s1, s2, s3, s4, s5, s6, s7; + unsigned olen, passes; + uint32_t count_high, count_low; +} haval_8way_context __attribute__ ((aligned (64))); + +typedef haval_8way_context haval256_5_8way_context; + +void haval256_5_8way_init( void *cc ); + +void haval256_5_8way_update( void *cc, const void *data, size_t len ); + +void haval256_5_8way_close( void *cc, void *dst ); + +#endif // AVX2 + #ifdef __cplusplus } #endif diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index b608ba2..4b1f7e6 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -44,8 +44,13 @@ bool lyra2rev3_thread_init() { const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + int size = ROW_LEN_BYTES * 4; // nRows; - int size = (int64_t)ROW_LEN_BYTES * 4; // nRows; +#if defined(LYRA2REV3_16WAY) +// l2v3_wholeMatrix = _mm_malloc( 2*size, 128 ); + l2v3_wholeMatrix = _mm_malloc( 2*size, 64 ); + init_lyra2rev3_16way_ctx();; +#else l2v3_wholeMatrix = _mm_malloc( size, 64 ); #if defined (LYRA2REV3_8WAY) init_lyra2rev3_8way_ctx();; @@ -53,13 +58,17 @@ bool lyra2rev3_thread_init() init_lyra2rev3_4way_ctx();; #else init_lyra2rev3_ctx(); +#endif #endif return l2v3_wholeMatrix; } bool register_lyra2rev3_algo( algo_gate_t* gate ) { -#if defined (LYRA2REV3_8WAY) +#if defined(LYRA2REV3_16WAY) + gate->scanhash = (void*)&scanhash_lyra2rev3_16way; + gate->hash = (void*)&lyra2rev3_16way_hash; +#elif defined (LYRA2REV3_8WAY) gate->scanhash = (void*)&scanhash_lyra2rev3_8way; gate->hash = (void*)&lyra2rev3_8way_hash; #elif defined (LYRA2REV3_4WAY) @@ -69,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate ) 
gate->scanhash = (void*)&scanhash_lyra2rev3; gate->hash = (void*)&lyra2rev3_hash; #endif - gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; gate->miner_thread_init = (void*)&lyra2rev3_thread_init; opt_target_factor = 256.0; return true; @@ -85,10 +94,14 @@ bool lyra2rev2_thread_init() const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; int size = (int64_t)ROW_LEN_BYTES * 4; // nRows; +#if defined (LYRA2REV2_8WAY) + l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 ); // 2 way + init_lyra2rev2_8way_ctx();; +#elif defined (LYRA2REV2_4WAY) l2v2_wholeMatrix = _mm_malloc( size, 64 ); -#if defined (LYRA2REV2_4WAY) init_lyra2rev2_4way_ctx();; #else + l2v2_wholeMatrix = _mm_malloc( size, 64 ); init_lyra2rev2_ctx(); #endif return l2v2_wholeMatrix; @@ -96,14 +109,17 @@ bool lyra2rev2_thread_init() bool register_lyra2rev2_algo( algo_gate_t* gate ) { -#if defined (LYRA2REV2_4WAY) +#if defined (LYRA2REV2_8WAY) + gate->scanhash = (void*)&scanhash_lyra2rev2_8way; + gate->hash = (void*)&lyra2rev2_8way_hash; +#elif defined (LYRA2REV2_4WAY) gate->scanhash = (void*)&scanhash_lyra2rev2_4way; gate->hash = (void*)&lyra2rev2_4way_hash; #else gate->scanhash = (void*)&scanhash_lyra2rev2; gate->hash = (void*)&lyra2rev2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; gate->miner_thread_init = (void*)&lyra2rev2_thread_init; opt_target_factor = 256.0; return true; diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h index 8a392ca..5c48bdc 100644 --- a/algo/lyra2/lyra2-gate.h +++ b/algo/lyra2/lyra2-gate.h @@ -5,18 +5,27 @@ #include #include "lyra2.h" -#if defined(__AVX2__) - #define LYRA2REV3_8WAY -#endif -#if defined(__SSE2__) - #define LYRA2REV3_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define LYRA2REV3_16WAY 1 +#elif defined(__AVX2__) + #define LYRA2REV3_8WAY 1 +#elif defined(__SSE2__) + #define LYRA2REV3_4WAY 1 #endif extern __thread uint64_t* l2v3_wholeMatrix; bool register_lyra2rev3_algo( algo_gate_t* gate ); -#if defined(LYRA2REV3_8WAY) + +#if defined(LYRA2REV3_16WAY) + +void lyra2rev3_16way_hash( void *state, const void *input ); +int scanhash_lyra2rev3_16way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +bool init_lyra2rev3_16way_ctx(); + +#elif defined(LYRA2REV3_8WAY) void lyra2rev3_8way_hash( void *state, const void *input ); int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce, @@ -41,15 +50,24 @@ bool init_lyra2rev3_ctx(); ////////////////////////////////// -#if defined(__AVX2__) - #define LYRA2REV2_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define LYRA2REV2_8WAY 1 +#elif defined(__AVX2__) + #define LYRA2REV2_4WAY 1 #endif extern __thread uint64_t* l2v2_wholeMatrix; bool register_lyra2rev2_algo( algo_gate_t* gate ); -#if defined(LYRA2REV2_4WAY) +#if defined(LYRA2REV2_8WAY) + +void lyra2rev2_8way_hash( void *state, const void *input ); +int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +bool init_lyra2rev2_8way_ctx(); + +#elif defined(LYRA2REV2_4WAY) void lyra2rev2_4way_hash( void *state, const void *input ); int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/lyra2/lyra2-hash-2way.c b/algo/lyra2/lyra2-hash-2way.c index 
b657af0..b69eb09 100644 --- a/algo/lyra2/lyra2-hash-2way.c +++ b/algo/lyra2/lyra2-hash-2way.c @@ -26,6 +26,19 @@ #include "lyra2.h" #include "sponge.h" +// LYRA2RE 8 cols 8 rows used by lyea2re, allium, phi2, x22i, x25x. +// +// LYRA2REV2 4 cols 4 rows used by lyra2rev2. +// +// LYRA2REV3 4 cols 4 rows with an extra twist in calculating +// rowa in the wandering phase. Used by lyra2rev3. +// +// LYRA2Z various cols & rows and supports 80 input. Used by lyra2z, +// lyra2z330, lyra2h, + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + /** * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits, @@ -46,176 +59,137 @@ * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) */ -int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, - const uint64_t pwdlen, const void *salt, const uint64_t saltlen, - const uint64_t timeCost, const uint64_t nRows, - const uint64_t nCols ) +// For lyra2rev3. +// convert a simple offset to an index into interleaved data. +// good for state and 4 row matrix. +// index = ( int( off / 4 ) * 2 ) + ( off mod 4 ) + +#define offset_to_index( o ) \ + ( ( ( (uint64_t)( (o) & 0xf) / 4 ) * 8 ) + ( (o) % 4 ) ) + + +int LYRA2REV2_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen, + const void *pwd, const uint64_t pwdlen, const uint64_t timeCost, + const uint64_t nRows, const uint64_t nCols ) { //====================== Basic variables ============================// - uint64_t _ALIGN(256) state[16]; - int64_t row = 2; //index of row to be processed - int64_t prev = 1; //index of prev (last row ever computed/modified) - int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) - int64_t tau; //Time Loop iterator - int64_t step = 1; //Visitation step (used during Setup and Wandering phases) - int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) - int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 -// int64_t i; //auxiliary iteration counter - int64_t v64; // 64bit var for memcpy + uint64_t _ALIGN(256) state[32]; + int64_t row = 2; + int64_t prev = 1; + int64_t rowa0 = 0; + int64_t rowa1 = 0; + int64_t tau; + int64_t step = 1; + int64_t window = 2; + int64_t gap = 1; //====================================================================/ - //=== Initializing the Memory Matrix and pointers to it =============// - //Tries to allocate enough space for the whole memory matrix - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; -// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 const int64_t BLOCK_LEN = (nCols == 4) ? 
BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; uint64_t *ptrWord = wholeMatrix; -// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows ); - - //=== Getting the password + salt + basil padded with 10*1 ==========// - //OBS.:The memory matrix will temporarily hold the password: not for saving memory, - //but this ensures that the password copied locally will be overwritten as soon as possible - - //First, we clean enough blocks for the password, salt, basil and padding - int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) ) + int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1; - byte *ptrByte = (byte*) wholeMatrix; + uint64_t *ptr = wholeMatrix; + uint64_t *pw = (uint64_t*)pwd; - //Prepends the password - memcpy(ptrByte, pwd, pwdlen); - ptrByte += pwdlen; + memcpy( ptr, pw, 2*pwdlen ); // password + ptr += pwdlen>>2; + memcpy( ptr, pw, 2*pwdlen ); // password lane 1 + ptr += pwdlen>>2; - //Concatenates the salt - memcpy(ptrByte, salt, saltlen); - ptrByte += saltlen; + // now build the rest interleaving on the fly. - memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - - (saltlen + pwdlen) ); + ptr[0] = ptr[ 4] = kLen; + ptr[1] = ptr[ 5] = pwdlen; + ptr[2] = ptr[ 6] = pwdlen; // saltlen + ptr[3] = ptr[ 7] = timeCost; + ptr[8] = ptr[12] = nRows; + ptr[9] = ptr[13] = nCols; + ptr[10] = ptr[14] = 0x80; + ptr[11] = ptr[15] = 0x0100000000000000; - //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = pwdlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = saltlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = timeCost; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nRows; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nCols; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - - //Now comes the padding - *ptrByte = 0x80; //first byte of padding: right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix - ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block - *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - -// from here on it's all simd acces to state and matrix -// define vector pointers and adjust sizes and pointer offsets - - //================= Initializing the Sponge State ====================// - //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - -// initState( state ); - - //========================= Setup Phase =============================// - //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits - ptrWord = wholeMatrix; - absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN ); -/* - for (i = 0; i < nBlocksInput; i++) - { - absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) - } -*/ + absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN ); //Initializes M[0] and M[1] - reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here + 
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); - reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], - nCols); + reducedDuplexRow1_2way( state, &wholeMatrix[0], + &wholeMatrix[ 2 * ROW_LEN_INT64 ], nCols ); do { - //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); + reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64], + &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64], + &wholeMatrix[ 2* row*ROW_LEN_INT64], + nCols ); - //updates the value of row* (deterministically picked during Setup)) - rowa = (rowa + step) & (window - 1); - //update prev: it now points to the last row ever computed + rowa0 = (rowa0 + step) & (window - 1); - prev = row; - //updates row: goes to the next row to be computed - row++; + prev = row; + row++; - //Checks if all rows in the window where visited. - if (rowa == 0) - { - step = window + gap; //changes the step: approximately doubles its value - window *= 2; //doubles the size of the re-visitation window - gap = -gap; //inverts the modifier to the step - } - - } while (row < nRows); + if ( rowa0 == 0 ) + { + step = window + gap; + window *= 2; + gap = -gap; + } + } while ( row < nRows ); //===================== Wandering Phase =============================// - row = 0; //Resets the visitation to the first row of the memory matrix - for (tau = 1; tau <= timeCost; tau++) + row = 0; + for ( tau = 1; tau <= timeCost; tau++ ) { - //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 - step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; - do - { - //Selects a pseudorandom index row* - //----------------------------------------------- - rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + step = ( (tau & 1) == 0 ) ? 
-1 : ( nRows >> 1 ) - 1;
+ do
+ {
+ rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
+ rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
- //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
- //-------------------------------------------
+ reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+ &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+ &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+ &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
+ nCols );
+ prev = row;
- //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
- reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
- &wholeMatrix[rowa*ROW_LEN_INT64],
- &wholeMatrix[row*ROW_LEN_INT64], nCols );
- //update prev: it now points to the last row ever computed
- prev = row;
+ row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
- //updates row: goes to the next row to be computed
- //----------------------------------------------------
- row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
- //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
- //----------------------------------------------------
-
- } while (row != 0);
+ } while (row != 0);
 }
 //===================== Wrap-up Phase ===============================//
 //Absorbs the last block of the memory matrix
- absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+ absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
+ &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
 //Squeezes the key
- squeeze(state, K, (unsigned int) kLen);
+ squeeze_2way( state, K, (unsigned int) kLen );
 return 0;
 }
+// This version is currently only used by REv3 and has some hard coding
+// specific to v3 such as input data size of 32 bytes.
+//
+// Similarly with REv2. The difference with REv3 isn't clear and maybe
+// they can be merged.
+//
+// RE is used by lyra2re and allium. The main difference between RE and REv2
+// is the matrix size.
+//
+// Z also needs to support 80 byte input as well as 32 byte, and odd
+// matrix sizes like 330 rows. It is used by lyra2z330, lyra2z, lyra2h.
+
+
 /////////////////////////////////////////////////
 // 2 way 256
@@ -223,22 +197,29 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
 // Data is interleaved 2x256.
 int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
- const void *pwd, const uint64_t pwdlen, const void *salt,
- const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
- const uint64_t nCols )
+ const void *pwd, uint64_t pwdlen, uint64_t timeCost,
+ uint64_t nRows, uint64_t nCols )
+
+// hard coded for 32 byte input as well as matrix size.
+// Other required versions include 80 byte input and different block
+// sizes
+
+//int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
+// const void *pwd, const uint64_t pwdlen, const void *salt,
+// const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
+// const uint64_t nCols )
 {
 //====================== Basic variables ============================//
- uint64_t _ALIGN(256) state[16];
- int64_t row = 2; //index of row to be processed
- int64_t prev = 1; //index of prev (last row ever computed/modified)
- int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
- int64_t tau; //Time Loop iterator
- int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
- int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
- int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-// int64_t i; //auxiliary iteration counter
- int64_t v64; // 64bit var for memcpy
- uint64_t instance0 = 0; // Seperate instance for each lane
+ uint64_t _ALIGN(256) state[32];
+ int64_t row = 2;
+ int64_t prev = 1;
+ int64_t rowa0 = 0;
+ int64_t rowa1 = 0;
+ int64_t tau;
+ int64_t step = 1;
+ int64_t window = 2;
+ int64_t gap = 1;
+ uint64_t instance0 = 0;
 uint64_t instance1 = 0;
 //====================================================================/
@@ -248,7 +229,9 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
 uint64_t *ptrWord = wholeMatrix;
 // 2 way 256 rewrite. Salt always == password, and data is interleaved,
-// need to build in parallel:
+// need to build in parallel as pw is already interleaved.
+
+
 // { password, (64 or 80 bytes)
 // salt, (64 or 80 bytes) = same as password
 // Klen, (u64) = 32 bytes
 // pwdlen, (u64)
 // saltlen, (u64)
 // timeCost, (u64)
 // nRows, (u64)
 // nCols, (u64)
 // 0x80, (byte)
 // { 0 .. 0 },
 // 1 (byte)
 // }
-// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+// input is usually 32 maybe 64, both are aligned to 256 bit vector.
+// 80 byte input is not aligned complicating matters for lyra2z.
- int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+ int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
 / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+ uint64_t *ptr = wholeMatrix;
+ uint64_t *pw = (uint64_t*)pwd;
- byte *ptrByte = (byte*) wholeMatrix;
+ memcpy( ptr, pw, 2*pwdlen ); // password
+ ptr += pwdlen>>2;
+ memcpy( ptr, pw, 2*pwdlen ); // password lane 1
+ ptr += pwdlen>>2;
+
+ // now build the rest interleaving on the fly.
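+ // Resulting prefix block layout, one copy of the basil per lane,
+ // interleaved in groups of 4 u64 (lane 0 then lane 1):
+ //   ptr[ 0.. 3] lane 0: kLen, pwdlen, saltlen (== pwdlen), timeCost
+ //   ptr[ 4.. 7] lane 1: the same four words
+ //   ptr[ 8..11] lane 0: nRows, nCols, 0x80 (first pad byte), 0x01 in the
+ //               top byte (last pad byte)
+ //   ptr[12..15] lane 1: the same four words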
- //Prepends the password - memcpy(ptrByte, pwd, pwdlen); - ptrByte += pwdlen; - - //Concatenates the salt - memcpy(ptrByte, salt, saltlen); - ptrByte += saltlen; - - memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - - (saltlen + pwdlen) ); - - //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = pwdlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = saltlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = timeCost; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nRows; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nCols; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - - //Now comes the padding - *ptrByte = 0x80; //first byte of padding: right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix - ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block - *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - -// from here on it's all simd acces to state and matrix -// define vector pointers and adjust sizes and pointer offsets + ptr[0] = ptr[ 4] = kLen; + ptr[1] = ptr[ 5] = pwdlen; + ptr[2] = ptr[ 6] = pwdlen; // saltlen + ptr[3] = ptr[ 7] = timeCost; + ptr[8] = ptr[12] = nRows; + ptr[9] = ptr[13] = nCols; + ptr[10] = ptr[14] = 0x80; + ptr[11] = ptr[15] = 0x0100000000000000; ptrWord = wholeMatrix; - absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN ); - reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); + absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN ); - reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], - nCols); + reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); + + reducedDuplexRow1_2way( state, &wholeMatrix[0], + &wholeMatrix[2*ROW_LEN_INT64], nCols ); do { - reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); + reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ], + &wholeMatrix[ 2* row*ROW_LEN_INT64 ], + nCols ); - rowa = (rowa + step) & (window - 1); + rowa0 = (rowa0 + step) & (window - 1); prev = row; row++; - if (rowa == 0) + if (rowa0 == 0) { step = window + gap; //changes the step: approximately doubles its value window *= 2; //doubles the size of the re-visitation window @@ -340,37 +304,22 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen, row = 0; for (tau = 1; tau <= timeCost; tau++) { - step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1; + step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1; do { - // This part is not parallel, rowa will be different for each lane. - // state (u64[16]) is interleaved 2x256, need to extract seperately. 
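+ // The lanes diverge here: instance and rowa are selected independently
+ // for each lane from the 2x256 interleaved state. offset_to_index maps
+ // a lane local word offset to its interleaved position, eg lane 0 word 5
+ // sits at state[9] ( (5/4)*8 + 5%4 ); lane 1 indexes from state+4.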
+ instance0 = state[ offset_to_index( instance0 ) ]; + instance1 = (&state[4])[ offset_to_index( instance1 ) ]; - // index = 2 * instance / 4 * 4 + instance % 4 - uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 ) - + ( instance0 & 0x3 ) - uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 ) - + ( instance1 & 0x3 ) + rowa0 = state[ offset_to_index( instance0 ) ] + & (unsigned int)(nRows-1); + rowa1 = (state+4)[ offset_to_index( instance1 ) ] + & (unsigned int)(nRows-1); - instance0 = state[ index0 ] & 0xf; - instance1 = (state+4)[ index1 ] & 0xf; - - rowa0 = state[ instance0 ]; - rowa1 = (state+4)[ instance1 ]; - - reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa0*ROW_LEN_INT64], - &wholeMatrix[rowa1*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); -/* - instance = state[instance & 0xF]; - rowa = state[instance & 0xF] & (unsigned int)(nRows-1); - - reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); -*/ - // End of divergence. + reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ], + &wholeMatrix[ 2* row*ROW_LEN_INT64 ], + nCols ); prev = row; row = (row + step) & (unsigned int)(nRows-1); @@ -378,13 +327,17 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen, } while ( row != 0 ); } - absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] ); - squeeze( state, K, (unsigned int) kLen ); + absorbBlock_2way( state, &wholeMatrix[2*rowa0*ROW_LEN_INT64], + &wholeMatrix[2*rowa1*ROW_LEN_INT64] ); + + squeeze_2way( state, K, (unsigned int) kLen ); return 0; } +#endif // AVX512 +#if 0 ////////////////////////////////////////////////// int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, @@ -532,22 +485,26 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, return 0; } +#endif + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + // Lyra2RE doesn't like the new wholeMatrix implementation -int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, - const void *salt, const uint64_t saltlen, const uint64_t timeCost, - const uint64_t nRows, const uint64_t nCols ) +int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, + const uint64_t pwdlen, const uint64_t timeCost, + const uint64_t nRows, const uint64_t nCols ) { //====================== Basic variables ============================// uint64_t _ALIGN(256) state[16]; int64_t row = 2; //index of row to be processed int64_t prev = 1; //index of prev (last row ever computed/modified) - int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t rowa0 = 0; + int64_t rowa1 = 0; int64_t tau; //Time Loop iterator int64_t step = 1; //Visitation step (used during Setup and Wandering phases) int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 int64_t i; //auxiliary iteration counter - int64_t v64; // 64bit var for memcpy //====================================================================/ //=== Initializing the Memory Matrix and pointers to it =============// @@ -573,15 +530,36 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, #endif uint64_t *ptrWord = wholeMatrix; + 
uint64_t *pw = (uint64_t*)pwd; //=== Getting the password + salt + basil padded with 10*1 ==========// //OBS.:The memory matrix will temporarily hold the password: not for saving memory, //but this ensures that the password copied locally will be overwritten as soon as possible //First, we clean enough blocks for the password, salt, basil and padding - int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) ) + int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1; + uint64_t *ptr = wholeMatrix; + + memcpy( ptr, pw, 2*pwdlen ); // password + ptr += pwdlen>>2; + memcpy( ptr, pw, 2*pwdlen ); // password lane 1 + ptr += pwdlen>>2; + + // now build the rest interleaving on the fly. + + ptr[0] = ptr[ 4] = kLen; + ptr[1] = ptr[ 5] = pwdlen; + ptr[2] = ptr[ 6] = pwdlen; // saltlen + ptr[3] = ptr[ 7] = timeCost; + ptr[8] = ptr[12] = nRows; + ptr[9] = ptr[13] = nCols; + ptr[10] = ptr[14] = 0x80; + ptr[11] = ptr[15] = 0x0100000000000000; + + +/* byte *ptrByte = (byte*) wholeMatrix; //Prepends the password @@ -630,7 +608,9 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, ptrWord = wholeMatrix; - absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN ); +*/ + + absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN ); /* for (i = 0; i < nBlocksInput; i++) { @@ -639,21 +619,22 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, } */ //Initializes M[0] and M[1] - reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here + reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here - reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], - nCols); + reducedDuplexRow1_2way( state, &wholeMatrix[0], + &wholeMatrix[ 2 * ROW_LEN_INT64], nCols ); do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); + reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ], + &wholeMatrix[ 2* row*ROW_LEN_INT64 ], + nCols ); //updates the value of row* (deterministically picked during Setup)) - rowa = (rowa + step) & (window - 1); + rowa0 = (rowa0 + step) & (window - 1); //update prev: it now points to the last row ever computed prev = row; @@ -661,7 +642,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, row++; //Checks if all rows in the window where visited. - if (rowa == 0) + if (rowa0 == 0) { step = window + gap; //changes the step: approximately doubles its value window *= 2; //doubles the size of the re-visitation window @@ -674,21 +655,18 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, row = 0; //Resets the visitation to the first row of the memory matrix for (tau = 1; tau <= timeCost; tau++) { - //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 - step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; - do - { - //Selects a pseudorandom index row* - //----------------------------------------------- - rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + step = ((tau & 1) == 0) ? 
-1 : (nRows >> 1) - 1; + do + { + rowa0 = state[ 0 ] & (unsigned int)(nRows-1); + rowa1 = state[ 4 ] & (unsigned int)(nRows-1); - //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //------------------------------------------- + reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ], + &wholeMatrix[ 2* row *ROW_LEN_INT64 ], + nCols ); - //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); //update prev: it now points to the last row ever computed prev = row; @@ -703,9 +681,10 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, //===================== Wrap-up Phase ===============================// //Absorbs the last block of the memory matrix - absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]); + absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64], + &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] ); //Squeezes the key - squeeze(state, K, (unsigned int) kLen); + squeeze_2way( state, K, (unsigned int) kLen ); //================== Freeing the memory =============================// _mm_free(wholeMatrix); @@ -713,3 +692,4 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, return 0; } +#endif diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c index 8db05dc..970c612 100644 --- a/algo/lyra2/lyra2.c +++ b/algo/lyra2/lyra2.c @@ -327,7 +327,6 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols); - do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) diff --git a/algo/lyra2/lyra2.h b/algo/lyra2/lyra2.h index 3c2399e..483ca2f 100644 --- a/algo/lyra2/lyra2.h +++ b/algo/lyra2/lyra2.h @@ -60,4 +60,15 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd, int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + +int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd, + uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols ); + +int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd, + uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols ); + +#endif + #endif /* LYRA2_H_ */ diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c index 9832fb1..f2954c3 100644 --- a/algo/lyra2/lyra2rev2-4way.c +++ b/algo/lyra2/lyra2rev2-4way.c @@ -1,13 +1,150 @@ #include "lyra2-gate.h" #include - -#if defined (LYRA2REV2_4WAY) - #include "algo/blake/blake-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/skein/skein-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" + +#if defined (LYRA2REV2_8WAY) + +typedef struct { + blake256_8way_context blake; + keccak256_8way_context keccak; + cube_4way_context cube; + skein256_8way_context skein; + bmw256_8way_context bmw; +} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64))); + +static lyra2v2_8way_ctx_holder l2v2_8way_ctx; + +bool init_lyra2rev2_8way_ctx() +{ + keccak256_8way_init( &l2v2_8way_ctx.keccak ); + 
cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 ); + skein256_8way_init( &l2v2_8way_ctx.skein ); + bmw256_8way_init( &l2v2_8way_ctx.bmw ); + return true; +} + +void lyra2rev2_8way_hash( void *state, const void *input ) +{ + uint32_t vhash[8*8] __attribute__ ((aligned (128))); + uint32_t vhashA[8*8] __attribute__ ((aligned (64))); + uint32_t vhashB[8*8] __attribute__ ((aligned (64))); + uint32_t hash0[8] __attribute__ ((aligned (64))); + uint32_t hash1[8] __attribute__ ((aligned (64))); + uint32_t hash2[8] __attribute__ ((aligned (64))); + uint32_t hash3[8] __attribute__ ((aligned (64))); + uint32_t hash4[8] __attribute__ ((aligned (64))); + uint32_t hash5[8] __attribute__ ((aligned (64))); + uint32_t hash6[8] __attribute__ ((aligned (64))); + uint32_t hash7[8] __attribute__ ((aligned (64))); + lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64))); + memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) ); + + blake256_8way( &ctx.blake, input + (64<<3), 16 ); + blake256_8way_close( &ctx.blake, vhash ); + + rintrlv_8x32_8x64( vhashA, vhash, 256 ); + + keccak256_8way_update( &ctx.keccak, vhashA, 32 ); + keccak256_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 ); + + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 ); + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 ); + + intrlv_2x256( vhash, hash0, hash1, 256 ); + LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash0, hash1, vhash, 256 ); + intrlv_2x256( vhash, hash2, hash3, 256 ); + LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash2, hash3, vhash, 256 ); + intrlv_2x256( vhash, hash4, hash5, 256 ); + LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash4, hash5, vhash, 256 ); + intrlv_2x256( vhash, hash6, hash7, 256 ); + LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash6, hash7, vhash, 256 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, 256 ); + + skein256_8way_update( &ctx.skein, vhash, 32 ); + skein256_8way_close( &ctx.skein, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 ); + + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 ); + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 ); + + intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, 256 ); + + bmw256_8way_update( &ctx.bmw, vhash, 32 ); + bmw256_8way_close( &ctx.bmw, state ); +} + +int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + const uint32_t Htarg = ptarget[7]; + __m256i *noncev = (__m256i*)vdata + 19; // aligned + int thr_id = mythr->id; + + if ( opt_benchmark ) + ( 
(uint32_t*)ptarget )[7] = 0x0000ff; + + mm256_bswap32_intrlv80_8x32( vdata, pdata ); + + blake256_8way_init( &l2v2_8way_ctx.blake ); + blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 ); + + do + { + *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, + n+3, n+2, n+1, n ) ); + + lyra2rev2_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( (n < last_nonce) && !work_restart[thr_id].restart); + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined (LYRA2REV2_4WAY) typedef struct { blake256_4way_context blake; diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c index 467a1a6..6e560be 100644 --- a/algo/lyra2/lyra2rev3-4way.c +++ b/algo/lyra2/lyra2rev3-4way.c @@ -4,8 +4,180 @@ #include "algo/blake/blake-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" -#if defined (LYRA2REV3_8WAY) +#if defined (LYRA2REV3_16WAY) + +typedef struct { + blake256_16way_context blake; + cube_4way_context cube; + bmw256_16way_context bmw; +} lyra2v3_16way_ctx_holder; + +static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx; + +bool init_lyra2rev3_16way_ctx() +{ + blake256_16way_init( &l2v3_16way_ctx.blake ); + cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 ); + bmw256_16way_init( &l2v3_16way_ctx.bmw ); + return true; +} + +void lyra2rev3_16way_hash( void *state, const void *input ) +{ + uint32_t vhash[16*8] __attribute__ ((aligned (128))); + uint32_t hash0[8] __attribute__ ((aligned (64))); + uint32_t hash1[8] __attribute__ ((aligned (64))); + uint32_t hash2[8] __attribute__ ((aligned (64))); + uint32_t hash3[8] __attribute__ ((aligned (64))); + uint32_t hash4[8] __attribute__ ((aligned (64))); + uint32_t hash5[8] __attribute__ ((aligned (64))); + uint32_t hash6[8] __attribute__ ((aligned (64))); + uint32_t hash7[8] __attribute__ ((aligned (64))); + uint32_t hash8[8] __attribute__ ((aligned (64))); + uint32_t hash9[8] __attribute__ ((aligned (64))); + uint32_t hash10[8] __attribute__ ((aligned (64))); + uint32_t hash11[8] __attribute__ ((aligned (64))); + uint32_t hash12[8] __attribute__ ((aligned (64))); + uint32_t hash13[8] __attribute__ ((aligned (64))); + uint32_t hash14[8] __attribute__ ((aligned (64))); + uint32_t hash15[8] __attribute__ ((aligned (64))); + lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64))); + memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) ); + + blake256_16way_update( &ctx.blake, input + (64*16), 16 ); + blake256_16way_close( &ctx.blake, vhash ); + + dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15, + vhash, 256 ); + + intrlv_2x256( vhash, hash0, hash1, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash0, hash1, vhash, 256 ); + intrlv_2x256( vhash, hash2, hash3, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash2, hash3, vhash, 256 ); + intrlv_2x256( vhash, hash4, hash5, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash4, hash5, vhash, 256 ); + intrlv_2x256( vhash, hash6, hash7, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 
1, 4, 4 ); + dintrlv_2x256( hash6, hash7, vhash, 256 ); + intrlv_2x256( vhash, hash8, hash9, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash8, hash9, vhash, 256 ); + intrlv_2x256( vhash, hash10, hash11, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash10, hash11, vhash, 256 ); + intrlv_2x256( vhash, hash12, hash13, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash12, hash13, vhash, 256 ); + intrlv_2x256( vhash, hash14, hash15, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash14, hash15, vhash, 256 ); + + intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 32 ); + dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 ); + intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 ); + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 32 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 ); + intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 ); + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 32 ); + dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 ); + intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 ); + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 32 ); + dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 ); + + intrlv_2x256( vhash, hash0, hash1, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash0, hash1, vhash, 256 ); + intrlv_2x256( vhash, hash2, hash3, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash2, hash3, vhash, 256 ); + intrlv_2x256( vhash, hash4, hash5, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash4, hash5, vhash, 256 ); + intrlv_2x256( vhash, hash6, hash7, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash6, hash7, vhash, 256 ); + intrlv_2x256( vhash, hash8, hash9, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash8, hash9, vhash, 256 ); + intrlv_2x256( vhash, hash10, hash11, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash10, hash11, vhash, 256 ); + intrlv_2x256( vhash, hash12, hash13, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash12, hash13, vhash, 256 ); + intrlv_2x256( vhash, hash14, hash15, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash14, hash15, vhash, 256 ); + + intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14, + hash15, 256 ); + + bmw256_16way_update( &ctx.bmw, vhash, 32 ); + bmw256_16way_close( &ctx.bmw, state ); +} + + +int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[20*16] __attribute__ ((aligned (64))); + uint32_t *hash7 = &hash[7<<4]; + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const 
uint32_t last_nonce = max_nonce - 16; + const uint32_t Htarg = ptarget[7]; + __m512i *noncev = (__m512i*)vdata + 19; // aligned + const int thr_id = mythr->id; + + if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; + + mm512_bswap32_intrlv80_16x32( vdata, pdata ); + + blake256_16way_init( &l2v3_16way_ctx.blake ); + blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 ); + + do + { + *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12, + n+11, n+10, n+ 9, n+ 8, + n+ 7, n+ 6, n+ 5, n+ 4, + n+ 3, n+ 2, n+ 1, n ) ); + + lyra2rev3_16way_hash( hash, vdata ); + pdata[19] = n; + + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash7[lane] <= Htarg ) ) + { + extr_lane_16x32( lane_hash, hash, lane, 256 ); + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 16; + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined (LYRA2REV3_8WAY) typedef struct { blake256_8way_context blake; diff --git a/algo/lyra2/sponge-2way.c b/algo/lyra2/sponge-2way.c index 35c20cc..bb92082 100644 --- a/algo/lyra2/sponge-2way.c +++ b/algo/lyra2/sponge-2way.c @@ -19,7 +19,7 @@ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "algo-gate.h" +//#include "algo-gate.h" #include #include #include @@ -40,19 +40,26 @@ inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len ) //Squeezes full blocks for ( i = 0; i < fullBlocks; i++ ) { - memcpy_512( out, state, BLOCK_LEN_M256I*2 ); - LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] ); - out += BLOCK_LEN_M256I*2; + memcpy_512( out, state, BLOCK_LEN_M256I ); + LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] ); + out += BLOCK_LEN_M256I; } //Squeezes remaining bytes - memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) ); + memcpy_512( out, state, len_m256i % BLOCK_LEN_M256I ); } -inline void absorbBlock_2way( uint64_t *State, const uint64_t *In ) +inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0, + const uint64_t *In1 ) { register __m512i state0, state1, state2, state3; - __m512i *in = (__m512i*)In; - + __m512i in[3]; + casti_m256i( in, 0 ) = casti_m256i( In0, 0 ); + casti_m256i( in, 1 ) = casti_m256i( In1, 1 ); + casti_m256i( in, 2 ) = casti_m256i( In0, 2 ); + casti_m256i( in, 3 ) = casti_m256i( In1, 3 ); + casti_m256i( in, 4 ) = casti_m256i( In0, 4 ); + casti_m256i( in, 5 ) = casti_m256i( In1, 5 ); + state0 = _mm512_load_si512( (__m512i*)State ); state1 = _mm512_load_si512( (__m512i*)State + 1 ); state2 = _mm512_load_si512( (__m512i*)State + 2 ); @@ -90,7 +97,7 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In, state1 = _mm512_xor_si512( state1, in[1] ); LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 ); - In += block_len * 2; + In += block_len*2; } _mm512_store_si512( (__m512i*)State, state0 ); @@ -109,7 +116,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut, register __m512i state0, state1, state2, state3; - __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 ); + __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I ); state0 = _mm512_load_si512( (__m512i*)State ); state1 = _mm512_load_si512( (__m512i*)State + 1 ); @@ -126,13 +133,13 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut, { _mm_prefetch( out - 9, _MM_HINT_T0 ); _mm_prefetch( out - 11, 
_MM_HINT_T0 ); - + out[0] = state0; out[1] = state1; out[2] = state2; //Goes to next block (column) that will receive the squeezed data - out -= BLOCK_LEN_M256I * 2; + out -= BLOCK_LEN_M256I; LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 ); } @@ -143,15 +150,14 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut, _mm512_store_si512( (__m512i*)State + 3, state3 ); } -// This function has to deal with gathering 2 256 bit rowin vectors from -// non-contiguous memory. Extra work and performance penalty. inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols ) { int i; register __m512i state0, state1, state2, state3; - __m512i *in = (__m256i*)rowIn; + __m512i *in = (__m512i*)rowIn; + __m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I ); state0 = _mm512_load_si512( (__m512i*)State ); state1 = _mm512_load_si512( (__m512i*)State + 1 ); @@ -171,28 +177,25 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn, out[2] = _mm512_xor_si512( state2, in[2] ); //Input: next column (i.e., next block in sequence) - in0 += BLOCK_LEN_M256I; - in1 += BLOCK_LEN_M256I; + in += BLOCK_LEN_M256I; //Output: goes to previous column - out -= BLOCK_LEN_M256I * 2; + out -= BLOCK_LEN_M256I; } - _mm512_store_si256( (__m512i*)State, state0 ); - _mm512_store_si256( (__m512i*)State + 1, state1 ); - _mm512_store_si256( (__m512i*)State + 2, state2 ); - _mm512_store_si256( (__m512i*)State + 3, state3 ); - } + _mm512_store_si512( (__m512i*)State, state0 ); + _mm512_store_si512( (__m512i*)State + 1, state1 ); + _mm512_store_si512( (__m512i*)State + 2, state2 ); + _mm512_store_si512( (__m512i*)State + 3, state3 ); } inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols ) { int i; - register __m512i state0, state1, state2, state3; __m512i* in = (__m512i*)rowIn; __m512i* inout = (__m512i*)rowInOut; - __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 ); + __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I ); __m512i t0, t1, t2; state0 = _mm512_load_si512( (__m512i*)State ); @@ -209,7 +212,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn, state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], inout[2] ) ); - LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 ); + LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 ); out[0] = _mm512_xor_si512( state0, in[0] ); out[1] = _mm512_xor_si512( state1, in[1] ); @@ -221,17 +224,18 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn, t2 = _mm512_permutex_epi64( state2, 0x93 ); inout[0] = _mm512_xor_si512( inout[0], - _mm512_mask_blend_epi32( t0, t2, 0x03 ) ); + _mm512_mask_blend_epi32( 0x0303, t0, t2 ) ); inout[1] = _mm512_xor_si512( inout[1], - _mm512_mask_blend_epi32( t1, t0, 0x03 ) ); + _mm512_mask_blend_epi32( 0x0303, t1, t0 ) ); inout[2] = _mm512_xor_si512( inout[2], - _mm512_mask_blend_epi32( t2, t1, 0x03 ) ); + _mm512_mask_blend_epi32( 0x0303, t2, t1 ) ); + //Inputs: next column (i.e., next block in sequence) - in += BLOCK_LEN_M256I * 2; - inout += BLOCK_LEN_M256I * 2; + in += BLOCK_LEN_M256I; + inout += BLOCK_LEN_M256I; //Output: goes to previous column - out -= BLOCK_LEN_M256I * 2; + out -= BLOCK_LEN_M256I; } _mm512_store_si512( (__m512i*)State, state0 ); @@ -240,49 +244,61 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn, _mm512_store_si512( (__m512i*)State + 3, state3 ); } -inline 
void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1, - uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, - uint64_t nCols ) +// big ugly workaound for pointer aliasing, use a union of pointers. +// Access matrix using m512i for in and out, m256i for inout + +inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn, + uint64_t *rowInOut0, uint64_t *rowInOut1, + uint64_t *rowOut, uint64_t nCols) { int i; - register __m512i state0, state1, state2, state3; - __m256i *in0 = (__m256i*)rowIn0; - __m256i *in0 = (__m256i*)rowIn0; - __m2512* in = (__m512i*)rowIn; - __m2512* inout = (__m512i*)rowInOut; - __m512i* out = (__m512i*)rowOut; - __m512i t0, t1, t2; + __m512i *in = (__m512i*)rowIn; + __m256i *inout0 = (__m256i*)rowInOut0; + __m256i *inout1 = (__m256i*)rowInOut1; + __m512i *out = (__m512i*)rowOut; + __m512i io[3]; + povly inout; + inout.v512 = &io[0]; + __m512i t0, t1, t2; - _mm_prefetch( in0, _MM_HINT_T0 ); - _mm_prefetch( in1, _MM_HINT_T0 ); - _mm_prefetch( in0 + 2, _MM_HINT_T0 ); - _mm_prefetch( in1 + 2, _MM_HINT_T0 ); - _mm_prefetch( in0 + 4, _MM_HINT_T0 ); - _mm_prefetch( in1 + 4, _MM_HINT_T0 ); - _mm_prefetch( in0 + 6, _MM_HINT_T0 ); - _mm_prefetch( in1 + 6, _MM_HINT_T0 ); - state0 = _mm512_load_si512( (__m512i*)State ); state1 = _mm512_load_si512( (__m512i*)State + 1 ); state2 = _mm512_load_si512( (__m512i*)State + 2 ); state3 = _mm512_load_si512( (__m512i*)State + 3 ); + + _mm_prefetch( in, _MM_HINT_T0 ); + _mm_prefetch( inout0, _MM_HINT_T0 ); + _mm_prefetch( inout1, _MM_HINT_T0 ); + _mm_prefetch( in + 2, _MM_HINT_T0 ); + _mm_prefetch( inout0 + 2, _MM_HINT_T0 ); + _mm_prefetch( inout1 + 2, _MM_HINT_T0 ); + _mm_prefetch( in + 4, _MM_HINT_T0 ); + _mm_prefetch( inout0 + 4, _MM_HINT_T0 ); + _mm_prefetch( inout1 + 4, _MM_HINT_T0 ); + _mm_prefetch( in + 6, _MM_HINT_T0 ); + _mm_prefetch( inout0 + 6, _MM_HINT_T0 ); + _mm_prefetch( inout1 + 6, _MM_HINT_T0 ); + + + for ( i = 0; i < nCols; i++ ) + { //Absorbing "M[prev] [+] M[row*]" + inout.v256[0] = inout0[0]; + inout.v256[1] = inout1[1]; + inout.v256[2] = inout0[2]; + inout.v256[3] = inout1[3]; + inout.v256[4] = inout0[4]; + inout.v256[5] = inout1[5]; -// state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] ); -// state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] ); -// state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] ); - t0 = mm512_concat_256( in1[0], in0[0] ); - t1 = mm512_concat_256( in1[1], in0[1] ); - t2 = mm512_concat_256( in1[2], in0[2] ); - state0 = _mm512_xor_si512( state0, - _mm512_add_epi64( t0, inout[0] ) ); + _mm512_add_epi64( in[0], inout.v512[0] ) ); state1 = _mm512_xor_si512( state1, - _mm512_add_epi64( t1, inout[1] ) ); + _mm512_add_epi64( in[1], inout.v512[1] ) ); state2 = _mm512_xor_si512( state2, - _mm512_add_epi64( t2, inout[2] ) ); + _mm512_add_epi64( in[2], inout.v512[2] ) ); + //Applies the reduced-round transformation f to the sponge's state LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 ); @@ -292,22 +308,44 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1, out[1] = _mm512_xor_si512( out[1], state1 ); out[2] = _mm512_xor_si512( out[2], state2 ); + // if inout is the same row as out it was just overwritten, reload. 
+ if ( rowOut == rowInOut0 ) + { + inout.v256[0] = inout0[0]; + inout.v256[2] = inout0[2]; + inout.v256[4] = inout0[4]; + } + if ( rowOut == rowInOut1 ) + { + inout.v256[1] = inout1[1]; + inout.v256[3] = inout1[3]; + inout.v256[5] = inout1[5]; + } + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) t0 = _mm512_permutex_epi64( state0, 0x93 ); t1 = _mm512_permutex_epi64( state1, 0x93 ); t2 = _mm512_permutex_epi64( state2, 0x93 ); - inout[0] = _mm512_xor_si512( inout[0], - _mm512_mask_blend_epi32( t0, t2, 0x03 ) ); - inout[1] = _mm512_xor_si512( inout[1], - _mm512_mask_blend_epi32( t1, t0, 0x03 ) ); - inout[2] = _mm512_xor_si512( inout[2], - _mm512_mask_blend_epi32( t2, t1, 0x03 ) ); + inout.v512[0] = _mm512_xor_si512( inout.v512[0], + _mm512_mask_blend_epi32( 0x0303, t0, t2 ) ); + inout.v512[1] = _mm512_xor_si512( inout.v512[1], + _mm512_mask_blend_epi32( 0x0303, t1, t0 ) ); + inout.v512[2] = _mm512_xor_si512( inout.v512[2], + _mm512_mask_blend_epi32( 0x0303, t2, t1 ) ); + + inout0[0] = inout.v256[0]; + inout1[1] = inout.v256[1]; + inout0[2] = inout.v256[2]; + inout1[3] = inout.v256[3]; + inout0[4] = inout.v256[4]; + inout1[5] = inout.v256[5]; //Goes to next block - in += BLOCK_LEN_M256I * 2; - out += BLOCK_LEN_M256I * 2; - inout += BLOCK_LEN_M256I * 2; + in += BLOCK_LEN_M256I; + inout0 += BLOCK_LEN_M256I * 2; + inout1 += BLOCK_LEN_M256I * 2; + out += BLOCK_LEN_M256I; } _mm512_store_si512( (__m512i*)State, state0 ); diff --git a/algo/lyra2/sponge.c b/algo/lyra2/sponge.c index 5a8e71b..9f400b5 100644 --- a/algo/lyra2/sponge.c +++ b/algo/lyra2/sponge.c @@ -375,7 +375,10 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut, { _mm_prefetch( out - 9, _MM_HINT_T0 ); _mm_prefetch( out - 11, _MM_HINT_T0 ); - + +//printf("S RSR0 col= %d, out= %x\n",i,out); + + out[0] = state0; out[1] = state1; out[2] = state2; @@ -706,11 +709,34 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn, out[1] = _mm256_xor_si256( state1, in[1] ); out[2] = _mm256_xor_si256( state2, in[2] ); +/* +printf("s duplexsetup col= %d\n",i); +uint64_t * o = (uint64_t*)out; +printf("S out %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]); +printf("S out %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]); +printf("S out %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]); +printf("S out %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]); +printf("S out %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]); +printf("S out %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]); +*/ + //M[row*][col] = M[row*][col] XOR rotW(rand) t0 = _mm256_permute4x64_epi64( state0, 0x93 ); t1 = _mm256_permute4x64_epi64( state1, 0x93 ); t2 = _mm256_permute4x64_epi64( state2, 0x93 ); +/* +uint64_t *t = (uint64_t*)&t0; +printf("S t0 %016lx %016lx %016lx %016lx\n",t[0],t[1],t[2],t[3]); + +o = (uint64_t*)inout; +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]); +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]); +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]); +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]); +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]); +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]); +*/ inout[0] = _mm256_xor_si256( inout[0], _mm256_blend_epi32( t0, t2, 0x03 ) ); inout[1] = _mm256_xor_si256( inout[1], @@ -718,7 +744,17 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn, inout[2] = _mm256_xor_si256( inout[2], _mm256_blend_epi32( 
t2, t1, 0x03 ) ); - //Inputs: next column (i.e., next block in sequence) +/* +o = (uint64_t*)inout; +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]); +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]); +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]); +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]); +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]); +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]); +*/ + +//Inputs: next column (i.e., next block in sequence) in += BLOCK_LEN_M256I; inout += BLOCK_LEN_M256I; //Output: goes to previous column @@ -949,6 +985,22 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn, _mm_prefetch( inout + 9, _MM_HINT_T0 ); _mm_prefetch( inout + 11, _MM_HINT_T0 ); +/* +uint64_t *io = (uint64_t*)inout; +uint64_t *ii = (uint64_t*)in; + +printf("RDRS1 col= %d\n", i); +printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]); +printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]); +printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]); +printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]); +printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]); +printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]); +printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]); +printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]); +*/ + + //Absorbing "M[prev] [+] M[row*]" state0 = _mm256_xor_si256( state0, _mm256_add_epi64( in[0], inout[0] ) ); diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 80f1d4f..185181b 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -65,14 +65,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 ); #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ - G_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_ror_1x64( s1); \ - s2 = mm512_swap128_256( s2 ); \ - s3 = mm512_rol1x64_256( s3 ); \ - G_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_rol1x64_256( s1 ); \ - s2 = mm512_swap128_256( s2 ); \ - s3 = mm512_ror1x64_256( s3 ); + G2W_4X64( s0, s1, s2, s3 ); \ + s1 = mm512_ror256_64( s1); \ + s2 = mm512_swap256_128( s2 ); \ + s3 = mm512_rol256_64( s3 ); \ + G2W_4X64( s0, s1, s2, s3 ); \ + s1 = mm512_rol256_64( s1 ); \ + s2 = mm512_swap256_128( s2 ); \ + s3 = mm512_ror256_64( s3 ); #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \ LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ @@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_ror1x64_256( s2, s3 ); \ - mm128_swap128_256( s4, s5 ); \ - mm128_rol1x64_256( s6, s7 ); \ + mm128_ror256_64( s2, s3 ); \ + mm128_swap256_128( s4, s5 ); \ + mm128_rol256_64( s6, s7 ); \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_rol1x64_256( s2, s3 ); \ - mm128_swap128_256( s4, s5 ); \ - mm128_ror1x64_256( s6, s7 ); + mm128_rol256_64( s2, s3 ); \ + mm128_swap256_128( s4, s5 ); \ + mm128_ror256_64( s6, s7 ); #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ @@ -203,24 +203,36 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +union 
_povly +{ + __m512i *v512; + __m256i *v256; + uint64_t *u64; +}; +typedef union _povly povly; + //---- Housekeeping -void initState_2way( uint64_t state[/*16*/] ); +void initState_2way( uint64_t State[/*16*/] ); //---- Squeezes -void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len ); +void squeeze_2way( uint64_t *State, unsigned char *out, unsigned int len ); void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols ); //---- Absorbs -void absorbBlock_2way( uint64_t *state, const uint64_t *in ); -void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in, +void absorbBlock_2way( uint64_t *State, const uint64_t *In0, + const uint64_t *In1 ); +void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In, const uint64_t nBlocks, const uint64_t block_len ); //---- Duplexes -void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn, +void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols); -void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn, +void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols ); -void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); + +void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn, + uint64_t *rowInOut0, uint64_t *rowInOut1, + uint64_t *rowOut, uint64_t nCols); #endif diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c index 101a5c2..9f22d29 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -1,7 +1,4 @@ #include "hmq1725-gate.h" - -#if defined(HMQ1725_4WAY) - #include #include #include "algo/blake/blake-hash-4way.h" @@ -11,6 +8,8 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #include "algo/shavite/sph_shavite.h" @@ -23,6 +22,772 @@ #include "algo/haval/haval-hash-4way.h" #include "algo/sha/sha-hash-4way.h" +#if defined(HMQ1725_8WAY) + +union _hmq1725_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + haval256_5_8way_context haval; +} __attribute__ ((aligned (64))); + +typedef union _hmq1725_8way_context_overlay hmq1725_8way_context_overlay; + +extern void hmq1725_8way_hash(void *state, const void *input) +{ + uint32_t vhash [16<<3] __attribute__ ((aligned (128))); + uint32_t vhashA[16<<3] __attribute__ ((aligned (64))); + uint32_t vhashB[16<<3] __attribute__ ((aligned (64))); + uint32_t hash0 [16] __attribute__ ((aligned (64))); + uint32_t hash1 [16] __attribute__ ((aligned (64))); + uint32_t hash2 [16] __attribute__ ((aligned (64))); + uint32_t hash3 [16] __attribute__ ((aligned (64))); + uint32_t hash4 [16] __attribute__ ((aligned (64))); + uint32_t hash5 [16] __attribute__ ((aligned (64))); + uint32_t hash6 [16] __attribute__ ((aligned (64))); + uint32_t hash7 [16] __attribute__ ((aligned (64))); + 
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64))); + __mmask8 vh_mask; + const __m512i vmask = m512_const1_64( 24 ); + const uint32_t mask = 24; + __m512i* vh = (__m512i*)vhash; + __m512i* vhA = (__m512i*)vhashA; + __m512i* vhB = (__m512i*)vhashB; + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, input, 80 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + + // A + if ( hash0[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (char*)hash0, 512 ); + } + if ( hash1[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (char*)hash1, 512 ); + } + if ( hash2[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (char*)hash2, 512 ); + } + if ( hash3[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (char*)hash3, 512 ); + } + if ( hash4[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (char*)hash4, 512 ); + } + if ( hash5[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (char*)hash5, 512 ); + } + if ( hash6[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (char*)hash6, 512 ); + } + if ( hash7[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (char*)hash7, 512 ); + } + + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + // B + if ( likely( vh_mask & 0xff ) ) + { + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhashB ); + } + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask 
), + m512_zero ); + + if ( likely( ( vh_mask & 0xff ) != 0xff ) ) + { + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, vhash, 64 ); + blake512_8way_close( &ctx.blake, vhashA ); + } + + if ( likely( vh_mask & 0xff ) ) + { + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhashB ); + } + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + + if ( likely( ( vh_mask & 0xff ) != 0xff ) ) + { + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhashA ); + } + + if ( likely( vh_mask & 0xff ) ) + { + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhashB ); + } + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + // 4x32 for haval + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + // A + if ( hash0[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + } + if ( hash1[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + } + 
if ( hash2[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + } + if ( hash3[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + if ( hash4[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + } + if ( hash5[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + } + if ( hash6[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + } + if ( hash7[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + // B + if ( likely( vh_mask & 0xff ) ) + { + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhash, 64 ); + haval256_5_8way_close( &ctx.haval, vhash ); + memset( &vhash[8<<3], 0, 32<<3 ); + rintrlv_8x32_8x64( vhashB, vhash, 512 ); + } + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *)hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *)hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *)hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, vhash, 64 ); + blake512_8way_close( &ctx.blake, vhash ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + // A + if ( hash0[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); // + sph_shavite512_close( &ctx.shavite, hash0 ); //8 + } + if ( hash1[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); // + sph_shavite512_close( &ctx.shavite, hash1 ); //8 + } + if ( hash2[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); // + sph_shavite512_close( &ctx.shavite, hash2 ); //8 + } + if ( hash3[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); // 
+ sph_shavite512_close( &ctx.shavite, hash3 ); //8 + } + if ( hash4[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); // + sph_shavite512_close( &ctx.shavite, hash4 ); //8 + } + if ( hash5[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); // + sph_shavite512_close( &ctx.shavite, hash5 ); //8 + } + if ( hash6[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); // + sph_shavite512_close( &ctx.shavite, hash6 ); //8 + } + if ( hash7[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); // + sph_shavite512_close( &ctx.shavite, hash7 ); //8 + } + + // B + if ( likely( vh_mask & 0xff ) ) + { + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhashB, 64 ); + rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); + } + + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( hash0[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + } + if ( hash1[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + } + if ( hash2[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + } + if ( hash3[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + } + if ( hash4[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *)hash4, 512 ); + } + if ( hash5[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *)hash5, 512 ); + } + if ( hash6[0] & 
mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *)hash6, 512 ); + } + if ( hash7[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *)hash7, 512 ); + } + + // B + if ( likely( vh_mask & 0xff ) ) + { + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhashB, 512 ); + rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); + } + + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + rintrlv_8x64_8x32( vhashA, vhash, 512 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhashA, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + // A + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + + if ( hash0[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + } + if ( hash1[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + } + if ( hash2[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + } + if ( hash3[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + } + if ( hash4[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + } + if ( hash5[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + } + if ( hash6[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + } + if ( hash7[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + } + + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, 
hash6, + hash7 ); + + // B + if ( likely( vh_mask & 0xff ) ) + { + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhashB ); + } + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // A + if ( likely( ( vh_mask & 0xff ) != 0xff ) ) + { + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhash, 64 ); + haval256_5_8way_close( &ctx.haval, vhash ); + memset( &vhash[8<<3], 0, 32<<3 ); + rintrlv_8x32_8x64( vhashA, vhash, 512 ); + } + + // B + if ( !( hash0[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + } + if ( !( hash1[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + } + if ( !( hash2[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + } + if ( !( hash3[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + if ( !( hash4[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + } + if ( !( hash5[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + } + if ( !( hash6[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + } + if ( !( hash7[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + + intrlv_8x64_512( vhashB, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + + bmw512_8way_init( &ctx.bmw ); + 
bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, state ); +} + +int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[49]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 4; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + hmq1725_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane<<1 ] <= Htarg ) + { + extr_lane_8x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(HMQ1725_4WAY) + union _hmq1725_4way_context_overlay { blake512_4way_context blake; @@ -34,7 +799,8 @@ union _hmq1725_4way_context_overlay hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + hashState_sd sd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -42,19 +808,19 @@ union _hmq1725_4way_context_overlay sph_whirlpool_context whirlpool; sha512_4way_context sha512; haval256_5_4way_context haval; -}; +} __attribute__ ((aligned (64))); + typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay; extern void hmq1725_4way_hash(void *state, const void *input) { -// why so big? only really need 16. 
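/* The per-lane buffers are now sized to 16 uint32_t (64 bytes), i.e. one
   512-bit hash per lane, and the interleaved vhash buffers to four lanes'
   worth (16<<2 words), replacing the oversized 32-word buffers questioned
   in the comment above. */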
- uint32_t hash0 [32] __attribute__ ((aligned (64))); - uint32_t hash1 [32] __attribute__ ((aligned (64))); - uint32_t hash2 [32] __attribute__ ((aligned (64))); - uint32_t hash3 [32] __attribute__ ((aligned (64))); - uint32_t vhash [32<<2] __attribute__ ((aligned (64))); - uint32_t vhashA[32<<2] __attribute__ ((aligned (64))); - uint32_t vhashB[32<<2] __attribute__ ((aligned (64))); + uint32_t hash0 [16] __attribute__ ((aligned (64))); + uint32_t hash1 [16] __attribute__ ((aligned (64))); + uint32_t hash2 [16] __attribute__ ((aligned (64))); + uint32_t hash3 [16] __attribute__ ((aligned (64))); + uint32_t vhash [16<<2] __attribute__ ((aligned (64))); + uint32_t vhashA[16<<2] __attribute__ ((aligned (64))); + uint32_t vhashB[16<<2] __attribute__ ((aligned (64))); hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64))); __m256i vh_mask; const __m256i vmask = m256_const1_64( 24 ); @@ -91,41 +857,41 @@ extern void hmq1725_4way_hash(void *state, const void *input) // A -// if ( hash0[0] & mask ) -// { + if ( hash0[0] & mask ) + { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); -// } -// if ( hash1[0] & mask ) -// { + } + if ( hash1[0] & mask ) + { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); -// } -// if ( hash2[0] & mask ) -// { + } + if ( hash2[0] & mask ) + { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); -// } -// if ( hash3[0] & mask ) -// { + } + if ( hash3[0] & mask ) + { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); -// } + } intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); // B -// if ( mm256_any_clr_256( vh_mask ) ) -// { + if ( mm256_anybits1( vh_mask ) ) + { skein512_4way_init( &ctx.skein ); skein512_4way( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhashB ); -// } + } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -142,19 +908,19 @@ extern void hmq1725_4way_hash(void *state, const void *input) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); -// if ( mm256_any_set_256( vh_mask ) ) -// { + if ( mm256_anybits0( vh_mask ) ) + { blake512_4way_init( &ctx.blake ); blake512_4way( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhashA ); -// } + } -// if ( mm256_any_clr_256( vh_mask ) ) -// { + if ( mm256_anybits1( vh_mask ) ) + { bmw512_4way_init( &ctx.bmw ); bmw512_4way( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); -// } + } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -193,19 +959,19 @@ extern void hmq1725_4way_hash(void *state, const void *input) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); -// if ( mm256_any_set_256( vh_mask ) ) -// { + if ( mm256_anybits0( vh_mask ) ) + { keccak512_4way_init( &ctx.keccak ); keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); -// } + } -// if ( mm256_any_clr_256( vh_mask ) ) -// { + if ( mm256_anybits1( vh_mask ) ) + { jh512_4way_init( &ctx.jh ); jh512_4way( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); -// } + } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -224,64 +990,63 @@ extern void hmq1725_4way_hash(void *state, const void *input) sph_shavite512 ( &ctx.shavite, hash3, 64 ); sph_shavite512_close( &ctx.shavite, hash3 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence 
*)hash0, 512 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); -// A is whirlpool serial, B is haval parallel. - + intrlv_2x128_512( vhashA, hash0, hash1 ); + intrlv_2x128_512( vhashB, hash2, hash3 ); - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + + // 4x32 for haval + intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); + // A -// if ( hash0[0] & mask ) -// { + if ( hash0[0] & mask ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash0, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); -// } -// if ( hash1[0] & mask ) -// { + } + if ( hash1[0] & mask ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash1, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash1 ); -// } -// if ( hash2[0] & mask ) -// { + } + if ( hash2[0] & mask ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash2, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash2 ); -// } -// if ( hash3[0] & mask ) -// { + } + if ( hash3[0] & mask ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash3, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); -// } + } intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); // B - -// if ( mm256_any_clr_256( vh_mask ) ) -// { + if ( mm256_anybits1( vh_mask ) ) + { haval256_5_4way_init( &ctx.haval ); haval256_5_4way( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhashB ); - memset( &vhashB[8<<2], 0, 32<<2); -// } + haval256_5_4way_close( &ctx.haval, vhash ); + memset( &vhash[8<<2], 0, 32<<2 ); + rintrlv_4x32_4x64( vhashB, vhash, 512 ); + } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -383,19 +1148,24 @@ extern void hmq1725_4way_hash(void *state, const void *input) sph_fugue512( &ctx.fugue, hash3, 64 ); sph_fugue512_close( &ctx.fugue, hash3 ); + // In this situation serial simd seems to be faster. 
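/* Each lane below picks either echo or SIMD from its own hash bits, so the
   lanes cannot be paired up for the 2-way SIMD context here; the scalar
   hashState_sd path (ctx.sd) is kept for the SIMD side of this branch. */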
-// A echo, B sd both serial + intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), + m256_zero ); + if ( hash0[0] & mask ) //4 { init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *)hash0, 512 ); } + else { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, + init_sd( &ctx.sd, 512 ); + update_final_sd( &ctx.sd, (BitSequence *)hash0, (const BitSequence *)hash0, 512 ); } @@ -405,10 +1175,11 @@ extern void hmq1725_4way_hash(void *state, const void *input) update_final_echo( &ctx.echo, (BitSequence *)hash1, (const BitSequence *)hash1, 512 ); } + else { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, + init_sd( &ctx.sd, 512 ); + update_final_sd( &ctx.sd, (BitSequence *)hash1, (const BitSequence *)hash1, 512 ); } @@ -418,10 +1189,11 @@ extern void hmq1725_4way_hash(void *state, const void *input) update_final_echo( &ctx.echo, (BitSequence *)hash2, (const BitSequence *)hash2, 512 ); } + else { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, + init_sd( &ctx.sd, 512 ); + update_final_sd( &ctx.sd, (BitSequence *)hash2, (const BitSequence *)hash2, 512 ); } @@ -431,10 +1203,11 @@ extern void hmq1725_4way_hash(void *state, const void *input) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *)hash3, 512 ); } + else { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, + init_sd( &ctx.sd, 512 ); + update_final_sd( &ctx.sd, (BitSequence *)hash3, (const BitSequence *)hash3, 512 ); } @@ -466,39 +1239,39 @@ extern void hmq1725_4way_hash(void *state, const void *input) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); -// if ( hash0[0] & mask ) -// { + if ( hash0[0] & mask ) + { sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); -// } -// if ( hash1[0] & mask ) -// { + } + if ( hash1[0] & mask ) + { sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash1, 64 ); sph_fugue512_close( &ctx.fugue, hash1 ); -// } -// if ( hash2[0] & mask ) -// { + } + if ( hash2[0] & mask ) + { sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash2, 64 ); sph_fugue512_close( &ctx.fugue, hash2 ); -// } -// if ( hash3[0] & mask ) -// { + } + if ( hash3[0] & mask ) + { sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash3, 64 ); sph_fugue512_close( &ctx.fugue, hash3 ); -// } + } intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); -// if ( mm256_any_clr_256( vh_mask ) ) -// { + if ( mm256_anybits1( vh_mask ) ) + { sha512_4way_init( &ctx.sha512 ); sha512_4way( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhashB ); -// } + } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -525,39 +1298,43 @@ extern void hmq1725_4way_hash(void *state, const void *input) m256_zero ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - -// if ( mm256_any_set_256( vh_mask ) ) //4 -// { + + // 4x32 for haval + intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); + + if ( mm256_anybits0( vh_mask ) ) + { haval256_5_4way_init( &ctx.haval ); haval256_5_4way( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhashA ); - memset( &vhashA[8<<2], 0, 32<<2 ); -// } + haval256_5_4way_close( &ctx.haval, vhash ); + memset( &vhash[8<<2], 0, 32<<2 ); + rintrlv_4x32_4x64( vhashA, vhash, 512 ); + } -// if ( !( hash0[0] & mask ) ) -// { + if ( !( hash0[0] & mask 
) ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash0, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); -// } -// if ( !( hash2[0] & mask ) ) -// { + } + if ( !( hash1[0] & mask ) ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash1, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash1 ); -// } -// if ( !( hash2[0] & mask ) ) -// { + } + if ( !( hash2[0] & mask ) ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash2, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash2 ); -// } -// if ( !( hash3[0] & mask ) ) -// { + } + if ( !( hash3[0] & mask ) ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash3, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); -// } + } intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 ); @@ -573,46 +1350,42 @@ extern void hmq1725_4way_hash(void *state, const void *input) int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (64))); -// uint32_t *hash7 = &(hash[25]); -// uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; + uint32_t hash[16*4] __attribute__ ((aligned (64))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[25]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 4; + __m256i *noncev = (__m256i*)vdata + 9; // aligned + int thr_id = mythr->id; - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[ m ]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - hmq1725_4way_hash( hash, vdata ); - for ( int i = 0; i < 4; i++ ) - if ( ( (hash+(i<<3))[7] & mask ) == 0 ) - { - if ( fulltest( (hash+(i<<3)), ptarget ) && !opt_benchmark ) - { - pdata[19] = n + i; - submit_lane_solution( work, (hash+(i<<3)), mythr, i ); - } - } - n += 4; - } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; - return 0; + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + do + { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + hmq1725_4way_hash( hash, vdata ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( hash7[ lane<<1 ] <= Htarg ) + { + extr_lane_4x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 4; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce; + return 0; } #endif // HMQ1725_4WAY diff --git a/algo/quark/hmq1725-gate.c b/algo/quark/hmq1725-gate.c index 
a0ccf1b..9cc2784 100644 --- a/algo/quark/hmq1725-gate.c +++ b/algo/quark/hmq1725-gate.c @@ -2,7 +2,10 @@ bool register_hmq1725_algo( algo_gate_t* gate ) { -#if defined(HMQ1725_4WAY) +#if defined(HMQ1725_8WAY) + gate->scanhash = (void*)&scanhash_hmq1725_8way; + gate->hash = (void*)&hmq1725_8way_hash; +#elif defined(HMQ1725_4WAY) gate->scanhash = (void*)&scanhash_hmq1725_4way; gate->hash = (void*)&hmq1725_4way_hash; #else @@ -10,7 +13,7 @@ bool register_hmq1725_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_hmq1725; gate->hash = (void*)&hmq1725hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; opt_target_factor = 65536.0; return true; }; diff --git a/algo/quark/hmq1725-gate.h b/algo/quark/hmq1725-gate.h index 23f51f6..faef6fc 100644 --- a/algo/quark/hmq1725-gate.h +++ b/algo/quark/hmq1725-gate.h @@ -4,13 +4,21 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) -// #define HMQ1725_4WAY 1 +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define HMQ1725_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define HMQ1725_4WAY 1 #endif bool register_hmq1725_algo( algo_gate_t* gate ); -#if defined(HMQ1725_4WAY) +#if defined(HMQ1725_8WAY) + +void hmq1725_8way_hash( void *state, const void *input ); +int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(HMQ1725_4WAY) void hmq1725_4way_hash( void *state, const void *input ); int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/quark/hmq1725.c b/algo/quark/hmq1725.c index 4b065ef..aaea14a 100644 --- a/algo/quark/hmq1725.c +++ b/algo/quark/hmq1725.c @@ -333,6 +333,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, if (((hash64[7]&0xFFFFFFFF)==0) && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -346,6 +347,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, if (((hash64[7]&0xFFFFFFF0)==0) && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -359,6 +361,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, if (((hash64[7]&0xFFFFFF00)==0) && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -372,6 +375,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, if (((hash64[7]&0xFFFFF000)==0) && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -386,6 +390,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, if (((hash64[7]&0xFFFF0000)==0) && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -399,6 +404,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, hmq1725hash(hash64, endiandata); if (fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && 
!work_restart[thr_id].restart); diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index 9c617ba..180d636 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -63,20 +63,6 @@ void quark_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); -// AVX 512 cmpeq returns a bit mask instead of a vector mask. -// This should simplify things but the logic doesn't seem to be working. -// The problem appears to be related to the test to skip a hash if it isn't -// to be used. Skipping the test for all 8 way hashes seems to have -// fixed it. The hash selection blending works if the hash is produced -// but the hash wasn't being produced when it should. -// Both decisions are based on the same data, the __mmask8. It works -// as a blend mask but not in a logical comparison, maybe the type is the -// problem. Maybe a cast to int or movm is needed to make it work. -// It's now moot because the hash can only be skipped 1 in 256 iterations -// when hashing parallel 8 ways. -// The performance impact of the workaround should be negligible. -// It's a problem for another day. - vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), zero ); diff --git a/algo/qubit/qubit-2way.c b/algo/qubit/qubit-2way.c index 8aee162..2b5d603 100644 --- a/algo/qubit/qubit-2way.c +++ b/algo/qubit/qubit-2way.c @@ -92,7 +92,6 @@ int scanhash_qubit_4way( struct work *work,uint32_t max_nonce, { uint32_t hash[4*8] __attribute__ ((aligned (128))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t n = pdata[19]; diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 5be93d4..2791877 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -56,7 +56,7 @@ typedef struct { __m128i val[8]; uint32_t count_high, count_low; bool initialized; -} sha256_4way_context; +} sha256_4way_context __attribute__ ((aligned (64))); void sha256_4way_init( sha256_4way_context *sc ); void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ); @@ -71,7 +71,7 @@ typedef struct { __m256i val[8]; uint32_t count_high, count_low; bool initialized; -} sha256_8way_context; +} sha256_8way_context __attribute__ ((aligned (128))); void sha256_8way_init( sha256_8way_context *sc ); void sha256_8way( sha256_8way_context *sc, const void *data, size_t len ); @@ -86,30 +86,32 @@ typedef struct { __m256i val[8]; uint64_t count; bool initialized; -} sha512_4way_context; +} sha512_4way_context __attribute__ ((aligned (128))); void sha512_4way_init( sha512_4way_context *sc); -void sha512_4way( sha512_4way_context *sc, const void *data, size_t len ); +void sha512_4way_update( sha512_4way_context *sc, const void *data, + size_t len ); +#define sha512_4way sha512_4way_update void sha512_4way_close( sha512_4way_context *sc, void *dst ); -// SHA-256 11 way hybrid -// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel. 
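The note removed from quark-4way.c above describes earlier trouble treating the __mmask8 returned by AVX-512 compares as a test condition; the new hmq1725 8-way code simply tests the mask as an integer, e.g. ( vh_mask & 0xff ) != 0xff. A minimal standalone sketch of that idea, assuming an AVX-512F capable CPU and a compiler invoked with -mavx512f (not taken from the patch itself):

#include <immintrin.h>
#include <stdio.h>

int main()
{
    /* Two vectors of eight 64-bit lanes; lanes 0 and 5 are equal. */
    __m512i a = _mm512_set_epi64( 7, 6, 0, 4, 3, 2, 1, 0 );
    __m512i b = _mm512_set_epi64( 9, 9, 0, 9, 9, 9, 9, 0 );

    /* AVX-512 compares return an 8-bit lane mask, not a vector mask. */
    __mmask8 m = _mm512_cmpeq_epi64_mask( a, b );

    /* The mask is an ordinary small integer and can be tested directly,
       the same way hmq1725_8way_hash tests ( vh_mask & 0xff ). */
    if ( ( m & 0xff ) != 0 )    printf( "some lanes equal: 0x%02x\n", (unsigned)m );
    if ( ( m & 0xff ) != 0xff ) printf( "some lanes differ\n" );

    return 0;
}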
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-512 8 way + typedef struct { - __m256i bufx[64>>2]; - __m256i valx[8]; - __m64 bufy[64>>2]; - __m64 valy[8]; - uint32_t bufz[64>>2]; - uint32_t valz[8]; - uint32_t count_high, count_low; -} sha256_11way_context; + __m512i buf[128>>3]; + __m512i val[8]; + uint64_t count; + bool initialized; +} sha512_8way_context __attribute__ ((aligned (128))); -void sha256_11way_init( sha256_11way_context *ctx ); -void sha256_11way_update( sha256_11way_context *ctx, const void *datax, - const void *datay, const void *dataz, size_t len ); -void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx, - void *dstz ); +void sha512_8way_init( sha512_8way_context *sc); +void sha512_8way_update( sha512_8way_context *sc, const void *data, + size_t len ); +void sha512_8way_close( sha512_8way_context *sc, void *dst ); + +#endif // AVX512 #endif // __AVX2__ #endif // __SSE2__ #endif // SHA256_4WAY_H__ diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 1c30074..3ee8194 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -36,8 +36,6 @@ #include #include "sha-hash-4way.h" -// SHA-512 4 way 64 bit - /* static const sph_u64 H512[8] = { SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), @@ -90,6 +88,236 @@ static const sph_u64 K512[80] = { SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) }; + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-512 8 way 64 bit + +#define CH8W(X, Y, Z) \ + _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) + +#define MAJ8W(X, Y, Z) \ + _mm512_or_si512( _mm512_and_si512( X, Y ), \ + _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) + +#define BSG8W_5_0(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) ) + +#define BSG8W_5_1(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) ) + +#define SSG8W_5_0(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) ) + +#define SSG8W_5_1(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) ) + +static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 ) +{ + __m512i w0a, w1a, w0b, w1b; + w0a = mm512_ror_64( w0, 1 ); + w1a = mm512_ror_64( w1,19 ); + w0b = mm512_ror_64( w0, 8 ); + w1b = mm512_ror_64( w1,61 ); + w0a = _mm512_xor_si512( w0a, w0b ); + w1a = _mm512_xor_si512( w1a, w1b ); + w0b = _mm512_srli_epi64( w0, 7 ); + w1b = _mm512_srli_epi64( w1, 6 ); + w0a = _mm512_xor_si512( w0a, w0b ); + w1a = _mm512_xor_si512( w1a, w1b ); + return _mm512_add_epi64( w0a, w1a ); +} + + +#define SSG8W_512x2_0( w0, w1, i ) do \ +{ \ + __m512i X0a, X1a, X0b, X1b; \ + X0a = mm512_ror_64( W[i-15], 1 ); \ + X1a = mm512_ror_64( W[i-14], 1 ); \ + X0b = mm512_ror_64( W[i-15], 8 ); \ + X1b = mm512_ror_64( W[i-14], 8 ); \ + X0a = _mm512_xor_si512( X0a, X0b ); \ + X1a = _mm512_xor_si512( X1a, X1b ); \ + X0b = _mm512_srli_epi64( W[i-15], 7 ); \ + X1b = _mm512_srli_epi64( W[i-14], 7 ); \ + w0 = _mm512_xor_si512( X0a, X0b ); \ + w1 = _mm512_xor_si512( X1a, X1b ); \ +} while(0) + +#define SSG8W_512x2_1( w0, w1, i ) do \ +{ \ + __m512i X0a, X1a, X0b, X1b; \ + X0a = mm512_ror_64( W[i-2],19 ); \ + X1a = mm512_ror_64( W[i-1],19 ); \ + X0b = mm512_ror_64( W[i-2],61 ); \ + X1b = 
mm512_ror_64( W[i-1],61 ); \ + X0a = _mm512_xor_si512( X0a, X0b ); \ + X1a = _mm512_xor_si512( X1a, X1b ); \ + X0b = _mm512_srli_epi64( W[i-2], 6 ); \ + X1b = _mm512_srli_epi64( W[i-1], 6 ); \ + w0 = _mm512_xor_si512( X0a, X0b ); \ + w1 = _mm512_xor_si512( X1a, X1b ); \ +} while(0) + +#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \ +do { \ + __m512i T1, T2; \ + __m512i K = _mm512_set1_epi64( K512[ i ] ); \ + T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \ + K, W[i] ) ); \ + T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \ + D = _mm512_add_epi64( D, T1 ); \ + H = _mm512_add_epi64( T1, T2 ); \ +} while (0) + +static void +sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] ) +{ + int i; + register __m512i A, B, C, D, E, F, G, H; + __m512i W[80]; + + mm512_block_bswap_64( W , in ); + mm512_block_bswap_64( W+8, in+8 ); + + for ( i = 16; i < 80; i++ ) + W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ), + _mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) ); + + if ( ctx->initialized ) + { + A = r[0]; + B = r[1]; + C = r[2]; + D = r[3]; + E = r[4]; + F = r[5]; + G = r[6]; + H = r[7]; + } + else + { + A = m512_const1_64( 0x6A09E667F3BCC908 ); + B = m512_const1_64( 0xBB67AE8584CAA73B ); + C = m512_const1_64( 0x3C6EF372FE94F82B ); + D = m512_const1_64( 0xA54FF53A5F1D36F1 ); + E = m512_const1_64( 0x510E527FADE682D1 ); + F = m512_const1_64( 0x9B05688C2B3E6C1F ); + G = m512_const1_64( 0x1F83D9ABFB41BD6B ); + H = m512_const1_64( 0x5BE0CD19137E2179 ); + } + + for ( i = 0; i < 80; i += 8 ) + { + SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); + SHA3_8WAY_STEP( H, A, B, C, D, E, F, G, i + 1 ); + SHA3_8WAY_STEP( G, H, A, B, C, D, E, F, i + 2 ); + SHA3_8WAY_STEP( F, G, H, A, B, C, D, E, i + 3 ); + SHA3_8WAY_STEP( E, F, G, H, A, B, C, D, i + 4 ); + SHA3_8WAY_STEP( D, E, F, G, H, A, B, C, i + 5 ); + SHA3_8WAY_STEP( C, D, E, F, G, H, A, B, i + 6 ); + SHA3_8WAY_STEP( B, C, D, E, F, G, H, A, i + 7 ); + } + + if ( ctx->initialized ) + { + r[0] = _mm512_add_epi64( r[0], A ); + r[1] = _mm512_add_epi64( r[1], B ); + r[2] = _mm512_add_epi64( r[2], C ); + r[3] = _mm512_add_epi64( r[3], D ); + r[4] = _mm512_add_epi64( r[4], E ); + r[5] = _mm512_add_epi64( r[5], F ); + r[6] = _mm512_add_epi64( r[6], G ); + r[7] = _mm512_add_epi64( r[7], H ); + } + else + { + ctx->initialized = true; + r[0] = _mm512_add_epi64( A, m512_const1_64( 0x6A09E667F3BCC908 ) ); + r[1] = _mm512_add_epi64( B, m512_const1_64( 0xBB67AE8584CAA73B ) ); + r[2] = _mm512_add_epi64( C, m512_const1_64( 0x3C6EF372FE94F82B ) ); + r[3] = _mm512_add_epi64( D, m512_const1_64( 0xA54FF53A5F1D36F1 ) ); + r[4] = _mm512_add_epi64( E, m512_const1_64( 0x510E527FADE682D1 ) ); + r[5] = _mm512_add_epi64( F, m512_const1_64( 0x9B05688C2B3E6C1F ) ); + r[6] = _mm512_add_epi64( G, m512_const1_64( 0x1F83D9ABFB41BD6B ) ); + r[7] = _mm512_add_epi64( H, m512_const1_64( 0x5BE0CD19137E2179 ) ); + } +} + +void sha512_8way_init( sha512_8way_context *sc ) +{ + sc->initialized = false; + sc->count = 0; +} + +void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len ) +{ + __m512i *vdata = (__m512i*)data; + size_t ptr; + const int buf_size = 128; + + ptr = (unsigned)sc->count & (buf_size - 1U); + while ( len > 0 ) + { + size_t clen; + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_512( sc->buf + (ptr>>3), vdata, clen>>3 ); + vdata = vdata + (clen>>3); + ptr += clen; + len -= clen; + if ( ptr == buf_size ) + { + sha512_8way_round( sc, sc->buf, sc->val ); + ptr = 0; + } + sc->count += clen; + } 
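/* count accumulates the total number of input bytes; its low seven bits
   ( count & (buf_size - 1), buf_size == 128 ) give the offset into the
   128-byte block buffer on the next update and during close. */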
+} + +void sha512_8way_close( sha512_8way_context *sc, void *dst ) +{ + unsigned ptr; + const int buf_size = 128; + const int pad = buf_size - 16; + const __m512i shuff_bswap64 = m512_const_64( + 0x38393a3b3c3d3e3f, 0x3031323334353637, + 0x28292a2b2c2d2e2f, 0x2021222324252627, + 0x18191a1b1c1d1e1f, 0x1011121314151617, + 0x08090a0b0c0d0e0f, 0x0001020304050607 ); + + ptr = (unsigned)sc->count & (buf_size - 1U); + sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 ); + ptr += 8; + if ( ptr > pad ) + { + memset_zero_512( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 ); + sha512_8way_round( sc, sc->buf, sc->val ); + memset_zero_512( sc->buf, pad >> 3 ); + } + else + memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); + + sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8( + _mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 ); + sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8( + _mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 ); + sha512_8way_round( sc, sc->buf, sc->val ); + + mm512_block_bswap_64( dst, sc->val ); +} + + +#endif // AVX512 + +// SHA-512 4 way 64 bit + + #define CH(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) @@ -254,7 +482,7 @@ void sha512_4way_init( sha512_4way_context *sc ) sc->count = 0; } -void sha512_4way( sha512_4way_context *sc, const void *data, size_t len ) +void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len ) { __m256i *vdata = (__m256i*)data; size_t ptr; diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index b84246b..dffa18d 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -33,7 +33,7 @@ #include #include -#ifdef __AVX2__ +#ifdef __SSE4_1__ #include "shabal-hash-4way.h" #ifdef __cplusplus @@ -58,6 +58,599 @@ extern "C"{ #define O2 9 #define O3 6 + +#if defined(__AVX2__) + +#define DECL_STATE8 \ + __m256i A00, A01, A02, A03, A04, A05, A06, A07, \ + A08, A09, A0A, A0B; \ + __m256i B0, B1, B2, B3, B4, B5, B6, B7, \ + B8, B9, BA, BB, BC, BD, BE, BF; \ + __m256i C0, C1, C2, C3, C4, C5, C6, C7, \ + C8, C9, CA, CB, CC, CD, CE, CF; \ + __m256i M0, M1, M2, M3, M4, M5, M6, M7, \ + M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 Wlow, Whigh; + +#define READ_STATE8(state) do \ +{ \ + if ( (state)->state_loaded ) \ + { \ + A00 = (state)->A[0]; \ + A01 = (state)->A[1]; \ + A02 = (state)->A[2]; \ + A03 = (state)->A[3]; \ + A04 = (state)->A[4]; \ + A05 = (state)->A[5]; \ + A06 = (state)->A[6]; \ + A07 = (state)->A[7]; \ + A08 = (state)->A[8]; \ + A09 = (state)->A[9]; \ + A0A = (state)->A[10]; \ + A0B = (state)->A[11]; \ + B0 = (state)->B[0]; \ + B1 = (state)->B[1]; \ + B2 = (state)->B[2]; \ + B3 = (state)->B[3]; \ + B4 = (state)->B[4]; \ + B5 = (state)->B[5]; \ + B6 = (state)->B[6]; \ + B7 = (state)->B[7]; \ + B8 = (state)->B[8]; \ + B9 = (state)->B[9]; \ + BA = (state)->B[10]; \ + BB = (state)->B[11]; \ + BC = (state)->B[12]; \ + BD = (state)->B[13]; \ + BE = (state)->B[14]; \ + BF = (state)->B[15]; \ + C0 = (state)->C[0]; \ + C1 = (state)->C[1]; \ + C2 = (state)->C[2]; \ + C3 = (state)->C[3]; \ + C4 = (state)->C[4]; \ + C5 = (state)->C[5]; \ + C6 = (state)->C[6]; \ + C7 = (state)->C[7]; \ + C8 = (state)->C[8]; \ + C9 = (state)->C[9]; \ + CA = (state)->C[10]; \ + CB = (state)->C[11]; \ + CC = (state)->C[12]; \ + CD = (state)->C[13]; \ + CE = (state)->C[14]; \ + CF = (state)->C[15]; \ + } \ + else \ + { \ + (state)->state_loaded = true; \ + A00 = m256_const1_64( 0x20728DFD20728DFD ); \ + A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \ + A02 = m256_const1_64( 0xE782B699E782B699 
); \ + A03 = m256_const1_64( 0x5530463255304632 ); \ + A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \ + A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \ + A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \ + A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \ + A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \ + A09 = m256_const1_64( 0x8BD144108BD14410 ); \ + A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \ + A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \ + B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \ + B1 = m256_const1_64( 0x07B385F307B385F3 ); \ + B2 = m256_const1_64( 0xE7442C26E7442C26 ); \ + B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \ + B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \ + B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \ + B6 = m256_const1_64( 0x73B9D31473B9D314 ); \ + B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \ + B8 = m256_const1_64( 0x48910A5A48910A5A ); \ + B9 = m256_const1_64( 0x893B22DB893B22DB ); \ + BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \ + BB = m256_const1_64( 0xBBC4324EBBC4324E ); \ + BC = m256_const1_64( 0x72D2F24072D2F240 ); \ + BD = m256_const1_64( 0x75941D9975941D99 ); \ + BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \ + BF = m256_const1_64( 0xA1A7502BA1A7502B ); \ + C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \ + C1 = m256_const1_64( 0x58BAD75058BAD750 ); \ + C2 = m256_const1_64( 0x56028CB256028CB2 ); \ + C3 = m256_const1_64( 0x8134F3598134F359 ); \ + C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \ + C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \ + C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \ + C7 = m256_const1_64( 0x0405278004052780 ); \ + C8 = m256_const1_64( 0x7F07D7877F07D787 ); \ + C9 = m256_const1_64( 0x5194358F5194358F ); \ + CA = m256_const1_64( 0x3C60D6653C60D665 ); \ + CB = m256_const1_64( 0xBE97D79ABE97D79A ); \ + CC = m256_const1_64( 0x950C3434950C3434 ); \ + CD = m256_const1_64( 0xAED9A06DAED9A06D ); \ + CE = m256_const1_64( 0x2537DC8D2537DC8D ); \ + CF = m256_const1_64( 0x7CDB59697CDB5969 ); \ + } \ + Wlow = (state)->Wlow; \ + Whigh = (state)->Whigh; \ +} while (0) + +#define WRITE_STATE8(state) do { \ + (state)->A[0] = A00; \ + (state)->A[1] = A01; \ + (state)->A[2] = A02; \ + (state)->A[3] = A03; \ + (state)->A[4] = A04; \ + (state)->A[5] = A05; \ + (state)->A[6] = A06; \ + (state)->A[7] = A07; \ + (state)->A[8] = A08; \ + (state)->A[9] = A09; \ + (state)->A[10] = A0A; \ + (state)->A[11] = A0B; \ + (state)->B[0] = B0; \ + (state)->B[1] = B1; \ + (state)->B[2] = B2; \ + (state)->B[3] = B3; \ + (state)->B[4] = B4; \ + (state)->B[5] = B5; \ + (state)->B[6] = B6; \ + (state)->B[7] = B7; \ + (state)->B[8] = B8; \ + (state)->B[9] = B9; \ + (state)->B[10] = BA; \ + (state)->B[11] = BB; \ + (state)->B[12] = BC; \ + (state)->B[13] = BD; \ + (state)->B[14] = BE; \ + (state)->B[15] = BF; \ + (state)->C[0] = C0; \ + (state)->C[1] = C1; \ + (state)->C[2] = C2; \ + (state)->C[3] = C3; \ + (state)->C[4] = C4; \ + (state)->C[5] = C5; \ + (state)->C[6] = C6; \ + (state)->C[7] = C7; \ + (state)->C[8] = C8; \ + (state)->C[9] = C9; \ + (state)->C[10] = CA; \ + (state)->C[11] = CB; \ + (state)->C[12] = CC; \ + (state)->C[13] = CD; \ + (state)->C[14] = CE; \ + (state)->C[15] = CF; \ + (state)->Wlow = Wlow; \ + (state)->Whigh = Whigh; \ + } while (0) + +#define DECODE_BLOCK8 \ +do { \ + M0 = buf[ 0]; \ + M1 = buf[ 1]; \ + M2 = buf[ 2]; \ + M3 = buf[ 3]; \ + M4 = buf[ 4]; \ + M5 = buf[ 5]; \ + M6 = buf[ 6]; \ + M7 = buf[ 7]; \ + M8 = buf[ 8]; \ + M9 = buf[ 9]; \ + MA = buf[10]; \ + MB = buf[11]; \ + MC = buf[12]; \ + MD = buf[13]; \ + ME = buf[14]; \ + MF = buf[15]; \ +} while (0) + +#define 
INPUT_BLOCK_ADD8 \ +do { \ + B0 = _mm256_add_epi32( B0, M0 );\ + B1 = _mm256_add_epi32( B1, M1 );\ + B2 = _mm256_add_epi32( B2, M2 );\ + B3 = _mm256_add_epi32( B3, M3 );\ + B4 = _mm256_add_epi32( B4, M4 );\ + B5 = _mm256_add_epi32( B5, M5 );\ + B6 = _mm256_add_epi32( B6, M6 );\ + B7 = _mm256_add_epi32( B7, M7 );\ + B8 = _mm256_add_epi32( B8, M8 );\ + B9 = _mm256_add_epi32( B9, M9 );\ + BA = _mm256_add_epi32( BA, MA );\ + BB = _mm256_add_epi32( BB, MB );\ + BC = _mm256_add_epi32( BC, MC );\ + BD = _mm256_add_epi32( BD, MD );\ + BE = _mm256_add_epi32( BE, ME );\ + BF = _mm256_add_epi32( BF, MF );\ +} while (0) + +#define INPUT_BLOCK_SUB8 \ +do { \ + C0 = _mm256_sub_epi32( C0, M0 ); \ + C1 = _mm256_sub_epi32( C1, M1 ); \ + C2 = _mm256_sub_epi32( C2, M2 ); \ + C3 = _mm256_sub_epi32( C3, M3 ); \ + C4 = _mm256_sub_epi32( C4, M4 ); \ + C5 = _mm256_sub_epi32( C5, M5 ); \ + C6 = _mm256_sub_epi32( C6, M6 ); \ + C7 = _mm256_sub_epi32( C7, M7 ); \ + C8 = _mm256_sub_epi32( C8, M8 ); \ + C9 = _mm256_sub_epi32( C9, M9 ); \ + CA = _mm256_sub_epi32( CA, MA ); \ + CB = _mm256_sub_epi32( CB, MB ); \ + CC = _mm256_sub_epi32( CC, MC ); \ + CD = _mm256_sub_epi32( CD, MD ); \ + CE = _mm256_sub_epi32( CE, ME ); \ + CF = _mm256_sub_epi32( CF, MF ); \ +} while (0) + +#define XOR_W8 \ +do { \ + A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \ + A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \ +} while (0) + +#define SWAP_BC8 \ +do { \ + mm256_swap512_256( B0, C0 ); \ + mm256_swap512_256( B1, C1 ); \ + mm256_swap512_256( B2, C2 ); \ + mm256_swap512_256( B3, C3 ); \ + mm256_swap512_256( B4, C4 ); \ + mm256_swap512_256( B5, C5 ); \ + mm256_swap512_256( B6, C6 ); \ + mm256_swap512_256( B7, C7 ); \ + mm256_swap512_256( B8, C8 ); \ + mm256_swap512_256( B9, C9 ); \ + mm256_swap512_256( BA, CA ); \ + mm256_swap512_256( BB, CB ); \ + mm256_swap512_256( BC, CC ); \ + mm256_swap512_256( BD, CD ); \ + mm256_swap512_256( BE, CE ); \ + mm256_swap512_256( BF, CF ); \ +} while (0) + +#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ +do { \ + xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \ + _mm256_andnot_si256( xb3, xb2 ), \ + _mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \ + _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \ + ) ), _mm256_set1_epi32(3UL) ) ) ) ); \ + xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \ +} while (0) + +#define PERM_STEP_0_8 do { \ + PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_1_8 do { \ + PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \ + 
PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_2_8 do { \ + PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define APPLY_P8 \ +do { \ + B0 = mm256_ror_32( B0, 15 ); \ + B1 = mm256_ror_32( B1, 15 ); \ + B2 = mm256_ror_32( B2, 15 ); \ + B3 = mm256_ror_32( B3, 15 ); \ + B4 = mm256_ror_32( B4, 15 ); \ + B5 = mm256_ror_32( B5, 15 ); \ + B6 = mm256_ror_32( B6, 15 ); \ + B7 = mm256_ror_32( B7, 15 ); \ + B8 = mm256_ror_32( B8, 15 ); \ + B9 = mm256_ror_32( B9, 15 ); \ + BA = mm256_ror_32( BA, 15 ); \ + BB = mm256_ror_32( BB, 15 ); \ + BC = mm256_ror_32( BC, 15 ); \ + BD = mm256_ror_32( BD, 15 ); \ + BE = mm256_ror_32( BE, 15 ); \ + BF = mm256_ror_32( BF, 15 ); \ + PERM_STEP_0_8; \ + PERM_STEP_1_8; \ + PERM_STEP_2_8; \ + A0B = _mm256_add_epi32( A0B, C6 ); \ + A0A = _mm256_add_epi32( A0A, C5 ); \ + A09 = _mm256_add_epi32( A09, C4 ); \ + A08 = _mm256_add_epi32( A08, C3 ); \ + A07 = _mm256_add_epi32( A07, C2 ); \ + A06 = _mm256_add_epi32( A06, C1 ); \ + A05 = _mm256_add_epi32( A05, C0 ); \ + A04 = _mm256_add_epi32( A04, CF ); \ + A03 = _mm256_add_epi32( A03, CE ); \ + A02 = _mm256_add_epi32( A02, CD ); \ + A01 = _mm256_add_epi32( A01, CC ); \ + A00 = _mm256_add_epi32( A00, CB ); \ + A0B = _mm256_add_epi32( A0B, CA ); \ + A0A = _mm256_add_epi32( A0A, C9 ); \ + A09 = _mm256_add_epi32( A09, C8 ); \ + A08 = _mm256_add_epi32( A08, C7 ); \ + A07 = _mm256_add_epi32( A07, C6 ); \ + A06 = _mm256_add_epi32( A06, C5 ); \ + A05 = _mm256_add_epi32( A05, C4 ); \ + A04 = _mm256_add_epi32( A04, C3 ); \ + A03 = _mm256_add_epi32( A03, C2 ); \ + A02 = _mm256_add_epi32( A02, C1 ); \ + A01 = _mm256_add_epi32( A01, C0 ); \ + A00 = _mm256_add_epi32( A00, CF ); \ + A0B = _mm256_add_epi32( A0B, CE ); \ + A0A = _mm256_add_epi32( A0A, CD ); \ + A09 = _mm256_add_epi32( A09, CC ); \ + A08 = _mm256_add_epi32( A08, CB ); \ + A07 = _mm256_add_epi32( A07, CA ); \ + A06 = _mm256_add_epi32( A06, C9 ); \ + A05 = _mm256_add_epi32( A05, C8 ); \ + A04 = _mm256_add_epi32( A04, C7 ); \ + A03 = _mm256_add_epi32( A03, C6 ); \ + A02 = _mm256_add_epi32( A02, C5 ); \ + A01 = _mm256_add_epi32( A01, C4 ); \ + A00 = _mm256_add_epi32( A00, C3 ); \ +} while (0) + +#define INCR_W8 do { \ + if ((Wlow = T32(Wlow + 1)) == 0) \ + Whigh = T32(Whigh + 1); \ + } while (0) + +static void +shabal_8way_init( 
void *cc, unsigned size ) +{ + shabal_8way_context *sc = (shabal_8way_context*)cc; + + if ( size == 512 ) + { // copy immediate constants directly to working registers later. + sc->state_loaded = false; + } + else + { // No users + sc->state_loaded = true; + sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 ); + sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 ); + sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC ); + sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 ); + sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 ); + sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 ); + sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA ); + sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C ); + sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 ); + sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC ); + sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B ); + sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A ); + + sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE ); + sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 ); + sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F ); + sc->B[ 3] = m256_const1_64( 0x9301515F9301515F ); + sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA ); + sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 ); + sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 ); + sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 ); + sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 ); + sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 ); + sc->B[10] = m256_const1_64( 0xBE216306BE216306 ); + sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 ); + sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B ); + sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 ); + sc->B[14] = m256_const1_64( 0x30924DD430924DD4 ); + sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 ); + + sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 ); + sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA ); + sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 ); + sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 ); + sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE ); + sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 ); + sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 ); + sc->C[ 7] = m256_const1_64( 0xED614433ED614433 ); + sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 ); + sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA ); + sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B ); + sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F ); + sc->C[12] = m256_const1_64( 0xBC968828BC968828 ); + sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 ); + sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 ); + sc->C[15] = m256_const1_64( 0x9B491C609B491C60 ); + } + sc->Wlow = 1; + sc->Whigh = 0; + sc->ptr = 0; +} + +static void +shabal_8way_core( void *cc, const unsigned char *data, size_t len ) +{ + shabal_8way_context *sc = (shabal_8way_context*)cc; + __m256i *buf; + __m256i *vdata = (__m256i*)data; + const int buf_size = 64; + size_t ptr; + DECL_STATE8 + + buf = sc->buf; + ptr = sc->ptr; + + if ( len < (buf_size - ptr ) ) + { + memcpy_256( buf + (ptr>>2), vdata, len>>2 ); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE8( sc ); + + while ( len > 0 ) + { + size_t clen; + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_256( buf + (ptr>>2), vdata, clen>>2 ); + + ptr += clen; + vdata += clen>>2; + len -= clen; + if ( ptr == buf_size ) + { + DECODE_BLOCK8; + INPUT_BLOCK_ADD8; + XOR_W8; + APPLY_P8; + INPUT_BLOCK_SUB8; + SWAP_BC8; + INCR_W8; + ptr = 0; + } + } + WRITE_STATE8(sc); + sc->ptr = ptr; +} + +static void +shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst, + unsigned size_words ) +{ + 
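+   /* size_words selects the output width: 16 words per lane for Shabal-512,
+      8 for Shabal-256; ub/n carry a final partial byte and are passed as 0,0
+      by the plain _close wrappers.  The body pads the 64-byte block, runs one
+      compression plus the three extra SWAP_BC8/XOR_W8/APPLY_P8 rounds of
+      Shabal finalization, then copies the B words (all 16, or the upper 8)
+      to dst. */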
shabal_8way_context *sc = (shabal_8way_context*)cc; + __m256i *buf; + const int buf_size = 64; + size_t ptr; + int i; + unsigned z, zz; + DECL_STATE8 + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + zz = ((ub & -z) | z) & 0xFF; + buf[ptr>>2] = _mm256_set1_epi32( zz ); + memset_zero_256( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 ); + READ_STATE8(sc); + DECODE_BLOCK8; + INPUT_BLOCK_ADD8; + XOR_W8; + APPLY_P8; + + for ( i = 0; i < 3; i ++ ) + { + SWAP_BC8; + XOR_W8; + APPLY_P8; + } + + __m256i *d = (__m256i*)dst; + if ( size_words == 16 ) // 512 + { + d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3; + d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7; + d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB; + d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF; + } + else // 256 + { + d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB; + d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF; + } +} + +void +shabal256_8way_init( void *cc ) +{ + shabal_8way_init(cc, 256); +} + +void +shabal256_8way_update( void *cc, const void *data, size_t len ) +{ + shabal_8way_core( cc, data, len ); +} + +void +shabal256_8way_close( void *cc, void *dst ) +{ + shabal_8way_close(cc, 0, 0, dst, 8); +} + +void +shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, + void *dst ) +{ + shabal_8way_close(cc, ub, n, dst, 8); +} + +void +shabal512_8way_init(void *cc) +{ + shabal_8way_init(cc, 512); +} + +void +shabal512_8way_update(void *cc, const void *data, size_t len) +{ + shabal_8way_core(cc, data, len); +} + +void +shabal512_8way_close(void *cc, void *dst) +{ + shabal_8way_close(cc, 0, 0, dst, 16); +} + +void +shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shabal_8way_close(cc, ub, n, dst, 16); +} + + +#endif // AVX2 + /* * We copy the state into local variables, so that the compiler knows * that it can optimize them at will. 
@@ -290,6 +883,8 @@ do { \ A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \ A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \ } while (0) + + /* #define SWAP(v1, v2) do { \ sph_u32 tmp = (v1); \ @@ -297,26 +892,39 @@ do { \ (v2) = tmp; \ } while (0) */ + #define SWAP_BC \ do { \ - mm128_swap128_256( B0, C0 ); \ - mm128_swap128_256( B1, C1 ); \ - mm128_swap128_256( B2, C2 ); \ - mm128_swap128_256( B3, C3 ); \ - mm128_swap128_256( B4, C4 ); \ - mm128_swap128_256( B5, C5 ); \ - mm128_swap128_256( B6, C6 ); \ - mm128_swap128_256( B7, C7 ); \ - mm128_swap128_256( B8, C8 ); \ - mm128_swap128_256( B9, C9 ); \ - mm128_swap128_256( BA, CA ); \ - mm128_swap128_256( BB, CB ); \ - mm128_swap128_256( BC, CC ); \ - mm128_swap128_256( BD, CD ); \ - mm128_swap128_256( BE, CE ); \ - mm128_swap128_256( BF, CF ); \ + mm128_swap256_128( B0, C0 ); \ + mm128_swap256_128( B1, C1 ); \ + mm128_swap256_128( B2, C2 ); \ + mm128_swap256_128( B3, C3 ); \ + mm128_swap256_128( B4, C4 ); \ + mm128_swap256_128( B5, C5 ); \ + mm128_swap256_128( B6, C6 ); \ + mm128_swap256_128( B7, C7 ); \ + mm128_swap256_128( B8, C8 ); \ + mm128_swap256_128( B9, C9 ); \ + mm128_swap256_128( BA, CA ); \ + mm128_swap256_128( BB, CB ); \ + mm128_swap256_128( BC, CC ); \ + mm128_swap256_128( BD, CD ); \ + mm128_swap256_128( BE, CE ); \ + mm128_swap256_128( BF, CF ); \ } while (0) +/* +#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ +do { \ + __m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\ + _mm_set1_epi32(5UL) ) \ + __m128i t2 = _mm_xor_si128( xa0, xc ); \ + xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \ + xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \ + _mm_xor_si128( t2, \ + _mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \ +*/ + #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ do { \ xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \ @@ -706,7 +1314,7 @@ shabal256_4way_init( void *cc ) } void -shabal256_4way( void *cc, const void *data, size_t len ) +shabal256_4way_update( void *cc, const void *data, size_t len ) { shabal_4way_core( cc, data, len ); } @@ -731,7 +1339,7 @@ shabal512_4way_init(void *cc) } void -shabal512_4way(void *cc, const void *data, size_t len) +shabal512_4way_update(void *cc, const void *data, size_t len) { shabal_4way_core(cc, data, len); } diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index bf54b59..c296f8c 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -36,7 +36,7 @@ #ifndef SHABAL_HASH_4WAY_H__ #define SHABAL_HASH_4WAY_H__ 1 -#ifdef __AVX2__ +#ifdef __SSE4_1__ #include #include "algo/sha/sph_types.h" @@ -50,6 +50,34 @@ extern "C"{ #define SPH_SIZE_shabal512 512 +#if defined(__AVX2__) + +typedef struct { + __m256i buf[16]; + __m256i A[12], B[16], C[16]; + sph_u32 Whigh, Wlow; + size_t ptr; + bool state_loaded; +} shabal_8way_context __attribute__ ((aligned (64))); + +typedef shabal_8way_context shabal256_8way_context; +typedef shabal_8way_context shabal512_8way_context; + +void shabal256_8way_init( void *cc ); +void shabal256_8way_update( void *cc, const void *data, size_t len ); +void shabal256_8way_close( void *cc, void *dst ); +void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, + void *dst ); + +void shabal512_8way_init( void *cc ); +void shabal512_8way_update( void *cc, const void *data, size_t len ); +void shabal512_8way_close( void *cc, void *dst ); +void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, + void *dst ); + + +#endif + typedef 
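+/* The 4-way (__m128i) and 8-way (__m256i) contexts share the same sph-style
+   call sequence: init, update (any number of times), close.  Usage sketch,
+   caller names illustrative only:
+       shabal512_8way_context c;
+       shabal512_8way_init( &c );
+       shabal512_8way_update( &c, vhash, 64 );   // 8x32-interleaved input
+       shabal512_8way_close( &c, vhash );        // 64 bytes per lane out
+   The 4-way versions take 4x32-interleaved data instead. */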
struct { __m128i buf[16] __attribute__ ((aligned (64))); __m128i A[12], B[16], C[16]; @@ -62,13 +90,14 @@ typedef shabal_4way_context shabal256_4way_context; typedef shabal_4way_context shabal512_4way_context; void shabal256_4way_init( void *cc ); -void shabal256_4way( void *cc, const void *data, size_t len ); +void shabal256_4way_update( void *cc, const void *data, size_t len ); void shabal256_4way_close( void *cc, void *dst ); void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst ); void shabal512_4way_init( void *cc ); -void shabal512_4way( void *cc, const void *data, size_t len ); +void shabal512_4way_update( void *cc, const void *data, size_t len ); +#define shabal512_4way shabal512_4way_update void shabal512_4way_close( void *cc, void *dst ); void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst ); diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index d061ef0..25fe8a6 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -3,6 +3,12 @@ #include +// This implementation is deprecated, superseded by VAES in Icelake +// which provides HW based 4 way aes. +// It was created for AVX2 to eliminate interleaving between the +// preceding and following function. +// This code can be removed when current users have reverted to one way. + #if defined(__AVX2__) @@ -16,8 +22,8 @@ static const uint32_t IV512[] = #define mm256_ror2x256hi_1x32( a, b ) \ - _mm256_blend_epi32( mm256_ror1x32_128( a ), \ - mm256_ror1x32_128( b ), 0x88 ) + _mm256_blend_epi32( mm256_ror128_32( a ), \ + mm256_ror128_32( b ), 0x88 ) static void c512_2way( shavite512_2way_context *ctx, const void *msg ) @@ -61,7 +67,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) { // round 1, 5, 9 - k00 = _mm256_xor_si256( k13, mm256_ror1x32_128( + k00 = _mm256_xor_si256( k13, mm256_ror128_32( mm256_aesenc_2x128( k00, zero ) ) ); if ( r == 0 ) @@ -71,7 +77,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); k01 = _mm256_xor_si256( k00, - mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) ); if ( r == 1 ) k01 = _mm256_xor_si256( k01, _mm256_set_epi32( @@ -80,25 +86,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); k02 = _mm256_xor_si256( k01, - mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); k03 = _mm256_xor_si256( k02, - mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); k10 = _mm256_xor_si256( k03, - mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); k11 = _mm256_xor_si256( k10, - mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); k12 = _mm256_xor_si256( k11, - mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); k13 = _mm256_xor_si256( k12, - mm256_ror1x32_128( 
mm256_aesenc_2x128( k13, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) ); if ( r == 2 ) k13 = _mm256_xor_si256( k13, _mm256_set_epi32( @@ -134,31 +140,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 3, 7, 11 - k00 = _mm256_xor_si256( mm256_ror1x32_128( + k00 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror1x32_128( + k01 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror1x32_128( + k02 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror1x32_128( + k03 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p1 = _mm256_xor_si256( p1, x ); - k10 = _mm256_xor_si256( mm256_ror1x32_128( + k10 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror1x32_128( + k11 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( mm256_ror1x32_128( + k12 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ), k11 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror1x32_128( + k13 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); @@ -192,35 +198,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 13 - k00 = _mm256_xor_si256( mm256_ror1x32_128( + k00 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror1x32_128( + k01 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror1x32_128( + k02 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror1x32_128( + k03 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); - k10 = _mm256_xor_si256( mm256_ror1x32_128( + k10 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror1x32_128( + k11 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ); + k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ); k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1, ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror1x32_128( + k13 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k13, 
zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); diff --git a/algo/x11/c11-4way.c b/algo/x11/c11-4way.c index 1bd1664..fcae00c 100644 --- a/algo/x11/c11-4way.c +++ b/algo/x11/c11-4way.c @@ -51,6 +51,8 @@ void init_c11_8way_ctx() void c11_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -107,21 +109,18 @@ void c11_8way_hash( void *state, const void *input ) skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); - // Serial - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); - // 7 Luffa + 8 cube - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); // 9 Shavite sph_shavite512( &ctx.shavite, hash0, 64 ); diff --git a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c index ad3168d..a30cbc0 100644 --- a/algo/x11/x11-4way.c +++ b/algo/x11/x11-4way.c @@ -51,6 +51,8 @@ void init_x11_8way_ctx() void x11_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -108,20 +110,18 @@ void x11_8way_hash( void *state, const void *input ) keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); - // Luffa + Cube - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + cube_4way_update_close( &ctx.cube, 
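+                       /* rintrlv_8x64_4x128 above re-interleaves the 8x64
+                          Keccak output straight into two 4x128 buffers
+                          (lanes 0-3 in vhash0, 4-7 in vhash1), so Luffa and
+                          Cube run 4-way without first deinterleaving to the
+                          eight scalar hash buffers and back. */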
vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); diff --git a/algo/x12/x12-4way.c b/algo/x12/x12-4way.c index 90ed730..ed4d131 100644 --- a/algo/x12/x12-4way.c +++ b/algo/x12/x12-4way.c @@ -1,7 +1,4 @@ #include "x12-gate.h" - -#if defined(X12_4WAY) - #include #include #include @@ -14,11 +11,223 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" -//#include "algo/fugue/sph_fugue.h" + +#if defined(X12_8WAY) + + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; +} x12_8way_ctx_holder; + +x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64))); + +void init_x12_8way_ctx() +{ + blake512_8way_init( &x12_8way_ctx.blake ); + bmw512_8way_init( &x12_8way_ctx.bmw ); + init_groestl( &x12_8way_ctx.groestl, 64 ); + skein512_8way_init( &x12_8way_ctx.skein ); + jh512_8way_init( &x12_8way_ctx.jh ); + keccak512_8way_init( &x12_8way_ctx.keccak ); + luffa_4way_init( &x12_8way_ctx.luffa, 512 ); + cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x12_8way_ctx.shavite ); + simd_4way_init( &x12_8way_ctx.simd, 512 ); + init_echo( &x12_8way_ctx.echo, 512 ); + hamsi512_8way_init( &x12_8way_ctx.hamsi ); +}; + +void x12_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + + x12_8way_ctx_holder ctx; + memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( 
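+                        /* Shavite has no n-way vector implementation used
+                           here (see the note in shavite-hash-2way.c), so the
+                           eight lanes go through the serial sph code, with
+                           the context recopied between lanes. */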
&ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + memcpy( &ctx.groestl, 
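+           /* The one-way algos (Shavite, Echo, Groestl) keep per-lane state,
+              so after each lane the context is restored by memcpy from the
+              saved initial x12_8way_ctx instead of re-running init. */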
&x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, state ); +} + +int scanhash_x12_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[49]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + do { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); + + x12_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane<<1 ] < Htarg ) + { + extr_lane_8x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(X12_4WAY) typedef struct { blake512_4way_context blake; @@ -63,45 +272,13 @@ void x12_4way_hash( void *state, const void *input ) x12_4way_ctx_holder ctx; memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) ); - // 1 Blake blake512_4way( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); - // 2 Bmw bmw512_4way( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - // 3 Groestl - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - // Parallel 4way 64 bit - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 7 Luffa intrlv_2x128( vhash, hash0, hash1, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); dintrlv_2x128( hash0, hash1, vhash, 512 ); @@ -110,7 +287,6 @@ void x12_4way_hash( void *state, const void *input ) luffa_2way_update_close( &ctx.luffa, 
vhash, vhash, 64 ); dintrlv_2x128( hash2, hash3, vhash, 512 ); - // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashInit( &ctx.cube, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); @@ -119,7 +295,6 @@ void x12_4way_hash( void *state, const void *input ) cubehashInit( &ctx.cube, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - // 9 Shavite sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); memcpy( &ctx.shavite, &x12_4way_ctx.shavite, @@ -135,7 +310,6 @@ void x12_4way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash3, 64 ); sph_shavite512_close( &ctx.shavite, hash3 ); - // 10 Simd intrlv_2x128( vhash, hash0, hash1, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); dintrlv_2x128( hash0, hash1, vhash, 512 ); @@ -144,21 +318,25 @@ void x12_4way_hash( void *state, const void *input ) simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); dintrlv_2x128( hash2, hash3, vhash, 512 ); - // 11 Echo - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - // 12 Hamsi parallel 4way 32 bit + // Parallel 4way 64 bit intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); diff --git a/algo/x12/x12-gate.c b/algo/x12/x12-gate.c index 05f7173..f495747 100644 --- a/algo/x12/x12-gate.c +++ b/algo/x12/x12-gate.c @@ -2,7 +2,11 @@ bool register_x12_algo( algo_gate_t* gate ) { -#if defined (X12_4WAY) +#if defined (X12_8WAY) + init_x12_8way_ctx(); + gate->scanhash = (void*)&scanhash_x12_8way; + gate->hash = (void*)&x12_8way_hash; +#elif defined (X12_4WAY) init_x12_4way_ctx(); gate->scanhash = (void*)&scanhash_x12_4way; gate->hash = (void*)&x12_4way_hash; @@ -11,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x12; gate->hash = (void*)&x12hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x12/x12-gate.h b/algo/x12/x12-gate.h index e26956e..998f09b 100644 --- a/algo/x12/x12-gate.h +++ 
b/algo/x12/x12-gate.h @@ -4,29 +4,36 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define X12_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X12_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X12_4WAY 1 #endif bool register_x12_algo( algo_gate_t* gate ); -#if defined(X12_4WAY) +#if defined(X12_8WAY) + +void x12_8way_hash( void *state, const void *input ); +int scanhash_x12_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_x12_8way_ctx(); + +#elif defined(X12_4WAY) void x12_4way_hash( void *state, const void *input ); - int scanhash_x12_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_x12_4way_ctx(); -#endif +#else void x12hash( void *state, const void *input ); - int scanhash_x12( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_x12_ctx(); #endif +#endif + diff --git a/algo/x12/x12.c b/algo/x12/x12.c index 87a4fa6..1545ca4 100644 --- a/algo/x12/x12.c +++ b/algo/x12/x12.c @@ -20,35 +20,40 @@ #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" #if defined(__AES__) #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif typedef struct { + sph_blake512_context blake; + sph_bmw512_context bmw; + sph_skein512_context skein; + sph_jh512_context jh; + sph_keccak512_context keccak; #if defined(__AES__) - hashState_groestl groestl; - hashState_echo echo; + hashState_groestl groestl; + hashState_echo echo; #else - sph_groestl512_context groestl; - sph_echo512_context echo; + sph_groestl512_context groestl; + sph_echo512_context echo; #endif - hashState_luffa luffa; - cubehashParam cubehash; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; + hashState_luffa luffa; + cubehashParam cubehash; + sph_shavite512_context shavite; + hashState_sd simd; + sph_hamsi512_context hamsi; } x12_ctx_holder; x12_ctx_holder x12_ctx; void init_x12_ctx() { + sph_blake512_init( &x12_ctx.blake ); + sph_bmw512_init( &x12_ctx.bmw ); + sph_skein512_init( &x12_ctx.skein); + sph_jh512_init( &x12_ctx.jh); + sph_keccak512_init( &x12_ctx.keccak); #if defined(__AES__) init_echo( &x12_ctx.echo, 512 ); init_groestl (&x12_ctx.groestl, 64 ); @@ -65,102 +70,59 @@ void init_x12_ctx() void x12hash(void *output, const void *input) { + unsigned char hash[128] __attribute__ ((aligned (32))); #define hashB hash+64 - x12_ctx_holder ctx; - memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) ); + x12_ctx_holder ctx; + memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) ); - // X11 algos + sph_blake512(&ctx.blake, input, 80); + sph_blake512_close(&ctx.blake, hash); - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; + sph_bmw512(&ctx.bmw, hash, 64); + sph_bmw512_close(&ctx.bmw, hash); - //---blake1--- + update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, + (const BitSequence*)hash, 64 ); - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; + cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, + (const byte*)hashB, 64 ); - //---bmw2--- + sph_shavite512( &ctx.shavite, hash, 64); + sph_shavite512_close( &ctx.shavite, hashB); - DECL_BMW; - BMW_I; - BMW_U; - - 
#define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groetl---- + update_final_sd( &ctx.simd, (BitSequence *)hash, + (const BitSequence *)hashB, 512 ); #if defined(__AES__) - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#else - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - //---skein4--- - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - //---jh5------ - - DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - //--- luffa7 - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence*)hash, 64 ); - - // 8 Cube - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hashB, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hashB); - - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hashB, 512 ); - - //11---echo--- - -#if defined(__AES__) - update_final_echo ( &ctx.echo, (BitSequence *)hashB, + update_final_echo ( &ctx.echo, (BitSequence *)hashB, (const BitSequence *)hash, 512 ); #else - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hashB); + sph_echo512(&ctx.echo, hash, 64); + sph_echo512_close(&ctx.echo, hashB); #endif - // 12 Hamsi +#if defined(__AES__) + update_and_final_groestl( &ctx.groestl, (char*)hash, + (const char*)hash, 512 ); +#else + sph_groestl512 (&ctx.groestl, hash, 64); + sph_groestl512_close(&ctx.groestl, hash); +#endif + + sph_skein512(&ctx.skein, hash, 64); + sph_skein512_close(&ctx.skein, hash); + + sph_jh512(&ctx.jh, hash, 64); + sph_jh512_close(&ctx.jh, hash); + + sph_keccak512(&ctx.keccak, hash, 64); + sph_keccak512_close(&ctx.keccak, hash); + sph_hamsi512(&ctx.hamsi, hashB, 64); sph_hamsi512_close(&ctx.hamsi, hash); - asm volatile ("emms"); memcpy(output, hashB, 32); } diff --git a/algo/x13/x13-4way.c b/algo/x13/x13-4way.c index 970f8e3..40b4b5b 100644 --- a/algo/x13/x13-4way.c +++ b/algo/x13/x13-4way.c @@ -1,7 +1,4 @@ #include "x13-gate.h" - -#if defined(X13_4WAY) - #include #include #include @@ -14,12 +11,270 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" +#if defined(X13_8WAY) + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; +} x13_8way_ctx_holder; + +x13_8way_ctx_holder x13_8way_ctx; + +void init_x13_8way_ctx() +{ + blake512_8way_init( &x13_8way_ctx.blake ); + bmw512_8way_init( &x13_8way_ctx.bmw ); + init_groestl( &x13_8way_ctx.groestl, 64 ); + skein512_8way_init( &x13_8way_ctx.skein ); + jh512_8way_init( &x13_8way_ctx.jh ); + keccak512_8way_init( &x13_8way_ctx.keccak ); + luffa_4way_init( &x13_8way_ctx.luffa, 512 ); + cube_4way_init( &x13_8way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x13_8way_ctx.shavite ); + simd_4way_init( &x13_8way_ctx.simd, 512 ); + init_echo( 
&x13_8way_ctx.echo, 512 ); + hamsi512_8way_init( &x13_8way_ctx.hamsi ); + sph_fugue512_init( &x13_8way_ctx.fugue ); +} + +void x13_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + + x13_8way_ctx_holder ctx; + memcpy( &ctx, &x13_8way_ctx, sizeof(x13_8way_ctx) ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + 
sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 13 Fugue serial + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash4, 
64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + memcpy( state, hash0, 32 ); + memcpy( state+ 32, hash1, 32 ); + memcpy( state+ 64, hash2, 32 ); + memcpy( state+ 96, hash3, 32 ); + memcpy( state+128, hash4, 32 ); + memcpy( state+160, hash5, 32 ); + memcpy( state+192, hash6, 32 ); + memcpy( state+224, hash7, 32 ); +} + + +int scanhash_x13_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + int thr_id = mythr->id; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + const uint32_t Htarg = ptarget[7]; + const uint32_t last_nonce = max_nonce -8; + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x13_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( ( hash+(i<<3) )[7] < Htarg + && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} + + +#elif defined(X13_4WAY) + + typedef struct { blake512_4way_context blake; bmw512_4way_context bmw; diff --git a/algo/x13/x13-gate.c b/algo/x13/x13-gate.c index 60973d3..366185c 100644 --- a/algo/x13/x13-gate.c +++ b/algo/x13/x13-gate.c @@ -2,7 +2,11 @@ bool register_x13_algo( algo_gate_t* gate ) { -#if defined (X13_4WAY) +#if defined (X13_8WAY) + init_x13_8way_ctx(); + gate->scanhash = (void*)&scanhash_x13_8way; + gate->hash = (void*)&x13_8way_hash; +#elif defined (X13_4WAY) init_x13_4way_ctx(); gate->scanhash = (void*)&scanhash_x13_4way; gate->hash = (void*)&x13_4way_hash; @@ -11,7 +15,7 @@ bool register_x13_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x13; gate->hash = (void*)&x13hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x13/x13-gate.h b/algo/x13/x13-gate.h index c61d7d6..6718eb3 100644 --- a/algo/x13/x13-gate.h +++ b/algo/x13/x13-gate.h @@ -4,29 +4,35 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define X13_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X13_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X13_4WAY 1 #endif bool register_x13_algo( algo_gate_t* gate ); -#if defined(X13_4WAY) +#if defined(X13_8WAY) + +void x13_8way_hash( void *state, const void *input ); +int scanhash_x13_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_x13_8way_ctx(); + +#elif defined(X13_4WAY) void 
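+/* Exactly one variant is compiled in: X13_8WAY needs the full AVX512
+   F/VL/DQ/BW feature set, X13_4WAY needs AVX2 plus AES-NI, and plain x13hash
+   is the fallback; register_x13_algo() wires the matching scanhash/hash pair
+   into the algo gate. */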
x13_4way_hash( void *state, const void *input ); - int scanhash_x13_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_x13_4way_ctx(); -#endif +#else void x13hash( void *state, const void *input ); - int scanhash_x13( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_x13_ctx(); #endif +#endif diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c index 5267d78..9de05d3 100644 --- a/algo/x14/x14-4way.c +++ b/algo/x14/x14-4way.c @@ -1,7 +1,4 @@ #include "x14-gate.h" - -#if defined(X14_4WAY) - #include #include #include @@ -13,6 +10,7 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" @@ -22,6 +20,263 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" +#if defined(X14_8WAY) + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; +} x14_8way_ctx_holder; + +x14_8way_ctx_holder x14_8way_ctx __attribute__ ((aligned (64))); + +void init_x14_8way_ctx() +{ + blake512_8way_init( &x14_8way_ctx.blake ); + bmw512_8way_init( &x14_8way_ctx.bmw ); + init_groestl( &x14_8way_ctx.groestl, 64 ); + skein512_8way_init( &x14_8way_ctx.skein ); + jh512_8way_init( &x14_8way_ctx.jh ); + keccak512_8way_init( &x14_8way_ctx.keccak ); + luffa_4way_init( &x14_8way_ctx.luffa, 512 ); + cube_4way_init( &x14_8way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x14_8way_ctx.shavite ); + simd_4way_init( &x14_8way_ctx.simd, 512 ); + init_echo( &x14_8way_ctx.echo, 512 ); + hamsi512_8way_init( &x14_8way_ctx.hamsi ); + sph_fugue512_init( &x14_8way_ctx.fugue ); + shabal512_8way_init( &x14_8way_ctx.shabal ); +}; + +void x14_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + + x14_8way_ctx_holder ctx; + memcpy( &ctx, &x14_8way_ctx, sizeof(x14_8way_ctx) ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + 
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, 
sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 13 Fugue serial + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + // 14 Shabal, parallel 32 bit + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, state ); +} + + +int scanhash_x14_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (64))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + const uint32_t Htarg = ptarget[7]; + int thr_id = mythr->id; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x14_8way_hash( hash, vdata ); + 
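+      // The Shabal output is left in 8x32 interleaved form: hash7 below points
+      // at word 7 of every lane, and only lanes below the 32-bit target are
+      // deinterleaved and run through the full 256-bit test.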
pdata[19] = n; + + uint32_t *hash7 = &(hash[7<<3]); + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane ] < Htarg ) + { + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(X14_4WAY) + typedef struct { blake512_4way_context blake; bmw512_4way_context bmw; @@ -61,11 +316,11 @@ void init_x14_4way_ctx() void x14_4way_hash( void *state, const void *input ) { + uint64_t vhash[8*4] __attribute__ ((aligned (128))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); x14_4way_ctx_holder ctx; memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) ); @@ -184,61 +439,49 @@ void x14_4way_hash( void *state, const void *input ) // 14 Shabal, parallel 32 bit intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, state ); } int scanhash_x14_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*16] __attribute__ ((aligned (64))); + uint32_t hash[4*16] __attribute__ ((aligned (128))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 4; __m256i *noncev = (__m256i*)vdata + 9; // aligned const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - + int thr_id = mythr->id; mm256_bswap32_intrlv80_4x64( vdata, pdata ); - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) + do + { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x14_4way_hash( hash, vdata ); + pdata[19] = n; + + uint32_t *hash7 = &(hash[7<<2]); + for ( int lane = 0; lane < 4; lane++ ) + if ( hash7[ lane ] < Htarg ) { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + uint32_t lane_hash[8]; + extr_lane_4x32( lane_hash, hash, lane, 256 ); - x14_4way_hash( hash, vdata ); - pdata[19] = n; - - uint32_t *hash7 = &(hash[7<<2]); - - for ( int lane = 0; lane < 4; lane++ ) - if ( ( hash7[ lane ] & mask ) == 0 ) - { - // deinterleave hash for lane - uint32_t lane_hash[8]; - extr_lane_4x32( lane_hash, hash, lane, 256 ); - - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 4; + } while ( ( n < 
last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x14/x14-gate.c b/algo/x14/x14-gate.c index 013aa10..851b7c3 100644 --- a/algo/x14/x14-gate.c +++ b/algo/x14/x14-gate.c @@ -2,7 +2,11 @@ bool register_x14_algo( algo_gate_t* gate ) { -#if defined (X14_4WAY) +#if defined (X14_8WAY) + init_x14_8way_ctx(); + gate->scanhash = (void*)&scanhash_x14_8way; + gate->hash = (void*)&x14_8way_hash; +#elif defined (X14_4WAY) init_x14_4way_ctx(); gate->scanhash = (void*)&scanhash_x14_4way; gate->hash = (void*)&x14_4way_hash; @@ -11,7 +15,7 @@ bool register_x14_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x14; gate->hash = (void*)&x14hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x14/x14-gate.h b/algo/x14/x14-gate.h index 9df974f..97f4800 100644 --- a/algo/x14/x14-gate.h +++ b/algo/x14/x14-gate.h @@ -4,20 +4,29 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define X14_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X14_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X14_4WAY 1 #endif bool register_x14_algo( algo_gate_t* gate ); -#if defined(X14_4WAY) +#if defined(X14_8WAY) + +void x14_8way_hash( void *state, const void *input ); +int scanhash_x14_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_x14_8way_ctx(); + +#elif defined(X14_4WAY) void x14_4way_hash( void *state, const void *input ); int scanhash_x14_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void init_x14_4way_ctx(); -#endif +#else void x14hash( void *state, const void *input ); int scanhash_x14( struct work *work, uint32_t max_nonce, @@ -26,3 +35,4 @@ void init_x14_ctx(); #endif +#endif diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c index 87fe361..a761af0 100644 --- a/algo/x15/x15-4way.c +++ b/algo/x15/x15-4way.c @@ -1,7 +1,4 @@ #include "x15-gate.h" - -#if defined(X15_4WAY) - #include #include #include @@ -14,6 +11,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -23,6 +21,309 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" +#if defined(X15_8WAY) + + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; +} x15_8way_ctx_holder; + +x15_8way_ctx_holder x15_8way_ctx __attribute__ ((aligned (64))); + +void init_x15_8way_ctx() +{ + blake512_8way_init( &x15_8way_ctx.blake ); + bmw512_8way_init( &x15_8way_ctx.bmw ); + init_groestl( &x15_8way_ctx.groestl, 64 ); + skein512_8way_init( &x15_8way_ctx.skein ); + jh512_8way_init( &x15_8way_ctx.jh ); + keccak512_8way_init( &x15_8way_ctx.keccak ); + luffa_4way_init( &x15_8way_ctx.luffa, 512 ); + cube_4way_init( 
&x15_8way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x15_8way_ctx.shavite ); + simd_4way_init( &x15_8way_ctx.simd, 512 ); + init_echo( &x15_8way_ctx.echo, 512 ); + hamsi512_8way_init( &x15_8way_ctx.hamsi ); + sph_fugue512_init( &x15_8way_ctx.fugue ); + shabal512_8way_init( &x15_8way_ctx.shabal ); + sph_whirlpool_init( &x15_8way_ctx.whirlpool ); +}; + +void x15_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + x15_8way_ctx_holder ctx; + memcpy( &ctx, &x15_8way_ctx, sizeof(x15_8way_ctx) ); + + // 1 Blake + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + // 2 Bmw + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 3 Groestl + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + // 5 JH + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + // 6 Keccak + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + + // 9 Shavite + sph_shavite512( 
&ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + // 10 Simd + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + + // 11 Echo + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + + // 12 Hamsi parallel 4way 64 bit + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 13 Fugue + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); 
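+   // Fugue is not vectorised, so each lane goes through the sph reference
+   // code with the context restored from x15_8way_ctx between lanes.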
+ sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + + // 14 Shabal, parallel 32 bit + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 15 Whirlpool + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + memcpy( state, hash0, 32 ); + memcpy( state+ 32, hash1, 32 ); + memcpy( state+ 64, hash2, 32 ); + memcpy( state+ 96, hash3, 32 ); + memcpy( state+128, hash4, 32 ); + memcpy( state+160, hash5, 32 ); + memcpy( state+192, hash6, 32 ); + memcpy( state+224, hash7, 32 ); +} + +int scanhash_x15_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; + const uint32_t Htarg = ptarget[7]; + int thr_id = mythr->id; + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); 
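+      // Eight consecutive nonces are byte-swapped and blended into the nonce
+      // word of the 8x64 interleaved block headers before each hash pass.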
+ + x15_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( ( hash+(i<<3) )[7] < Htarg ) + if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash, mythr, i ); + } + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(X15_4WAY) + typedef struct { blake512_4way_context blake; bmw512_4way_context bmw; @@ -64,11 +365,11 @@ void init_x15_4way_ctx() void x15_4way_hash( void *state, const void *input ) { + uint64_t vhash[8*4] __attribute__ ((aligned (128))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); x15_4way_ctx_holder ctx; memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) ); @@ -187,7 +488,7 @@ void x15_4way_hash( void *state, const void *input ) // 14 Shabal, parallel 32 bit intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -216,48 +517,37 @@ void x15_4way_hash( void *state, const void *input ) int scanhash_x15_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t hash[4*8] __attribute__ ((aligned (128))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 4; + __m256i *noncev = (__m256i*)vdata + 9; const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - + int thr_id = mythr->id; mm256_bswap32_intrlv80_4x64( vdata, pdata ); - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do + do + { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x15_4way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 4; i++ ) + if ( ( hash+(i<<3) )[7] < Htarg ) + if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + pdata[19] = n+i; + submit_lane_solution( work, hash, mythr, i ); + } + n += 4; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); - x15_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( (hash+(i<<3))[7] & mask ) == 0 ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash, mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x15/x15-gate.c b/algo/x15/x15-gate.c index da33192..c148618 100644 --- a/algo/x15/x15-gate.c +++ b/algo/x15/x15-gate.c @@ -2,7 +2,11 @@ bool 
register_x15_algo( algo_gate_t* gate ) { -#if defined (X15_4WAY) +#if defined (X15_8WAY) + init_x15_8way_ctx(); + gate->scanhash = (void*)&scanhash_x15_8way; + gate->hash = (void*)&x15_8way_hash; +#elif defined (X15_4WAY) init_x15_4way_ctx(); gate->scanhash = (void*)&scanhash_x15_4way; gate->hash = (void*)&x15_4way_hash; @@ -11,7 +15,7 @@ bool register_x15_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x15; gate->hash = (void*)&x15hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x15/x15-gate.h b/algo/x15/x15-gate.h index 8224fe2..44568c2 100644 --- a/algo/x15/x15-gate.h +++ b/algo/x15/x15-gate.h @@ -4,20 +4,30 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define X15_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X15_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X15_4WAY 1 #endif + bool register_x15_algo( algo_gate_t* gate ); -#if defined(X15_4WAY) +#if defined(X15_8WAY) + +void x15_8way_hash( void *state, const void *input ); +int scanhash_x15_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_x15_8way_ctx(); + +#elif defined(X15_4WAY) void x15_4way_hash( void *state, const void *input ); int scanhash_x15_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void init_x15_4way_ctx(); -#endif +#else void x15hash( void *state, const void *input ); int scanhash_x15( struct work *work, uint32_t max_nonce, @@ -26,3 +36,5 @@ void init_x15_ctx(); #endif +#endif + diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 6fbd93f..d724c78 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -5,9 +5,6 @@ * Optimized by JayDDee@github Jan 2018 */ #include "x16r-gate.h" - -#if defined (X16R_4WAY) - #include #include #include @@ -20,6 +17,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -32,6 +30,392 @@ static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; +#if defined (X16R_8WAY) + +union _x16r_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; +} __attribute__ ((aligned (64))); + +typedef union _x16r_8way_context_overlay x16r_8way_context_overlay; + +void x16r_8way_hash( void* output, const void* input ) +{ + uint32_t vhash[24*8] __attribute__ ((aligned (128))); + uint32_t hash0[24] __attribute__ ((aligned (64))); + uint32_t hash1[24] __attribute__ ((aligned (64))); + uint32_t hash2[24] __attribute__ ((aligned (64))); + uint32_t hash3[24] __attribute__ ((aligned (64))); + uint32_t hash4[24] __attribute__ ((aligned (64))); + uint32_t hash5[24] __attribute__ ((aligned (64))); + uint32_t hash6[24] __attribute__ 
((aligned (64))); + uint32_t hash7[24] __attribute__ ((aligned (64))); + x16r_8way_context_overlay ctx; + void *in0 = (void*) hash0; + void *in1 = (void*) hash1; + void *in2 = (void*) hash2; + void *in3 = (void*) hash3; + void *in4 = (void*) hash4; + void *in5 = (void*) hash5; + void *in6 = (void*) hash6; + void *in7 = (void*) hash7; + int size = 80; + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + input, 640 ); + + for ( int i = 0; i < 16; i++ ) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch ( algo ) + { + case BLAKE: + blake512_8way_init( &ctx.blake ); + if ( i == 0 ) + blake512_8way_update( &ctx.blake, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + blake512_8way_update( &ctx.blake, vhash, size ); + } + blake512_8way_close( &ctx.blake, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case BMW: + bmw512_8way_init( &ctx.bmw ); + if ( i == 0 ) + bmw512_8way_update( &ctx.bmw, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + bmw512_8way_update( &ctx.bmw, vhash, size ); + } + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case GROESTL: + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (const char*)in0, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (const char*)in1, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (const char*)in2, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (const char*)in3, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (const char*)in4, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (const char*)in5, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (const char*)in6, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (const char*)in7, size<<3 ); + break; + case SKEIN: + skein512_8way_init( &ctx.skein ); + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case JH: + jh512_8way_init( &ctx.jh ); + if ( i == 0 ) + jh512_8way_update( &ctx.jh, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + jh512_8way_update( &ctx.jh, vhash, size ); + } + jh512_8way_close( &ctx.jh, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case KECCAK: + keccak512_8way_init( &ctx.keccak ); + if ( i == 0 ) + keccak512_8way_update( &ctx.keccak, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + keccak512_8way_update( &ctx.keccak, vhash, size ); + } + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, 
vhash ); + break; + case LUFFA: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, size); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case CUBEHASH: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case SHAVITE: + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in0, size ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in1, size ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in2, size ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in3, size ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in4, size ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in5, size ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in6, size ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in7, size ); + sph_shavite512_close( &ctx.shavite, hash7 ); + break; + case SIMD: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case ECHO: + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash4, + (const BitSequence*)in4, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash5, + (const BitSequence*)in5, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash6, + (const BitSequence*)in6, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash7, + (const BitSequence*)in7, size<<3 ); + break; + case HAMSI: + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + + 
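+         // Hamsi is hashed 8-way over 64-bit lanes, so the lane buffers are
+         // re-interleaved to 8x64 form for the parallel implementation.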
hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case FUGUE: + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in0, size ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in1, size ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in2, size ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in3, size ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in4, size ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in5, size ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in6, size ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in7, size ); + sph_fugue512_close( &ctx.fugue, hash7 ); + break; + case SHABAL: + intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case WHIRLPOOL: + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + break; + case SHA_512: + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, size ); + sha512_8way_close( &ctx.sha512, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + } + size = 64; + } + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); + memcpy( output+64, hash2, 32 ); + memcpy( output+96, hash3, 32 ); + memcpy( output+128, hash4, 32 ); + memcpy( output+160, hash5, 32 ); + memcpy( output+192, hash6, 32 ); + memcpy( output+224, hash7, 32 ); +} + +int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t bedata1[2] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + 
uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + + if ( opt_benchmark ) + ptarget[7] = 0x0cff; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + bedata1[0] = bswap_32( pdata[1] ); + bedata1[1] = bswap_32( pdata[2] ); + const uint32_t ntime = bswap_32( pdata[17] ); + if ( s_ntime != ntime ) + { + x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder ); + s_ntime = ntime; + if ( opt_debug && !thr_id ) + applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); + } + + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x16r_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) + if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 8; + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); + + *hashes_done = n - first_nonce; + return 0; +} + + +#elif defined (X16R_4WAY) + union _x16r_4way_context_overlay { blake512_4way_context blake; @@ -50,16 +434,16 @@ union _x16r_4way_context_overlay shabal512_4way_context shabal; sph_whirlpool_context whirlpool; sha512_4way_context sha512; -}; +} __attribute__ ((aligned (64))); typedef union _x16r_4way_context_overlay x16r_4way_context_overlay; void x16r_4way_hash( void* output, const void* input ) { + uint32_t vhash[24*4] __attribute__ ((aligned (128))); uint32_t hash0[24] __attribute__ ((aligned (64))); uint32_t hash1[24] __attribute__ ((aligned (64))); uint32_t hash2[24] __attribute__ ((aligned (64))); uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t vhash[24*4] __attribute__ ((aligned (64))); x16r_4way_context_overlay ctx; void *in0 = (void*) hash0; void *in1 = (void*) hash1; @@ -86,7 +470,7 @@ void x16r_4way_hash( void* output, const void* input ) blake512_4way( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case BMW: bmw512_4way_init( &ctx.bmw ); @@ -98,7 +482,7 @@ void x16r_4way_hash( void* output, const void* input ) bmw512_4way( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case GROESTL: init_groestl( &ctx.groestl, 64 ); @@ -124,7 +508,7 @@ void x16r_4way_hash( void* output, const void* input ) skein512_4way( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case JH: jh512_4way_init( &ctx.jh ); @@ -136,7 +520,7 @@ void x16r_4way_hash( void* output, const void* input ) jh512_4way( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case KECCAK: keccak512_4way_init( &ctx.keccak ); @@ -148,17 +532,17 @@ void x16r_4way_hash( void* output, const void* input ) keccak512_4way( &ctx.keccak, vhash, size ); } 
keccak512_4way_close( &ctx.keccak, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case LUFFA: intrlv_2x128( vhash, in0, in1, size<<3 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); + dintrlv_2x128_512( hash0, hash1, vhash ); intrlv_2x128( vhash, in2, in3, size<<3 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_2x128( hash2, hash3, vhash, 512 ); + dintrlv_2x128_512( hash2, hash3, vhash ); break; case CUBEHASH: cubehashInit( &ctx.cube, 512, 16, 32 ); @@ -192,11 +576,11 @@ void x16r_4way_hash( void* output, const void* input ) intrlv_2x128( vhash, in0, in1, size<<3 ); simd_2way_init( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); + dintrlv_2x128_512( hash0, hash1, vhash ); intrlv_2x128( vhash, in2, in3, size<<3 ); simd_2way_init( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); + dintrlv_2x128_512( hash2, hash3, vhash ); break; case ECHO: init_echo( &ctx.echo, 512 ); @@ -217,7 +601,7 @@ void x16r_4way_hash( void* output, const void* input ) hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -238,7 +622,7 @@ void x16r_4way_hash( void* output, const void* input ) shabal512_4way_init( &ctx.shabal ); shabal512_4way( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); break; case WHIRLPOOL: sph_whirlpool_init( &ctx.whirlpool ); @@ -259,7 +643,7 @@ void x16r_4way_hash( void* output, const void* input ) sha512_4way_init( &ctx.sha512 ); sha512_4way( &ctx.sha512, vhash, size ); sha512_4way_close( &ctx.sha512, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; } size = 64; @@ -280,6 +664,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, uint32_t *ptarget = work->target; const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; __m256i *noncev = (__m256i*)vdata + 9; // aligned int thr_id = mythr->id; @@ -317,9 +702,9 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, submit_lane_solution( work, hash+(i<<3), mythr, i ); } n += 4; - } while ( likely( ( n < max_nonce ) && !(*restart) ) ); + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - *hashes_done = n - first_nonce + 1; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 1480813..7d76c0d 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -34,14 +34,17 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output ) bool register_x16r_algo( algo_gate_t* gate ) { -#if defined (X16R_4WAY) +#if defined (X16R_8WAY) + gate->scanhash = (void*)&scanhash_x16r_8way; + gate->hash = (void*)&x16r_8way_hash; +#elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16r_4way; gate->hash = (void*)&x16r_4way_hash; #else gate->scanhash = 
(void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -49,14 +52,17 @@ bool register_x16r_algo( algo_gate_t* gate ) bool register_x16rv2_algo( algo_gate_t* gate ) { -#if defined (X16R_4WAY) +#if defined (X16R_8WAY) + gate->scanhash = (void*)&scanhash_x16rv2_8way; + gate->hash = (void*)&x16rv2_8way_hash; +#elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16rv2_4way; gate->hash = (void*)&x16rv2_4way_hash; #else gate->scanhash = (void*)&scanhash_x16rv2; gate->hash = (void*)&x16rv2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -64,14 +70,17 @@ bool register_x16rv2_algo( algo_gate_t* gate ) bool register_x16s_algo( algo_gate_t* gate ) { -#if defined (X16R_4WAY) +#if defined (X16R_8WAY) + gate->scanhash = (void*)&scanhash_x16r_8way; + gate->hash = (void*)&x16r_8way_hash; +#elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16r_4way; gate->hash = (void*)&x16r_4way_hash; #else gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; @@ -196,28 +205,34 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) bool register_x16rt_algo( algo_gate_t* gate ) { -#if defined (X16R_4WAY) +#if defined (X16R_8WAY) + gate->scanhash = (void*)&scanhash_x16rt_8way; + gate->hash = (void*)&x16rt_8way_hash; +#elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16rt_4way; gate->hash = (void*)&x16rt_4way_hash; #else gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16rt_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; opt_target_factor = 256.0; return true; }; bool register_x16rt_veil_algo( algo_gate_t* gate ) { -#if defined (X16R_4WAY) +#if defined (X16R_8WAY) + gate->scanhash = (void*)&scanhash_x16rt_8way; + gate->hash = (void*)&x16rt_8way_hash; +#elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16rt_4way; gate->hash = (void*)&x16rt_4way_hash; #else gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16rt_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; gate->build_extraheader = (void*)&veil_build_extraheader; opt_target_factor = 256.0; return true; @@ -231,7 +246,7 @@ bool register_hex_algo( algo_gate_t* gate ) { gate->scanhash = (void*)&scanhash_hex; gate->hash = (void*)&hex_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; opt_target_factor = 128.0; return true; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index dd4c216..ff6d44d 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -6,8 +6,10 @@ #include #include -#if defined(__AVX2__) && defined(__AES__) - #define X16R_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && 
defined(__AVX512BW__) + #define X16R_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X16R_4WAY 1 #endif enum x16r_Algo { @@ -44,7 +46,20 @@ bool register_x16rt_algo( algo_gate_t* gate ); bool register_hex__algo( algo_gate_t* gate ); bool register_x21s__algo( algo_gate_t* gate ); -#if defined(X16R_4WAY) +#if defined(X16R_8WAY) + +void x16r_8way_hash( void *state, const void *input ); +int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +void x16rv2_8way_hash( void *state, const void *input ); +int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void x16rt_8way_hash( void *state, const void *input ); +int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(X16R_4WAY) void x16r_4way_hash( void *state, const void *input ); int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, @@ -58,12 +73,7 @@ void x16rt_4way_hash( void *state, const void *input ); int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void x21s_4way_hash( void *state, const void *input ); -int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool x21s_4way_thread_init(); - -#endif +#else void x16r_hash( void *state, const void *input ); int scanhash_x16r( struct work *work, uint32_t max_nonce, @@ -77,9 +87,16 @@ void x16rt_hash( void *state, const void *input ); int scanhash_x16rt( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void hex_hash( void *state, const void *input ); -int scanhash_hex( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); +#endif + +#if defined(X16R_4WAY) + +void x21s_4way_hash( void *state, const void *input ); +int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +bool x21s_4way_thread_init(); + +#else void x21s_hash( void *state, const void *input ); int scanhash_x21s( struct work *work, uint32_t max_nonce, @@ -88,3 +105,9 @@ bool x21s_thread_init(); #endif +void hex_hash( void *state, const void *input ); +int scanhash_hex( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c index 4e28d40..8118bc8 100644 --- a/algo/x16/x16rt-4way.c +++ b/algo/x16/x16rt-4way.c @@ -1,7 +1,4 @@ #include "x16r-gate.h" - -#if defined (X16R_4WAY) - #include #include #include @@ -15,6 +12,7 @@ #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -26,6 +24,391 @@ static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; +#if defined (X16R_8WAY) + +union _x16rt_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + 
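+   // All contexts overlay the same storage in this union; only one member is
+   // live at a time as the hash order is processed, which keeps the
+   // per-call footprint small.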
shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; +} __attribute__ ((aligned (64))); + +typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay; + +void x16rt_8way_hash( void* output, const void* input ) +{ + uint32_t vhash[24*8] __attribute__ ((aligned (128))); + uint32_t hash0[24] __attribute__ ((aligned (64))); + uint32_t hash1[24] __attribute__ ((aligned (64))); + uint32_t hash2[24] __attribute__ ((aligned (64))); + uint32_t hash3[24] __attribute__ ((aligned (64))); + uint32_t hash4[24] __attribute__ ((aligned (64))); + uint32_t hash5[24] __attribute__ ((aligned (64))); + uint32_t hash6[24] __attribute__ ((aligned (64))); + uint32_t hash7[24] __attribute__ ((aligned (64))); + x16rt_8way_context_overlay ctx; + void *in0 = (void*) hash0; + void *in1 = (void*) hash1; + void *in2 = (void*) hash2; + void *in3 = (void*) hash3; + void *in4 = (void*) hash4; + void *in5 = (void*) hash5; + void *in6 = (void*) hash6; + void *in7 = (void*) hash7; + int size = 80; + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + input, 640 ); + + for ( int i = 0; i < 16; i++ ) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch ( algo ) + { + case BLAKE: + blake512_8way_init( &ctx.blake ); + if ( i == 0 ) + blake512_8way_update( &ctx.blake, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + blake512_8way_update( &ctx.blake, vhash, size ); + } + blake512_8way_close( &ctx.blake, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case BMW: + bmw512_8way_init( &ctx.bmw ); + if ( i == 0 ) + bmw512_8way_update( &ctx.bmw, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + bmw512_8way_update( &ctx.bmw, vhash, size ); + } + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case GROESTL: + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (const char*)in0, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (const char*)in1, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (const char*)in2, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (const char*)in3, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (const char*)in4, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (const char*)in5, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (const char*)in6, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (const char*)in7, size<<3 ); + break; + case SKEIN: + skein512_8way_init( &ctx.skein ); + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case JH: + jh512_8way_init( &ctx.jh ); + if ( i == 0 ) + jh512_8way_update( &ctx.jh, 
input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + jh512_8way_update( &ctx.jh, vhash, size ); + } + jh512_8way_close( &ctx.jh, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case KECCAK: + keccak512_8way_init( &ctx.keccak ); + if ( i == 0 ) + keccak512_8way_update( &ctx.keccak, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + keccak512_8way_update( &ctx.keccak, vhash, size ); + } + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case LUFFA: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, size); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case CUBEHASH: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case SHAVITE: + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in0, size ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in1, size ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in2, size ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in3, size ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in4, size ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in5, size ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in6, size ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in7, size ); + sph_shavite512_close( &ctx.shavite, hash7 ); + break; + case SIMD: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case ECHO: + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, 
(BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash4, + (const BitSequence*)in4, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash5, + (const BitSequence*)in5, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash6, + (const BitSequence*)in6, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash7, + (const BitSequence*)in7, size<<3 ); + break; + case HAMSI: + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case FUGUE: + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in0, size ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in1, size ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in2, size ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in3, size ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in4, size ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in5, size ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in6, size ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in7, size ); + sph_fugue512_close( &ctx.fugue, hash7 ); + break; + case SHABAL: + intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case WHIRLPOOL: + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + break; + case SHA_512: + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, size ); + sha512_8way_close( &ctx.sha512, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, 
hash5, hash6, + hash7, vhash ); + break; + } + size = 64; + } + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); + memcpy( output+64, hash2, 32 ); + memcpy( output+96, hash3, 32 ); + memcpy( output+128, hash4, 32 ); + memcpy( output+160, hash5, 32 ); + memcpy( output+192, hash6, 32 ); + memcpy( output+224, hash7, 32 ); +} + +int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t _ALIGN(64) timeHash[8*8]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + + if ( opt_benchmark ) + ptarget[7] = 0x0cff; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + uint32_t ntime = bswap_32( pdata[17] ); + if ( s_ntime != ntime ) + { + x16rt_getTimeHash( ntime, &timeHash ); + x16rt_getAlgoString( &timeHash[0], hashOrder ); + s_ntime = ntime; + if ( opt_debug && !thr_id ) + applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", + hashOrder, ntime, timeHash ); + } + + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x16rt_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) + if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 8; + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined (X16R_4WAY) + union _x16rt_4way_context_overlay { blake512_4way_context blake; diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index 6cbb0f2..7dd6306 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -5,9 +5,6 @@ * Optimized by JayDDee@github Jan 2018 */ #include "x16r-gate.h" - -#if defined (X16R_4WAY) - #include #include #include @@ -21,6 +18,7 @@ #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -33,6 +31,477 @@ static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; +#if defined (X16R_8WAY) + +union _x16rv2_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + sph_tiger_context tiger; +} __attribute__ ((aligned (64))); + +typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay; + +void x16rv2_8way_hash( void* output, const void* input ) +{ + uint32_t vhash[24*8] __attribute__ ((aligned (128))); 
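+   // Working buffers: vhash holds the interleaved lanes consumed by the
+   // parallel (*_8way / 4-way) algorithms, while hash0..hash7 keep each lane
+   // in linear form for the serial sph_* functions and for the Tiger
+   // pre-hash that x16rv2 applies before Keccak, Luffa and SHA-512.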
+ uint32_t hash0[24] __attribute__ ((aligned (64))); + uint32_t hash1[24] __attribute__ ((aligned (64))); + uint32_t hash2[24] __attribute__ ((aligned (64))); + uint32_t hash3[24] __attribute__ ((aligned (64))); + uint32_t hash4[24] __attribute__ ((aligned (64))); + uint32_t hash5[24] __attribute__ ((aligned (64))); + uint32_t hash6[24] __attribute__ ((aligned (64))); + uint32_t hash7[24] __attribute__ ((aligned (64))); + x16rv2_8way_context_overlay ctx; + void *in0 = (void*) hash0; + void *in1 = (void*) hash1; + void *in2 = (void*) hash2; + void *in3 = (void*) hash3; + void *in4 = (void*) hash4; + void *in5 = (void*) hash5; + void *in6 = (void*) hash6; + void *in7 = (void*) hash7; + int size = 80; + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + input, 640 ); + + for ( int i = 0; i < 16; i++ ) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch ( algo ) + { + case BLAKE: + blake512_8way_init( &ctx.blake ); + if ( i == 0 ) + blake512_8way_update( &ctx.blake, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + blake512_8way_update( &ctx.blake, vhash, size ); + } + blake512_8way_close( &ctx.blake, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case BMW: + bmw512_8way_init( &ctx.bmw ); + if ( i == 0 ) + bmw512_8way_update( &ctx.bmw, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + bmw512_8way_update( &ctx.bmw, vhash, size ); + } + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case GROESTL: + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (const char*)in0, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (const char*)in1, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (const char*)in2, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (const char*)in3, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (const char*)in4, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (const char*)in5, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (const char*)in6, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (const char*)in7, size<<3 ); + break; + case SKEIN: + skein512_8way_init( &ctx.skein ); + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case JH: + jh512_8way_init( &ctx.jh ); + if ( i == 0 ) + jh512_8way_update( &ctx.jh, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + jh512_8way_update( &ctx.jh, vhash, size ); + } + jh512_8way_close( &ctx.jh, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case KECCAK: + sph_tiger_init( 
&ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in4, size ); + sph_tiger_close( &ctx.tiger, hash4 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in5, size ); + sph_tiger_close( &ctx.tiger, hash5 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in6, size ); + sph_tiger_close( &ctx.tiger, hash6 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in7, size ); + sph_tiger_close( &ctx.tiger, hash7 ); + + for ( int i = (24/4); i < (64/4); i++ ) + hash0[i] = hash1[i] = hash2[i] = hash3[i] = + hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0; + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7 ); + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case LUFFA: + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in4, size ); + sph_tiger_close( &ctx.tiger, hash4 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in5, size ); + sph_tiger_close( &ctx.tiger, hash5 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in6, size ); + sph_tiger_close( &ctx.tiger, hash6 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in7, size ); + sph_tiger_close( &ctx.tiger, hash7 ); + + for ( int i = (24/4); i < (64/4); i++ ) + hash0[i] = hash1[i] = hash2[i] = hash3[i] = + hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0; + + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case CUBEHASH: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case SHAVITE: + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in0, size ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in1, size ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); 
+ sph_shavite512( &ctx.shavite, in2, size ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in3, size ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in4, size ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in5, size ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in6, size ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in7, size ); + sph_shavite512_close( &ctx.shavite, hash7 ); + break; + case SIMD: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case ECHO: + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash4, + (const BitSequence*)in4, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash5, + (const BitSequence*)in5, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash6, + (const BitSequence*)in6, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash7, + (const BitSequence*)in7, size<<3 ); + break; + case HAMSI: + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case FUGUE: + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in0, size ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in1, size ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in2, size ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in3, size ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in4, size ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in5, size ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in6, size ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in7, size ); + sph_fugue512_close( &ctx.fugue, hash7 ); + break; + case SHABAL: + intrlv_8x32( vhash, in0, in1, 
in2, in3, in4, in5, in6, in7, + size<<3 ); + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case WHIRLPOOL: + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + break; + case SHA_512: + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in4, size ); + sph_tiger_close( &ctx.tiger, hash4 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in5, size ); + sph_tiger_close( &ctx.tiger, hash5 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in6, size ); + sph_tiger_close( &ctx.tiger, hash6 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in7, size ); + sph_tiger_close( &ctx.tiger, hash7 ); + + for ( int i = (24/4); i < (64/4); i++ ) + hash0[i] = hash1[i] = hash2[i] = hash3[i] = + hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0; + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7 ); + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + } + size = 64; + } + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); + memcpy( output+64, hash2, 32 ); + memcpy( output+96, hash3, 32 ); + memcpy( output+128, hash4, 32 ); + memcpy( output+160, hash5, 32 ); + memcpy( output+192, hash6, 32 ); + memcpy( output+224, hash7, 32 ); +} + +int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t bedata1[2] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + __m512i *noncev = 
(__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + + if ( opt_benchmark ) + ptarget[7] = 0x0cff; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + bedata1[0] = bswap_32( pdata[1] ); + bedata1[1] = bswap_32( pdata[2] ); + const uint32_t ntime = bswap_32( pdata[17] ); + if ( s_ntime != ntime ) + { + x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder ); + s_ntime = ntime; + if ( opt_debug && !thr_id ) + applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); + } + + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x16rv2_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) + if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 8; + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); + + *hashes_done = n - first_nonce; + return 0; +} + + +#elif defined (X16R_4WAY) + + + union _x16rv2_4way_context_overlay { blake512_4way_context blake; diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index 4406529..3a0b248 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -1,7 +1,4 @@ #include "sonoa-gate.h" - -#if defined(SONOA_4WAY) - #include #include #include @@ -25,6 +22,1338 @@ #include "algo/haval/haval-hash-4way.h" #include "algo/sha/sha-hash-4way.h" +#if defined(SONOA_8WAY) + +union _sonoa_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + haval256_5_8way_context haval; +} __attribute__ ((aligned (64))); + +typedef union _sonoa_8way_context_overlay sonoa_8way_context_overlay; + +void sonoa_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhashA[8*8] __attribute__ ((aligned (64))); + uint64_t vhashB[8*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + sonoa_8way_context_overlay ctx; + +// 1 + + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( 
&ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + 
update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + +// 2 + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + 
sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + +// 3 + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + 
keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue 
); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + +// 4 + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + 
sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + rintrlv_8x32_8x64( vhashA, vhash, 512 ); + + hamsi512_8way_init( &ctx.hamsi 
); + hamsi512_8way_update( &ctx.hamsi, vhashA, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + +// 5 + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + rintrlv_8x64_8x32( vhashA, vhash, 512 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhashA, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + 
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence 
*)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + +// 6 + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, 
(char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + 
update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, 
hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + +// 7 + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + 
sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + 
vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + + rintrlv_8x64_8x32( vhashA, vhash, 512 ); + + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhashA, 64 ); + haval256_5_8way_close( &ctx.haval, state ); +} + +int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + sonoa_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if unlikely( ( hash7[ lane ] <= Htarg ) ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + + *hashes_done = n - first_nonce; + return 0; +} + + + +#elif defined(SONOA_4WAY) + union _sonoa_4way_context_overlay { blake512_4way_context blake; diff --git a/algo/x17/sonoa-gate.c b/algo/x17/sonoa-gate.c index fea4d39..3687733 100644 --- a/algo/x17/sonoa-gate.c +++ b/algo/x17/sonoa-gate.c @@ -2,8 +2,10 @@ bool register_sonoa_algo( algo_gate_t* gate ) { -#if defined (SONOA_4WAY) -// init_sonoa_4way_ctx(); +#if defined (SONOA_8WAY) + gate->scanhash = (void*)&scanhash_sonoa_8way; + gate->hash = (void*)&sonoa_8way_hash; +#elif defined (SONOA_4WAY) gate->scanhash = (void*)&scanhash_sonoa_4way; gate->hash = (void*)&sonoa_4way_hash; #else @@ -11,7 +13,7 @@ bool register_sonoa_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_sonoa; gate->hash = (void*)&sonoa_hash; #endif - 
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x17/sonoa-gate.h b/algo/x17/sonoa-gate.h index c97a375..aaad2a4 100644 --- a/algo/x17/sonoa-gate.h +++ b/algo/x17/sonoa-gate.h @@ -4,29 +4,33 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define SONOA_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define SONOA_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define SONOA_4WAY 1 #endif bool register_sonoa_algo( algo_gate_t* gate ); -#if defined(SONOA_4WAY) +#if defined(SONOA_8WAY) + +void sonoa_8way_hash( void *state, const void *input ); +int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(SONOA_4WAY) void sonoa_4way_hash( void *state, const void *input ); - int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -//void init_sonoa_4way_ctx(); - -#endif +#else void sonoa_hash( void *state, const void *input ); - int scanhash_sonoa( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_sonoa_ctx(); #endif +#endif diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index f913644..18eed41 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -1,7 +1,4 @@ #include "x17-gate.h" - -#if defined(X17_4WAY) - #include #include #include @@ -14,6 +11,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cube-hash-2way.h" +#include "algo/shavite/sph_shavite.h" #include "algo/shavite/shavite-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -24,6 +22,309 @@ #include "algo/haval/haval-hash-4way.h" #include "algo/sha/sha-hash-4way.h" +#if defined(X17_8WAY) + +union _x17_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + haval256_5_8way_context haval; +} __attribute__ ((aligned (64))); +typedef union _x17_8way_context_overlay x17_8way_context_overlay; + +void x17_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[8*8] __attribute__ ((aligned (64))); + uint64_t vhash1[8*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + x17_8way_context_overlay ctx; + + // 1 Blake parallel 4 way 64 bit + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + // 2 Bmw + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + 
bmw512_8way_close( &ctx.bmw, vhash ); + + // Serialize + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 3 Groestl + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + // Parallellize + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + // 4 Skein parallel 4 way 64 bit + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + // 5 JH + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + // 6 Keccak + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + + // 7 Luffa + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + // 8 Cubehash + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + + // 9 Shavite + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + // 10 Simd + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, 
hash6, hash7 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + + + // 11 Echo serial + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + // 12 Hamsi parallel 4 way 64 bit + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 13 Fugue serial + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + // 14 Shabal, parallel 4 way 32 bit + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 15 Whirlpool serial + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + 
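+   // Whirlpool, like the Groestl, Shavite, Echo and Fugue steps above, is
+   // hashed one lane at a time with the same init/update/close sequence.
+   // A small helper could factor that repetition out; the one below is only
+   // a sketch (whirlpool512_full is not an existing API in this patch):
+   //
+   //   static inline void whirlpool512_full( sph_whirlpool_context *c,
+   //                                         void *hash, const void *data,
+   //                                         size_t len )
+   //   {
+   //      sph_whirlpool_init( c );
+   //      sph_whirlpool( c, data, len );
+   //      sph_whirlpool_close( c, hash );
+   //   }
+   //
+   //   e.g. whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 );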
sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + // 16 SHA512 parallel 64 bit + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + + // 17 Haval parallel 32 bit + rintrlv_8x64_8x32( vhash0, vhash, 512 ); + + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhash0, 64 ); + haval256_5_8way_close( &ctx.haval, state ); +} + +int scanhash_x17_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + x17_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if unlikely( ( hash7[ lane ] <= Htarg ) ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(X17_4WAY) + union _x17_4way_context_overlay { blake512_4way_context blake; diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c index 69d28f6..73ce607 100644 --- a/algo/x17/x17-gate.c +++ b/algo/x17/x17-gate.c @@ -2,14 +2,17 @@ bool register_x17_algo( algo_gate_t* gate ) { -#if defined (X17_4WAY) +#if defined (X17_8WAY) + gate->scanhash = (void*)&scanhash_x17_8way; + gate->hash = (void*)&x17_8way_hash; +#elif defined (X17_4WAY) gate->scanhash = (void*)&scanhash_x17_4way; gate->hash = (void*)&x17_4way_hash; #else gate->scanhash = (void*)&scanhash_x17; gate->hash = (void*)&x17_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x17/x17-gate.h b/algo/x17/x17-gate.h index 9a40b34..014caef 100644 --- a/algo/x17/x17-gate.h +++ b/algo/x17/x17-gate.h @@ -4,13 +4,20 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define X17_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X17_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X17_4WAY 1 #endif bool register_x17_algo( algo_gate_t* gate ); -#if defined(X17_4WAY) +#if defined(X17_8WAY) + +void x17_8way_hash( void *state, const void *input ); +int scanhash_x17_8way( struct work 
*work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#elif defined(X17_4WAY) void x17_4way_hash( void *state, const void *input ); int scanhash_x17_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 91a2a9f..28bc1c2 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -1,7 +1,4 @@ #include "xevan-gate.h" - -#if defined(XEVAN_4WAY) - #include #include #include @@ -15,6 +12,7 @@ #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/shavite-hash-2way.h" +#include "algo/shavite/sph_shavite.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -25,6 +23,515 @@ #include "algo/sha/sha-hash-4way.h" #include "algo/haval/haval-hash-4way.h" +#if defined(XEVAN_8WAY) + +union _xevan_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + haval256_5_8way_context haval; +} __attribute__ ((aligned (64))); +typedef union _xevan_8way_context_overlay xevan_8way_context_overlay; + +void xevan_8way_hash( void *output, const void *input ) +{ + uint64_t vhash[16<<3] __attribute__ ((aligned (128))); + uint64_t vhashA[16<<3] __attribute__ ((aligned (64))); + uint64_t vhashB[16<<3] __attribute__ ((aligned (64))); + uint64_t hash0[16] __attribute__ ((aligned (64))); + uint64_t hash1[16] __attribute__ ((aligned (64))); + uint64_t hash2[16] __attribute__ ((aligned (64))); + uint64_t hash3[16] __attribute__ ((aligned (64))); + uint64_t hash4[16] __attribute__ ((aligned (64))); + uint64_t hash5[16] __attribute__ ((aligned (64))); + uint64_t hash6[16] __attribute__ ((aligned (64))); + uint64_t hash7[16] __attribute__ ((aligned (64))); + const int dataLen = 128; + xevan_8way_context_overlay ctx __attribute__ ((aligned (64))); + + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + memset( &vhash[8<<3], 0, 64<<3 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, dataLen ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, + dataLen<<3 ); + init_groestl( 
&ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, + dataLen<<3 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, dataLen ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, dataLen ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, dataLen ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, dataLen ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, dataLen ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, dataLen ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, dataLen ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, dataLen ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, dataLen ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, dataLen ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, dataLen ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); + intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) 
hash5, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, dataLen<<3 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, dataLen ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, dataLen ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, dataLen ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, dataLen ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, dataLen ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, dataLen ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, dataLen ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, dataLen ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, dataLen ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, dataLen ); + sha512_8way_close( &ctx.sha512, vhash ); + + rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 ); + + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhashA, dataLen ); + haval256_5_8way_close( &ctx.haval, vhashA ); + + rintrlv_8x32_8x64( vhash, vhashA, dataLen<<3 ); + + memset( &vhash[ 4<<3 
], 0, (dataLen-32) << 3 ); + + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, vhash, dataLen ); + blake512_8way_close(&ctx.blake, vhash); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, dataLen ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, + dataLen<<3 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, dataLen ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, dataLen ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, dataLen ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, dataLen ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, dataLen ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, dataLen ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, dataLen ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, dataLen ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, dataLen ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, dataLen ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, dataLen ); + sph_shavite512_close( 
&ctx.shavite, hash7 ); + + intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); + intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, dataLen<<3 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, dataLen ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, dataLen ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, dataLen ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, dataLen ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, dataLen ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, dataLen ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, dataLen ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, dataLen ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, dataLen ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, 
hash2, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, dataLen ); + sha512_8way_close( &ctx.sha512, vhash ); + + rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 ); + + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhashA, dataLen ); + haval256_5_8way_close( &ctx.haval, output ); +} + +int scanhash_xevan_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + xevan_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if unlikely( ( hash7[ lane ] <= Htarg ) ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(XEVAN_4WAY) + union _xevan_4way_context_overlay { blake512_4way_context blake; diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c index 96b811c..8cb86a4 100644 --- a/algo/x17/xevan-gate.c +++ b/algo/x17/xevan-gate.c @@ -2,8 +2,10 @@ bool register_xevan_algo( algo_gate_t* gate ) { -#if defined (XEVAN_4WAY) -// init_xevan_4way_ctx(); +#if defined (XEVAN_8WAY) + gate->scanhash = (void*)&scanhash_xevan_8way; + gate->hash = (void*)&xevan_8way_hash; +#elif defined (XEVAN_4WAY) gate->scanhash = (void*)&scanhash_xevan_4way; gate->hash = (void*)&xevan_4way_hash; #else diff --git a/algo/x17/xevan-gate.h b/algo/x17/xevan-gate.h index c614c0b..be0dfbc 100644 --- a/algo/x17/xevan-gate.h +++ b/algo/x17/xevan-gate.h @@ -4,13 +4,21 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define XEVAN_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define XEVAN_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define XEVAN_4WAY 1 #endif 
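+// Each gate header selects at most one vector width at compile time: 8-way
+// when full AVX-512 (F/VL/DQ/BW) is available, otherwise 4-way with AVX2+AES,
+// otherwise the scalar reference code. The matching register function then
+// wires up the widest variant that was built, roughly as follows (a sketch
+// using a hypothetical algo "foo"):
+//
+//   #if defined(FOO_8WAY)
+//      gate->hash = (void*)&foo_8way_hash;
+//   #elif defined(FOO_4WAY)
+//      gate->hash = (void*)&foo_4way_hash;
+//   #else
+//      gate->hash = (void*)&foo_hash;
+//   #endif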
bool register_xevan_algo( algo_gate_t* gate ); -#if defined(XEVAN_4WAY) +#if defined(XEVAN_8WAY) + +void xevan_8way_hash( void *state, const void *input ); + +int scanhash_xevan_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#elif defined(XEVAN_4WAY) void xevan_4way_hash( void *state, const void *input ); @@ -19,7 +27,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce, //void init_xevan_4way_ctx(); -#endif +#else void xevan_hash( void *state, const void *input ); @@ -30,3 +38,4 @@ void init_xevan_ctx(); #endif +#endif diff --git a/build-allarch.sh b/build-allarch.sh index e6ab8d5..6e8fd89 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,6 +4,8 @@ # during develpment. However the information contained may provide compilation # tips to users. +rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen + make distclean || echo clean rm -f config.status ./autogen.sh || echo done diff --git a/configure b/configure index 94be3e8..2c54f15 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.2. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.5. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.10.2' -PACKAGE_STRING='cpuminer-opt 3.10.2' +PACKAGE_VERSION='3.10.5' +PACKAGE_STRING='cpuminer-opt 3.10.5' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.10.2 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.10.5 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.10.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.10.5:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.10.2 +cpuminer-opt configure 3.10.5 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.10.2, which was +It was created by cpuminer-opt $as_me 3.10.5, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.10.2' + VERSION='3.10.5' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.10.2, which was +This file was extended by cpuminer-opt $as_me 3.10.5, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.10.2 +cpuminer-opt config.status 3.10.5 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index eae85ca..467397e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.10.2]) +AC_INIT([cpuminer-opt], [3.10.5]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 85cdb40..572cdef 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -3410,39 +3410,39 @@ bool check_cpu_capability () printf(".\n"); #endif - printf("CPU features:"); - if ( cpu_has_vaes ) printf( " VAES" ); - else if ( cpu_has_aes ) printf( " AES" ); - if ( cpu_has_sha ) printf( " SHA" ); + printf("CPU features: "); if ( cpu_has_avx512 ) printf( " AVX512" ); - else if ( cpu_has_avx2 ) printf( " AVX2" ); - else if ( cpu_has_avx ) printf( " AVX" ); + else if ( cpu_has_avx2 ) printf( " AVX2 " ); + else if ( cpu_has_avx ) printf( " AVX " ); else if ( cpu_has_sse42 ) printf( " SSE4.2" ); - else if ( cpu_has_sse2 ) printf( " SSE2" ); + else if ( cpu_has_sse2 ) printf( " SSE2 " ); + if ( cpu_has_vaes ) printf( " VAES" ); + else if ( cpu_has_aes ) printf( " AES" ); + if ( cpu_has_sha ) printf( " SHA" ); - printf(".\nSW features:"); - if ( sw_has_vaes ) printf( " VAES" ); - else if ( sw_has_aes ) printf( " AES" ); - if ( sw_has_sha ) printf( " SHA" ); + printf("\nSW features: "); if ( sw_has_avx512 ) printf( " AVX512" ); - else if ( sw_has_avx2 ) printf( " AVX2" ); - else if ( sw_has_avx ) printf( " AVX" ); + else if ( sw_has_avx2 ) printf( " AVX2 " ); + else if ( sw_has_avx ) printf( " AVX " ); else if ( sw_has_sse42 ) printf( " SSE4.2" ); - else if ( sw_has_sse2 ) printf( " SSE2" ); + else if ( sw_has_sse2 ) printf( " SSE2 " ); + if ( sw_has_vaes ) printf( " VAES" ); + else if ( sw_has_aes ) printf( " AES " ); + if ( sw_has_sha ) printf( " SHA" ); - printf(".\nAlgo features:"); + printf("\nAlgo features:"); if ( algo_features == EMPTY_SET ) printf( " None" ); else { - if ( algo_has_vaes ) printf( " VAES" ); - else if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_sha ) printf( " SHA" ); if ( algo_has_avx512 ) printf( " AVX512" ); - else if ( algo_has_avx2 ) printf( " AVX2" ); + else if ( algo_has_avx2 ) printf( " AVX2 " ); else if ( algo_has_sse42 ) printf( " SSE4.2" ); - else if ( algo_has_sse2 ) printf( " SSE2" ); + else if ( algo_has_sse2 ) printf( " SSE2 " ); + if ( algo_has_vaes ) printf( " VAES" ); + else if ( algo_has_aes ) printf( " AES " ); + if ( algo_has_sha ) printf( " SHA" ); } - printf(".\n"); + printf("\n"); // Check for CPU and build incompatibilities if ( !cpu_has_sse2 ) @@ -3483,19 +3483,19 @@ bool check_cpu_capability () use_sha || use_vaes ); // Display best options - printf( "Start mining with" ); + printf( "\nStarting miner with" ); if ( use_none ) printf( " no optimizations" ); else { - if ( use_vaes ) printf( " VAES" ); - else if ( use_aes ) printf( " AES" ); if ( use_avx512 ) printf( " AVX512" ); else if ( use_avx2 ) printf( " AVX2" ); else if ( use_sse42 ) printf( " SSE4.2" ); else if ( use_sse2 ) printf( " SSE2" ); + if ( use_vaes ) printf( " VAES" ); + else if ( use_aes ) printf( " AES" ); if ( use_sha ) printf( " SHA" ); } - printf( ".\n\n" ); + printf( "...\n\n" ); return true; } diff --git 
a/simd-utils/intrlv.h b/simd-utils/intrlv.h index db9c0e9..961c57d 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -897,7 +897,7 @@ static inline void intrlv_16x32_512( void *dst, const void *s00, *( (uint32_t*)(d06) +(i) ) = s[ 6]; \ *( (uint32_t*)(d07) +(i) ) = s[ 7]; \ *( (uint32_t*)(d08) +(i) ) = s[ 8]; \ - *( (uint32_t*)(d09) +(i) ) = s[ 0]; \ + *( (uint32_t*)(d09) +(i) ) = s[ 9]; \ *( (uint32_t*)(d10) +(i) ) = s[10]; \ *( (uint32_t*)(d11) +(i) ) = s[11]; \ *( (uint32_t*)(d12) +(i) ) = s[12]; \ @@ -2055,7 +2055,7 @@ static inline void intrlv_2x256( void *dst, const void *src0, if ( bit_len <= 512 ) return; d[4] = s0[2]; if ( bit_len <= 640 ) return; - d[5] = s1[2]; + d[5] = s1[2]; d[6] = s0[3]; d[7] = s1[3]; } @@ -2075,9 +2075,6 @@ static inline void dintrlv_2x256( void *dst0, void *dst1, d0[3] = s[6]; d1[3] = s[7]; } - - - #endif // AVX /////////////////////////// @@ -2165,7 +2162,9 @@ static inline void rintrlv_4x32_4x64( void *dst, d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 5] ); d[ 6] = _mm_unpacklo_epi32( s[ 6], s[ 7] ); d[ 7] = _mm_unpackhi_epi32( s[ 6], s[ 7] ); + if ( bit_len <= 256 ) return; + d[ 8] = _mm_unpacklo_epi32( s[ 8], s[ 9] ); d[ 9] = _mm_unpackhi_epi32( s[ 8], s[ 9] ); d[10] = _mm_unpacklo_epi32( s[10], s[11] ); @@ -2174,16 +2173,21 @@ static inline void rintrlv_4x32_4x64( void *dst, d[13] = _mm_unpackhi_epi32( s[12], s[13] ); d[14] = _mm_unpacklo_epi32( s[14], s[15] ); d[15] = _mm_unpackhi_epi32( s[14], s[15] ); + if ( bit_len <= 512 ) return; + d[16] = _mm_unpacklo_epi32( s[16], s[17] ); d[17] = _mm_unpackhi_epi32( s[16], s[17] ); d[18] = _mm_unpacklo_epi32( s[18], s[19] ); d[19] = _mm_unpackhi_epi32( s[18], s[19] ); + if ( bit_len <= 640 ) return; + d[20] = _mm_unpacklo_epi32( s[20], s[21] ); d[21] = _mm_unpackhi_epi32( s[20], s[21] ); d[22] = _mm_unpacklo_epi32( s[22], s[23] ); d[23] = _mm_unpackhi_epi32( s[22], s[23] ); + d[24] = _mm_unpacklo_epi32( s[24], s[25] ); d[25] = _mm_unpackhi_epi32( s[24], s[25] ); d[26] = _mm_unpacklo_epi32( s[26], s[27] ); @@ -2194,6 +2198,93 @@ static inline void rintrlv_4x32_4x64( void *dst, d[31] = _mm_unpackhi_epi32( s[30], s[31] ); } +// 8x32 -> 8x64 + +static inline void rintrlv_8x32_8x64( void *dst, + const void *src, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s = (const __m128i*)src; + + d[ 0] = _mm_unpacklo_epi32( s[ 0], s[ 2] ); + d[ 1] = _mm_unpackhi_epi32( s[ 0], s[ 2] ); + d[ 2] = _mm_unpacklo_epi32( s[ 1], s[ 3] ); + d[ 3] = _mm_unpackhi_epi32( s[ 1], s[ 3] ); + d[ 4] = _mm_unpacklo_epi32( s[ 4], s[ 6] ); + d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 6] ); + d[ 6] = _mm_unpacklo_epi32( s[ 5], s[ 7] ); + d[ 7] = _mm_unpackhi_epi32( s[ 5], s[ 7] ); + + d[ 8] = _mm_unpacklo_epi32( s[ 8], s[10] ); + d[ 9] = _mm_unpackhi_epi32( s[ 8], s[10] ); + d[10] = _mm_unpacklo_epi32( s[ 9], s[11] ); + d[11] = _mm_unpackhi_epi32( s[ 9], s[11] ); + d[12] = _mm_unpacklo_epi32( s[12], s[14] ); + d[13] = _mm_unpackhi_epi32( s[12], s[14] ); + d[14] = _mm_unpacklo_epi32( s[13], s[15] ); + d[15] = _mm_unpackhi_epi32( s[13], s[15] ); + + if ( bit_len <= 256 ) return; + + d[16] = _mm_unpacklo_epi32( s[16], s[18] ); + d[17] = _mm_unpackhi_epi32( s[16], s[18] ); + d[18] = _mm_unpacklo_epi32( s[17], s[19] ); + d[19] = _mm_unpackhi_epi32( s[17], s[19] ); + d[20] = _mm_unpacklo_epi32( s[20], s[22] ); + d[21] = _mm_unpackhi_epi32( s[20], s[22] ); + d[22] = _mm_unpacklo_epi32( s[21], s[23] ); + d[23] = _mm_unpackhi_epi32( s[21], s[23] ); + + d[24] = _mm_unpacklo_epi32( s[24], s[26] ); + d[25] = _mm_unpackhi_epi32( s[24], 
s[26] ); + d[26] = _mm_unpacklo_epi32( s[25], s[27] ); + d[27] = _mm_unpackhi_epi32( s[25], s[27] ); + d[28] = _mm_unpacklo_epi32( s[28], s[30] ); + d[29] = _mm_unpackhi_epi32( s[28], s[30] ); + d[30] = _mm_unpacklo_epi32( s[29], s[31] ); + d[31] = _mm_unpackhi_epi32( s[29], s[31] ); + + if ( bit_len <= 512 ) return; + + d[32] = _mm_unpacklo_epi32( s[32], s[34] ); + d[33] = _mm_unpackhi_epi32( s[32], s[34] ); + d[34] = _mm_unpacklo_epi32( s[33], s[35] ); + d[35] = _mm_unpackhi_epi32( s[33], s[35] ); + d[36] = _mm_unpacklo_epi32( s[36], s[38] ); + d[37] = _mm_unpackhi_epi32( s[36], s[38] ); + d[38] = _mm_unpacklo_epi32( s[37], s[39] ); + d[39] = _mm_unpackhi_epi32( s[37], s[39] ); + + d[40] = _mm_unpacklo_epi32( s[40], s[42] ); + d[41] = _mm_unpackhi_epi32( s[40], s[42] ); + d[42] = _mm_unpacklo_epi32( s[41], s[43] ); + d[43] = _mm_unpackhi_epi32( s[41], s[43] ); + d[44] = _mm_unpacklo_epi32( s[44], s[46] ); + d[45] = _mm_unpackhi_epi32( s[44], s[46] ); + d[46] = _mm_unpacklo_epi32( s[45], s[47] ); + d[47] = _mm_unpackhi_epi32( s[45], s[47] ); + + d[48] = _mm_unpacklo_epi32( s[48], s[50] ); + d[49] = _mm_unpackhi_epi32( s[48], s[50] ); + d[50] = _mm_unpacklo_epi32( s[49], s[51] ); + d[51] = _mm_unpackhi_epi32( s[49], s[51] ); + d[52] = _mm_unpacklo_epi32( s[52], s[54] ); + d[53] = _mm_unpackhi_epi32( s[52], s[54] ); + d[54] = _mm_unpacklo_epi32( s[53], s[55] ); + d[55] = _mm_unpackhi_epi32( s[53], s[55] ); + + d[56] = _mm_unpacklo_epi32( s[56], s[58] ); + d[57] = _mm_unpackhi_epi32( s[56], s[58] ); + d[58] = _mm_unpacklo_epi32( s[57], s[59] ); + d[59] = _mm_unpackhi_epi32( s[57], s[59] ); + d[60] = _mm_unpacklo_epi32( s[60], s[62] ); + d[61] = _mm_unpackhi_epi32( s[60], s[62] ); + d[62] = _mm_unpacklo_epi32( s[61], s[63] ); + d[63] = _mm_unpackhi_epi32( s[61], s[63] ); +} + + + /* #define RLEAVE_4x32_4x64(i) do \ { \ @@ -2225,7 +2316,6 @@ static inline void rintrlv_4x32_4x64( void *dst, // 2x128 -> 4x64 - static inline void rintrlv_2x128_4x64( void *dst, const void *src0, const void *src1, const int bit_len ) { @@ -2268,7 +2358,6 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0, d[31] = _mm_unpackhi_epi64( s1[14], s1[15] ); } - /* #define RLEAVE_2x128_4x64( i ) do \ { \ @@ -2339,7 +2428,6 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, d1[15] = _mm_unpackhi_epi64( s[29], s[31] ); } - /* #define RLEAVE_4x64_2x128( i ) do \ { \ @@ -2364,6 +2452,354 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, } */ +// 2x128 -> 8x64 + +static inline void rintrlv_4x128_8x64( void *dst, const void *src0, + const void *src1, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + + d[ 0] = _mm_unpacklo_epi64( s0[ 0], s0[ 1] ); + d[ 1] = _mm_unpacklo_epi64( s0[ 2], s0[ 3] ); + d[ 2] = _mm_unpacklo_epi64( s1[ 0], s1[ 1] ); + d[ 3] = _mm_unpacklo_epi64( s1[ 2], s1[ 3] ); + d[ 4] = _mm_unpackhi_epi64( s0[ 0], s0[ 1] ); + d[ 5] = _mm_unpackhi_epi64( s0[ 2], s0[ 3] ); + d[ 6] = _mm_unpackhi_epi64( s1[ 0], s1[ 1] ); + d[ 7] = _mm_unpackhi_epi64( s1[ 2], s1[ 3] ); + + d[ 8] = _mm_unpacklo_epi64( s0[ 4], s0[ 5] ); + d[ 9] = _mm_unpacklo_epi64( s0[ 6], s0[ 7] ); + d[10] = _mm_unpacklo_epi64( s1[ 4], s1[ 5] ); + d[11] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] ); + d[12] = _mm_unpackhi_epi64( s0[ 4], s0[ 5] ); + d[13] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] ); + d[14] = _mm_unpackhi_epi64( s1[ 4], s1[ 5] ); + d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] ); + + if ( bit_len <= 256 ) return; 
+ + d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] ); + d[17] = _mm_unpacklo_epi64( s0[10], s0[11] ); + d[18] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] ); + d[19] = _mm_unpacklo_epi64( s1[10], s1[11] ); + d[20] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] ); + d[21] = _mm_unpackhi_epi64( s0[10], s0[11] ); + d[22] = _mm_unpackhi_epi64( s1[ 8], s1[ 9] ); + d[23] = _mm_unpackhi_epi64( s1[10], s1[11] ); + + d[24] = _mm_unpacklo_epi64( s0[12], s0[13] ); + d[25] = _mm_unpacklo_epi64( s0[14], s0[15] ); + d[26] = _mm_unpacklo_epi64( s1[12], s1[13] ); + d[27] = _mm_unpacklo_epi64( s1[14], s1[15] ); + d[28] = _mm_unpackhi_epi64( s0[12], s0[13] ); + d[29] = _mm_unpackhi_epi64( s0[14], s0[15] ); + d[30] = _mm_unpackhi_epi64( s1[12], s1[13] ); + d[31] = _mm_unpackhi_epi64( s1[14], s1[15] ); + + if ( bit_len <= 512 ) return; + + d[32] = _mm_unpacklo_epi64( s0[16], s0[17] ); + d[33] = _mm_unpacklo_epi64( s0[18], s0[19] ); + d[34] = _mm_unpacklo_epi64( s1[16], s1[17] ); + d[35] = _mm_unpacklo_epi64( s1[18], s1[19] ); + d[36] = _mm_unpackhi_epi64( s0[16], s0[17] ); + d[37] = _mm_unpackhi_epi64( s0[18], s0[19] ); + d[38] = _mm_unpackhi_epi64( s1[16], s1[17] ); + d[39] = _mm_unpackhi_epi64( s1[18], s1[19] ); + + d[40] = _mm_unpacklo_epi64( s0[20], s0[21] ); + d[41] = _mm_unpacklo_epi64( s0[22], s0[23] ); + d[42] = _mm_unpacklo_epi64( s1[20], s1[21] ); + d[43] = _mm_unpacklo_epi64( s1[22], s1[23] ); + d[44] = _mm_unpackhi_epi64( s0[20], s0[21] ); + d[45] = _mm_unpackhi_epi64( s0[22], s0[23] ); + d[46] = _mm_unpackhi_epi64( s1[20], s1[21] ); + d[47] = _mm_unpackhi_epi64( s1[22], s1[23] ); + + d[48] = _mm_unpacklo_epi64( s0[24], s0[25] ); + d[49] = _mm_unpacklo_epi64( s0[26], s0[27] ); + d[50] = _mm_unpacklo_epi64( s1[24], s1[25] ); + d[51] = _mm_unpacklo_epi64( s1[26], s1[27] ); + d[52] = _mm_unpackhi_epi64( s0[24], s0[25] ); + d[53] = _mm_unpackhi_epi64( s0[26], s0[27] ); + d[54] = _mm_unpackhi_epi64( s1[24], s1[25] ); + d[55] = _mm_unpackhi_epi64( s1[26], s1[27] ); + + d[56] = _mm_unpacklo_epi64( s0[28], s0[29] ); + d[57] = _mm_unpacklo_epi64( s0[30], s0[31] ); + d[58] = _mm_unpacklo_epi64( s1[28], s1[29] ); + d[59] = _mm_unpacklo_epi64( s1[30], s1[31] ); + d[60] = _mm_unpackhi_epi64( s0[28], s0[29] ); + d[61] = _mm_unpackhi_epi64( s0[30], s0[31] ); + d[62] = _mm_unpackhi_epi64( s1[28], s1[29] ); + d[63] = _mm_unpackhi_epi64( s1[30], s1[31] ); +} + +// 8x64 -> 4x128 + +static inline void rintrlv_8x64_4x128( void *dst0, void *dst1, + const void *src, const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + const __m128i* s = (const __m128i*)src; + + d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] ); + d0[ 1] = _mm_unpackhi_epi64( s[ 0], s[ 4] ); + d1[ 0] = _mm_unpacklo_epi64( s[ 2], s[ 6] ); + d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] ); + d0[ 2] = _mm_unpacklo_epi64( s[ 1], s[ 5] ); + d0[ 3] = _mm_unpackhi_epi64( s[ 1], s[ 5] ); + d1[ 2] = _mm_unpacklo_epi64( s[ 3], s[ 7] ); + d1[ 3] = _mm_unpackhi_epi64( s[ 3], s[ 7] ); + + d0[ 4] = _mm_unpacklo_epi64( s[ 8], s[12] ); + d0[ 5] = _mm_unpackhi_epi64( s[ 8], s[12] ); + d1[ 4] = _mm_unpacklo_epi64( s[10], s[14] ); + d1[ 5] = _mm_unpackhi_epi64( s[10], s[14] ); + d0[ 6] = _mm_unpacklo_epi64( s[ 9], s[13] ); + d0[ 7] = _mm_unpackhi_epi64( s[ 9], s[13] ); + d1[ 6] = _mm_unpacklo_epi64( s[11], s[15] ); + d1[ 7] = _mm_unpackhi_epi64( s[11], s[15] ); + + if ( bit_len <= 256 ) return; + + d0[ 8] = _mm_unpacklo_epi64( s[16], s[20] ); + d0[ 9] = _mm_unpackhi_epi64( s[16], s[20] ); + d1[ 8] = _mm_unpacklo_epi64( s[18], s[22] ); + d1[ 9] = _mm_unpackhi_epi64( 
s[18], s[22] ); + d0[10] = _mm_unpacklo_epi64( s[17], s[21] ); + d0[11] = _mm_unpackhi_epi64( s[17], s[21] ); + d1[10] = _mm_unpacklo_epi64( s[19], s[23] ); + d1[11] = _mm_unpackhi_epi64( s[19], s[23] ); + + d0[12] = _mm_unpacklo_epi64( s[24], s[28] ); + d0[13] = _mm_unpackhi_epi64( s[24], s[28] ); + d1[12] = _mm_unpacklo_epi64( s[26], s[30] ); + d1[13] = _mm_unpackhi_epi64( s[26], s[30] ); + d0[14] = _mm_unpacklo_epi64( s[25], s[29] ); + d0[15] = _mm_unpackhi_epi64( s[25], s[29] ); + d1[14] = _mm_unpacklo_epi64( s[27], s[31] ); + d1[15] = _mm_unpackhi_epi64( s[27], s[31] ); + + if ( bit_len <= 512 ) return; + + d0[16] = _mm_unpacklo_epi64( s[32], s[36] ); + d0[17] = _mm_unpackhi_epi64( s[32], s[36] ); + d1[16] = _mm_unpacklo_epi64( s[34], s[38] ); + d1[17] = _mm_unpackhi_epi64( s[34], s[38] ); + d0[18] = _mm_unpacklo_epi64( s[33], s[37] ); + d0[19] = _mm_unpackhi_epi64( s[33], s[37] ); + d1[18] = _mm_unpacklo_epi64( s[35], s[39] ); + d1[19] = _mm_unpackhi_epi64( s[35], s[39] ); + + d0[20] = _mm_unpacklo_epi64( s[40], s[44] ); + d0[21] = _mm_unpackhi_epi64( s[40], s[44] ); + d1[20] = _mm_unpacklo_epi64( s[42], s[46] ); + d1[21] = _mm_unpackhi_epi64( s[42], s[46] ); + d0[22] = _mm_unpacklo_epi64( s[41], s[45] ); + d0[23] = _mm_unpackhi_epi64( s[41], s[45] ); + d1[22] = _mm_unpacklo_epi64( s[43], s[47] ); + d1[23] = _mm_unpackhi_epi64( s[43], s[47] ); + + d0[24] = _mm_unpacklo_epi64( s[48], s[52] ); + d0[25] = _mm_unpackhi_epi64( s[48], s[52] ); + d1[24] = _mm_unpacklo_epi64( s[50], s[54] ); + d1[25] = _mm_unpackhi_epi64( s[50], s[54] ); + d0[26] = _mm_unpacklo_epi64( s[49], s[53] ); + d0[27] = _mm_unpackhi_epi64( s[49], s[53] ); + d1[26] = _mm_unpacklo_epi64( s[51], s[55] ); + d1[27] = _mm_unpackhi_epi64( s[51], s[55] ); + + d0[28] = _mm_unpacklo_epi64( s[56], s[60] ); + d0[29] = _mm_unpackhi_epi64( s[56], s[60] ); + d1[28] = _mm_unpacklo_epi64( s[58], s[62] ); + d1[29] = _mm_unpackhi_epi64( s[58], s[62] ); + d0[30] = _mm_unpacklo_epi64( s[57], s[61] ); + d0[31] = _mm_unpackhi_epi64( s[57], s[61] ); + d1[30] = _mm_unpacklo_epi64( s[59], s[63] ); + d1[31] = _mm_unpackhi_epi64( s[59], s[63] ); +} + +// 8x64 -> 2x256 + +static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src, const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + const __m128i* s = (const __m128i*)src; + + d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] ); + d1[ 0] = _mm_unpackhi_epi64( s[ 0], s[ 4] ); + d2[ 0] = _mm_unpacklo_epi64( s[ 1], s[ 5] ); + d3[ 0] = _mm_unpackhi_epi64( s[ 1], s[ 5] ); + d0[ 1] = _mm_unpacklo_epi64( s[ 2], s[ 6] ); + d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] ); + d2[ 1] = _mm_unpacklo_epi64( s[ 3], s[ 7] ); + d3[ 1] = _mm_unpackhi_epi64( s[ 3], s[ 7] ); + + d0[ 2] = _mm_unpacklo_epi64( s[ 8], s[12] ); + d1[ 2] = _mm_unpackhi_epi64( s[ 8], s[12] ); + d2[ 2] = _mm_unpacklo_epi64( s[ 9], s[13] ); + d3[ 2] = _mm_unpackhi_epi64( s[ 9], s[13] ); + d0[ 3] = _mm_unpacklo_epi64( s[10], s[14] ); + d1[ 3] = _mm_unpackhi_epi64( s[10], s[14] ); + d2[ 3] = _mm_unpacklo_epi64( s[11], s[15] ); + d3[ 3] = _mm_unpackhi_epi64( s[11], s[15] ); + + if ( bit_len <= 256 ) return; + + d0[ 4] = _mm_unpacklo_epi64( s[16], s[20] ); + d1[ 4] = _mm_unpackhi_epi64( s[16], s[20] ); + d2[ 4] = _mm_unpacklo_epi64( s[17], s[21] ); + d3[ 4] = _mm_unpackhi_epi64( s[17], s[21] ); + d0[ 5] = _mm_unpacklo_epi64( s[18], s[22] ); + d1[ 5] = _mm_unpackhi_epi64( s[18], s[22] ); + d2[ 5] = 
_mm_unpacklo_epi64( s[19], s[23] ); + d3[ 5] = _mm_unpackhi_epi64( s[19], s[23] ); + + d0[ 6] = _mm_unpacklo_epi64( s[24], s[28] ); + d1[ 6] = _mm_unpackhi_epi64( s[24], s[28] ); + d2[ 6] = _mm_unpacklo_epi64( s[25], s[29] ); + d3[ 6] = _mm_unpackhi_epi64( s[25], s[29] ); + d0[ 7] = _mm_unpacklo_epi64( s[26], s[30] ); + d1[ 7] = _mm_unpackhi_epi64( s[26], s[30] ); + d2[ 7] = _mm_unpacklo_epi64( s[27], s[31] ); + d3[ 7] = _mm_unpackhi_epi64( s[27], s[31] ); + + if ( bit_len <= 512 ) return; + + d0[ 8] = _mm_unpacklo_epi64( s[32], s[36] ); + d1[ 8] = _mm_unpackhi_epi64( s[32], s[36] ); + d2[ 8] = _mm_unpacklo_epi64( s[33], s[37] ); + d3[ 8] = _mm_unpackhi_epi64( s[33], s[37] ); + d0[ 9] = _mm_unpacklo_epi64( s[34], s[38] ); + d1[ 9] = _mm_unpackhi_epi64( s[34], s[38] ); + d2[ 9] = _mm_unpacklo_epi64( s[35], s[39] ); + d3[ 9] = _mm_unpackhi_epi64( s[35], s[39] ); + + d0[10] = _mm_unpacklo_epi64( s[40], s[44] ); + d1[10] = _mm_unpackhi_epi64( s[40], s[44] ); + d2[10] = _mm_unpacklo_epi64( s[41], s[45] ); + d3[10] = _mm_unpackhi_epi64( s[41], s[45] ); + d0[11] = _mm_unpacklo_epi64( s[42], s[46] ); + d1[11] = _mm_unpackhi_epi64( s[42], s[46] ); + d2[11] = _mm_unpacklo_epi64( s[43], s[47] ); + d3[11] = _mm_unpackhi_epi64( s[43], s[47] ); + + d0[12] = _mm_unpacklo_epi64( s[48], s[52] ); + d1[12] = _mm_unpackhi_epi64( s[48], s[52] ); + d2[12] = _mm_unpacklo_epi64( s[49], s[53] ); + d3[12] = _mm_unpackhi_epi64( s[49], s[53] ); + d0[13] = _mm_unpacklo_epi64( s[50], s[54] ); + d1[13] = _mm_unpackhi_epi64( s[50], s[54] ); + d2[13] = _mm_unpacklo_epi64( s[51], s[55] ); + d3[13] = _mm_unpackhi_epi64( s[51], s[55] ); + + d0[14] = _mm_unpacklo_epi64( s[56], s[60] ); + d1[14] = _mm_unpackhi_epi64( s[56], s[60] ); + d2[14] = _mm_unpacklo_epi64( s[57], s[61] ); + d3[14] = _mm_unpackhi_epi64( s[57], s[61] ); + d0[15] = _mm_unpacklo_epi64( s[58], s[62] ); + d1[15] = _mm_unpackhi_epi64( s[58], s[62] ); + d2[15] = _mm_unpacklo_epi64( s[59], s[63] ); + d3[15] = _mm_unpackhi_epi64( s[59], s[63] ); +} + +// 4x128 -> 8x64 + +static inline void rintrlv_2x256_8x64( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + __m128i *s0 = (__m128i*)src0; + __m128i *s1 = (__m128i*)src1; + __m128i *s2 = (__m128i*)src2; + __m128i *s3 = (__m128i*)src3; + + d[ 0] = _mm_unpacklo_epi64( s0[0], s0[2] ); + d[ 1] = _mm_unpacklo_epi64( s1[0], s1[2] ); + d[ 2] = _mm_unpacklo_epi64( s2[0], s2[2] ); + d[ 3] = _mm_unpacklo_epi64( s3[0], s3[2] ); + d[ 4] = _mm_unpackhi_epi64( s0[0], s0[2] ); + d[ 5] = _mm_unpackhi_epi64( s1[0], s1[2] ); + d[ 6] = _mm_unpackhi_epi64( s2[0], s2[2] ); + d[ 7] = _mm_unpackhi_epi64( s3[0], s3[2] ); + + d[ 8] = _mm_unpacklo_epi64( s0[1], s0[3] ); + d[ 9] = _mm_unpacklo_epi64( s1[1], s1[3] ); + d[10] = _mm_unpacklo_epi64( s2[1], s2[3] ); + d[11] = _mm_unpacklo_epi64( s3[1], s3[3] ); + d[12] = _mm_unpackhi_epi64( s0[1], s0[3] ); + d[13] = _mm_unpackhi_epi64( s1[1], s1[3] ); + d[14] = _mm_unpackhi_epi64( s2[1], s2[3] ); + d[15] = _mm_unpackhi_epi64( s3[1], s3[3] ); + + if ( bit_len <= 256 ) return; + + d[16] = _mm_unpacklo_epi64( s0[4], s0[6] ); + d[17] = _mm_unpacklo_epi64( s1[4], s1[6] ); + d[18] = _mm_unpacklo_epi64( s2[4], s2[6] ); + d[19] = _mm_unpacklo_epi64( s3[4], s3[6] ); + d[20] = _mm_unpackhi_epi64( s0[4], s0[6] ); + d[21] = _mm_unpackhi_epi64( s1[4], s1[6] ); + d[22] = _mm_unpackhi_epi64( s2[4], s2[6] ); + d[23] = _mm_unpackhi_epi64( s3[4], s3[6] ); + + d[24] = _mm_unpacklo_epi64( s0[5], s0[7] ); + d[25] = 
_mm_unpacklo_epi64( s1[5], s1[7] ); + d[26] = _mm_unpacklo_epi64( s2[5], s2[7] ); + d[27] = _mm_unpacklo_epi64( s3[5], s3[7] ); + d[28] = _mm_unpackhi_epi64( s0[5], s0[7] ); + d[29] = _mm_unpackhi_epi64( s1[5], s1[7] ); + d[30] = _mm_unpackhi_epi64( s2[5], s2[7] ); + d[31] = _mm_unpackhi_epi64( s3[5], s3[7] ); + + if ( bit_len <= 512 ) return; + + d[32] = _mm_unpacklo_epi64( s0[8], s0[10] ); + d[33] = _mm_unpacklo_epi64( s1[8], s1[10] ); + d[34] = _mm_unpacklo_epi64( s2[8], s2[10] ); + d[35] = _mm_unpacklo_epi64( s3[8], s3[10] ); + d[36] = _mm_unpackhi_epi64( s0[8], s0[10] ); + d[37] = _mm_unpackhi_epi64( s1[8], s1[10] ); + d[38] = _mm_unpackhi_epi64( s2[8], s2[10] ); + d[39] = _mm_unpackhi_epi64( s3[8], s3[10] ); + + d[40] = _mm_unpacklo_epi64( s0[9], s0[11] ); + d[41] = _mm_unpacklo_epi64( s1[9], s1[11] ); + d[42] = _mm_unpacklo_epi64( s2[9], s2[11] ); + d[43] = _mm_unpacklo_epi64( s3[9], s3[11] ); + d[44] = _mm_unpackhi_epi64( s0[9], s0[11] ); + d[45] = _mm_unpackhi_epi64( s1[9], s1[11] ); + d[46] = _mm_unpackhi_epi64( s2[9], s2[11] ); + d[47] = _mm_unpackhi_epi64( s3[9], s3[11] ); + + d[48] = _mm_unpacklo_epi64( s0[12], s0[14] ); + d[49] = _mm_unpacklo_epi64( s1[12], s1[14] ); + d[50] = _mm_unpacklo_epi64( s2[12], s2[14] ); + d[51] = _mm_unpacklo_epi64( s3[12], s3[14] ); + d[52] = _mm_unpackhi_epi64( s0[12], s0[14] ); + d[53] = _mm_unpackhi_epi64( s1[12], s1[14] ); + d[54] = _mm_unpackhi_epi64( s2[12], s2[14] ); + d[55] = _mm_unpackhi_epi64( s3[12], s3[14] ); + + d[56] = _mm_unpacklo_epi64( s0[13], s0[15] ); + d[57] = _mm_unpacklo_epi64( s1[13], s1[15] ); + d[58] = _mm_unpacklo_epi64( s2[13], s2[15] ); + d[59] = _mm_unpacklo_epi64( s3[13], s3[15] ); + d[60] = _mm_unpackhi_epi64( s0[13], s0[15] ); + d[61] = _mm_unpackhi_epi64( s1[13], s1[15] ); + d[62] = _mm_unpackhi_epi64( s2[13], s2[15] ); + d[63] = _mm_unpackhi_epi64( s3[13], s3[15] ); +} + // // Some functions customized for mining. diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index fae6203..6e32965 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -252,7 +252,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #else - #define mm128_ror_64 mm128_ror_var_64 #define mm128_rol_64 mm128_rol_var_64 #define mm128_ror_32 mm128_ror_var_32 @@ -274,6 +273,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) #define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) +// Rotate 16 byte (128 bit) vector by c bytes. +// Less efficient using shift but more versatile. Use only for odd number +// byte rotations. Use shuffle above whenever possible. +#define mm128_ror_x8( v, c ) \ + _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) ) + +#define mm128_rol_x8( v, c ) \ + _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) ) + #if defined (__SSE3__) // no SSE2 implementation, no current users @@ -289,17 +297,21 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_rol_1x8( v ) \ _mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \ 0x060504030201000f ) ) -#endif // SSE3 +#else // SSE2 -// Rotate 16 byte (128 bit) vector by c bytes. -// Less efficient using shift but more versatile. Use only for odd number -// byte rotations. Use shuffle above whenever possible. 
-#define mm128_bror( v, c ) \ - _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) ) +#define mm128_ror_1x16( v ) \ + _mm_or_si128( _mm_srli_si128( v, 2 ), _mm_slli_si128( v, 14 ) ) -#define mm128_brol( v, c ) \ - _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) ) +#define mm128_rol_1x16( v ) \ + _mm_or_si128( _mm_slli_si128( v, 2 ), _mm_srli_si128( v, 14 ) ) +#define mm128_ror_1x8( v ) \ + _mm_or_si128( _mm_srli_si128( v, 1 ), _mm_slli_si128( v, 15 ) ) + +#define mm128_rol_1x8( v ) \ + _mm_or_si128( _mm_slli_si128( v, 1 ), _mm_srli_si128( v, 15 ) ) + +#endif // SSE3 else SSE2 // Invert vector: {3,2,1,0} -> {0,1,2,3} #define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b ) @@ -319,19 +331,24 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) // // Rotate elements within lanes. -#define mm128_swap32_64( v ) _mm_shuffle_epi32( v, 0xb1 ) +#define mm128_swap_64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) -#define mm128_ror16_64( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x09080f0e0d0c0b0a, \ - 0x0100070605040302 ) +#define mm128_rol64_8( v, c ) \ + _mm_or_si128( _mm_slli_epi64( v, ( ( (c)<<3 ) ), \ + _mm_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) -#define mm128_rol16_64( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080f0e, \ - 0x0504030201000706 ) +#define mm128_ror64_8( v, c ) \ + _mm_or_si128( _mm_srli_epi64( v, ( ( (c)<<3 ) ), \ + _mm_slli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) -#define mm128_swap16_32( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0f0e09080b0a, \ - 0x0504070601000302 ) +#define mm128_rol32_8( v, c ) \ + _mm_or_si128( _mm_slli_epi32( v, ( ( (c)<<3 ) ), \ + _mm_srli_epi32( v, ( ( 32 - ( (c)<<3 ) ) ) ) + +#define mm128_ror32_8( v, c ) \ + _mm_or_si128( _mm_srli_epi32( v, ( ( (c)<<3 ) ), \ + _mm_slli_epi32( v, ( ( 32 - ( (c)<<3 ) ) ) ) + // // Endian byte swap. @@ -431,64 +448,65 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) // Swap 128 bit vectorse. -#define mm128_swap128_256( v1, v2 ) \ +#define mm128_swap256_128( v1, v2 ) \ v1 = _mm_xor_si128( v1, v2 ); \ v2 = _mm_xor_si128( v1, v2 ); \ v1 = _mm_xor_si128( v1, v2 ); + // Concatenate v1 & v2 and rotate as one 256 bit vector. 
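/* Editorial sketch, not part of the patch: the mm128_ror256_* and
   mm128_rol256_* macros defined below treat the v1/v2 register pair as a
   single logical 256 bit value and rewrite both arguments in place.  The
   helper name here is hypothetical and only illustrates how the macro is
   invoked; which argument holds the low half follows the macro's own
   convention and is not asserted here. */

#include <emmintrin.h>

static inline void mm128_ror256_64_demo( __m128i *v1p, __m128i *v2p )
{
   __m128i v1 = *v1p, v2 = *v2p;
   mm128_ror256_64( v1, v2 );   /* both v1 and v2 are updated */
   *v1p = v1;
   *v2p = v2;
}
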
#if defined(__SSE4_1__) -#define mm128_ror1x64_256( v1, v2 ) \ +#define mm128_ror256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v1 = _mm_alignr_epi8( v2, v1, 8 ); \ v2 = t; \ } while(0) -#define mm128_rol1x64_256( v1, v2 ) \ +#define mm128_rol256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v2 = _mm_alignr_epi8( v2, v1, 8 ); \ v1 = t; \ } while(0) -#define mm128_ror1x32_256( v1, v2 ) \ +#define mm128_ror256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \ v1 = _mm_alignr_epi8( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm128_rol1x32_256( v1, v2 ) \ +#define mm128_rol256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 12 ); \ v2 = _mm_alignr_epi8( v2, v1, 12 ); \ v1 = t; \ } while(0) -#define mm128_ror1x16_256( v1, v2 ) \ +#define mm128_ror256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \ v1 = _mm_alignr_epi8( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm128_rol1x16_256( v1, v2 ) \ +#define mm128_rol256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 14 ); \ v2 = _mm_alignr_epi8( v2, v1, 14 ); \ v1 = t; \ } while(0) -#define mm128_ror1x8_256( v1, v2 ) \ +#define mm128_ror256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 1 ); \ v1 = _mm_alignr_epi8( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm128_rol1x8_256( v1, v2 ) \ +#define mm128_rol256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 15 ); \ v2 = _mm_alignr_epi8( v2, v1, 15 ); \ @@ -497,7 +515,7 @@ do { \ #else // SSE2 -#define mm128_ror1x64_256( v1, v2 ) \ +#define mm128_ror256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \ _mm_slli_si128( v2, 8 ) ); \ @@ -506,7 +524,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol1x64_256( v1, v2 ) \ +#define mm128_rol256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \ _mm_srli_si128( v2, 8 ) ); \ @@ -515,7 +533,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror1x32_256( v1, v2 ) \ +#define mm128_ror256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \ _mm_slli_si128( v2, 12 ) ); \ @@ -524,7 +542,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol1x32_256( v1, v2 ) \ +#define mm128_rol256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \ _mm_srli_si128( v2, 12 ) ); \ @@ -533,7 +551,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror1x16_256( v1, v2 ) \ +#define mm128_ror256_16( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \ _mm_slli_si128( v2, 14 ) ); \ @@ -542,7 +560,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol1x16_256( v1, v2 ) \ +#define mm128_rol256_16( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \ _mm_srli_si128( v2, 14 ) ); \ @@ -551,7 +569,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror1x8_256( v1, v2 ) \ +#define mm128_ror256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \ _mm_slli_si128( v2, 15 ) ); \ @@ -560,7 +578,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol1x8_256( v1, v2 ) \ +#define mm128_rol256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \ _mm_srli_si128( v2, 15 ) ); \ diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index ac7bef2..3bdde9b 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -414,99 +414,71 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // -// Rotate elements within lanes of 256 bit vector. +// Rotate elements within each 128 bit lane of 256 bit vector. 
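/* Editorial sketch, not part of the patch: the renamed macros in this
   section follow a vector-size / lane-size / element-size pattern, e.g.
   mm256_ror128_32 (defined just below) applies
   _mm256_shuffle_epi32( v, 0x39 ), rotating each 128 bit lane of a 256 bit
   vector by one 32 bit element in the file's "ror" direction.  Names and
   constants in this sketch are illustrative only. */

#include <immintrin.h>
#include <stdint.h>

#if defined(__AVX2__)
static inline void mm256_ror128_32_demo( void )
{
   /* elements listed high to low: lanes are {7,6,5,4} and {3,2,1,0} */
   __m256i  v = _mm256_set_epi32( 7, 6, 5, 4, 3, 2, 1, 0 );
   __m256i  r = mm256_ror128_32( v );   /* lanes become {4,7,6,5} and {0,3,2,1} */
   uint32_t out[8];
   _mm256_storeu_si256( (__m256i*)out, r );
   (void)out;                           /* out[] = { 1,2,3,0, 5,6,7,4 } */
}
#endif
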
-// Swap 64 bit elements in each 128 bit lane. -#define mm256_swap64_128( v ) _mm256_shuffle_epi32( v, 0x4e ) +#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) -// Rotate each 128 bit lane by one 32 bit element. -#define mm256_ror1x32_128( v ) _mm256_shuffle_epi32( v, 0x39 ) -#define mm256_rol1x32_128( v ) _mm256_shuffle_epi32( v, 0x93 ) +#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) -#define mm256_ror1x16_128( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x11101f1e1d1c1b1a, 0x1918171615141312, \ - 0x01000f0e0d0c0b0a, 0x0908070605040302 ) ) +#define mm256_rol128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 ) -#define mm256_rol1x16_128( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1d1c1b1a19181716, 0x1514131211101f1e, \ - 0x0d0c0b0a09080706, 0x0504030201000f0e ) ) - -#define mm256_ror1x8_128( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x101f1e1d1c1b1a19, 0x1817161514131211, \ - 0x000f0e0d0c0b0a09, 0x0807060504030201 ) ) - -#define mm256_rol1x8_128( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \ - 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) - -// Rotate each 128 bit lane by c bytes. -#define mm256_bror_128( v, c ) \ +// Rotave each 128 bit lane by c elements. +#define mm256_ror128_8( v, c ) \ _mm256_or_si256( _mm256_bsrli_epi128( v, c ), \ _mm256_bslli_epi128( v, 16-(c) ) ) -#define mm256_brol_128( v, c ) \ +#define mm256_rol128_8( v, c ) \ _mm256_or_si256( _mm256_bslli_epi128( v, c ), \ _mm256_bsrli_epi128( v, 16-(c) ) ) -// Swap 32 bit elements in each 64 bit lane -#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 ) + +// Rotate elements in each 64 bit lane + +#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -#define mm256_rol1x16_64( v ) _mm256_rol_epi64( v, 16 ) -#define mm256_ror1x16_64( v ) _mm256_ror_epi64( v, 16 ) +#define mm256_rol64_8( v, c ) _mm256_rol_epi64( v, ((c)<<3) ) +#define mm256_ror64_8( v, c ) _mm256_ror_epi64( v, ((c)<<3) ) #else -#define mm256_ror1x16_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \ - 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) +#define mm256_rol64_8( v, c ) \ + _mm256_or_si256( _mm256_slli_epi64( v, ( ( (c)<<3 ) ), \ + _mm256_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) + +#define mm256_ror64_8( v, c ) \ + _mm256_or_si256( _mm256_srli_epi64( v, ( ( (c)<<3 ) ), \ + _mm256_slli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) -#define mm256_rol1x16_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \ - 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) #endif -#define mm256_ror1x8_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x181f1e1d1c1b1a19, 0x1017161514131211, \ - 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) -#define mm256_rol1x8_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1e1d1c1b1a19181f, 0x1615141312111017, \ - 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) - -#define mm256_ror3x8_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1a19181f1e1d1c1b, 0x1211101716151413, \ - 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) - -#define mm256_rol3x8_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1c1b1a19181f1e1d, 0x1413121110171615, \ - 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) - - -// Swap 16 bit elements in each 32 bit lane +// Rotate elements in each 32 bit lane #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && 
defined(__AVX512BW__) -#define mm256_swap16_32( v ) _mm256_rol_epi32( v, 16 ) +#define mm256_swap32_16( v ) _mm256_rol_epi32( v, 16 ) + +#define mm256_rol32_8( v ) _mm256_rol_epi32( v, 8 ) +#define mm256_ror32_8( v ) _mm256_ror_epi32( v, 8 ) #else -#define mm256_swap16_32( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \ - 0x0b0a09080f0e0d0c, 0x0302010007060504 ) ) +#define mm256_swap32_16( v ) \ + _mm256_or_si256( _mm256_slli_epi32( v, 16 ), \ + _mm256_srli_epi32( v, 16 ) ) + +#define mm256_rol32_8( v ) \ + _mm256_or_si256( _mm256_slli_epi32( v, 8 ), \ + _mm256_srli_epi32( v, 8 ) ) + +#define mm256_ror32_8( v, c ) \ + _mm256_or_si256( _mm256_srli_epi32( v, 8 ), \ + _mm256_slli_epi32( v, 8 ) ) + #endif + // // Swap bytes in vector elements, endian bswap. #define mm256_bswap_64( v ) \ @@ -565,19 +537,19 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also // makes these macros unnecessary. -#define mm256_swap256_512 (v1, v2) \ - v1 = _mm256_xor_si256(v1, v2); \ - v2 = _mm256_xor_si256(v1, v2); \ - v1 = _mm256_xor_si256(v1, v2); +#define mm256_swap512_256( v1, v2 ) \ + v1 = _mm256_xor_si256( v1, v2 ); \ + v2 = _mm256_xor_si256( v1, v2 ); \ + v1 = _mm256_xor_si256( v1, v2 ); -#define mm256_ror1x128_512( v1, v2 ) \ +#define mm256_ror512_128( v1, v2 ) \ do { \ __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ v1 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \ v2 = t; \ } while(0) -#define mm256_rol1x128_512( v1, v2 ) \ +#define mm256_rol512_128( v1, v2 ) \ do { \ __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ v2 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 58ae8cb..a4e0807 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -15,13 +15,13 @@ // AVX512 intrinsics have a few changes from previous conventions. // -// Some instructions like cmp and blend use the mask regsiters now instead -// a vector mask. +// cmp instruction now returns a bitmask isnstead of a vector mask. +// This eliminates the need for the blendv instruction. // -// The new rotate instructions require the count to be only an 8 bit -// immediate value. The documentation is the same as for shift and -// it allows variables. Suspect a compiler issue but it still happens -// in GCC9. +// The new rotate instructions require the count to be an 8 bit +// immediate value only. Compilation fails if a variable is used. +// The documentation is the same as for shift and it works with +// variables. // // _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute // usually shuffles accross all lanes. 
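/* Editorial sketch, not part of the patch: it illustrates the two AVX512
   conventions described in the comments above.  The compare intrinsic
   returns a bit mask (__mmask16) rather than a vector mask, which feeds
   _mm512_mask_blend_epi32 directly, and the rotate intrinsic takes its
   count as an immediate constant.  The helper name is hypothetical. */

#include <immintrin.h>

#if defined(__AVX512F__)
static inline __m512i mask_select_rotate_demo( __m512i a, __m512i b )
{
   __mmask16 m = _mm512_cmpgt_epi32_mask( a, b );     /* per lane: a > b   */
   __m512i   v = _mm512_mask_blend_epi32( m, b, a );  /* per lane max(a,b) */
   return _mm512_rol_epi32( v, 8 );                   /* count is a literal */
}
#endif
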
@@ -109,6 +109,11 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, #define m512_const2_64( i1, i0 ) \ m512_const1_128( m128_const_64( i1, i0 ) ) +#define m512_const2_32( i1, i0 ) \ + m512_const1_64( ( ( ( (uint64_t)(i1) << 32 ) ) \ + | ( (uint64_t)(i0) & 0xffffffff ) ) ) + + static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2, const uint64_t i1, const uint64_t i0 ) { @@ -265,7 +270,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \ 0x28292a2b2c2d2e2f, 0x2021222324252627, \ 0x18191a1b1c1d1e1f, 0x1011121314151617, \ - 0x08090a0b0c0d0e0f, 0x0001020304050607 )) + 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) #define mm512_bswap_32( v ) \ _mm512_shuffle_epi8( v, \ @@ -304,8 +309,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) { \ __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \ 0x2c2d2e2f28292a2b, 0x2425262720212223, \ - 0x0c0d0e0f08090a0b, 0x0405060700010203, \ - 0x1c1d1e1f18191a1b, 0x1415161710111213 ); \ + 0x1c1d1e1f18191a1b, 0x1415161710111213, \ + 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \ casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \ casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \ @@ -320,8 +325,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Rotate elements in 512 bit vector. + #define mm512_swap_256( v ) _mm512_alignr_epi64( v, v, 4 ) +// 1x64 notation used to disinguish from bit rotation. #define mm512_ror_1x128( v ) _mm512_alignr_epi64( v, v, 2 ) #define mm512_rol_1x128( v ) _mm512_alignr_epi64( v, v, 6 ) @@ -401,51 +408,58 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Rotate elements within 256 bit lanes of 512 bit vector. +// Rename these for consistency. Element size is always last. 
+// mm__ + + // Swap hi & lo 128 bits in each 256 bit lane -#define mm512_swap128_256( v ) _mm512_permutex_epi64( v, 0x4e ) + +#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e ) // Rotate 256 bit lanes by one 64 bit element -#define mm512_ror1x64_256( v ) _mm512_permutex_epi64( v, 0x39 ) -#define mm512_rol1x64_256( v ) _mm512_permutex_epi64( v, 0x93 ) + +#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 ) +#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 ) // Rotate 256 bit lanes by one 32 bit element -#define mm512_ror1x32_256( v ) \ + +#define mm512_ror256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x000000080000000f, 0x0000000e0000000d, \ 0x0000000c0000000b, 0x0000000a00000009, \ 0x0000000000000007, 0x0000000600000005, \ 0x0000000400000003, 0x0000000200000001 ), v ) -#define mm512_rol1x32_256( v ) \ +#define mm512_rol256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x0000000e0000000d, 0x0000000c0000000b, \ 0x0000000a00000009, 0x000000080000000f, \ 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ), v ) -#define mm512_ror1x16_256( v ) \ +#define mm512_ror256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x00100001001e001d, 0x001c001b001a0019, \ 0x0018001700160015, 0x0014001300120011, \ 0x0000000f000e000d, 0x000c000b000a0009, \ 0x0008000700060005, 0x0004000300020001 ), v ) -#define mm512_rol1x16_256( v ) \ +#define mm512_rol256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x001e001d001c001b, 0x001a001900180017, \ 0x0016001500140013, 0x001200110010001f, \ 0x000e000d000c000b, 0x000a000900080007, \ 0x0006000500040003, 0x000200010000000f ), v ) -#define mm512_ror1x8_256( v ) \ +#define mm512_ror256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x203f3e3d3c3b3a39, 0x3837363534333231, \ 0x302f2e2d2c2b2a29, 0x2827262524232221, \ 0x001f1e1d1c1b1a19, 0x1817161514131211, \ 0x100f0e0d0c0b0a09, 0x0807060504030201 ), v ) -#define mm512_rol1x8_256( v ) \ +#define mm512_rol256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x3e3d3c3b3a393837, 0x363534333231302f, \ 0x2e2d2c2b2a292827, 0x262524232221203f, \ @@ -456,45 +470,19 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // Rotate elements within 128 bit lanes of 512 bit vector. 
// Swap hi & lo 64 bits in each 128 bit lane -#define mm512_swap64_128( v ) _mm512_shuffle_epi32( v, 0x4e ) +#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) // Rotate 128 bit lanes by one 32 bit element -#define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 ) -#define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 ) +#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) +#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) -#define mm512_ror1x16_128( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x0018001f001e001d, 0x001c001b001a0019, \ - 0x0010001700160015, 0x0014001300120011, \ - 0x0008000f000e000d, 0x000c000b000a0009, \ - 0x0000000700060005, 0x0004000300020001 ), v ) -#define mm512_rol1x16_128( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x001e001d001c001b, 0x001a00190018001f, \ - 0x0016001500140013, 0x0012001100100017, \ - 0x000e000d000c000b, 0x000a00090008000f, \ - 0x0006000500040003, 0x0002000100000007 ), v ) - -#define mm512_ror1x8_128( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x303f3e3d3c3b3a39, 0x3837363534333231, \ - 0x202f2e2d2c2b2a29, 0x2827262524232221, \ - 0x101f1e1d1c1b1a19, 0x1817161514131211, \ - 0x000f0e0d0c0b0a09, 0x0807060504030201 ) ) - -#define mm512_rol1x8_128( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x3e3d3c3b3a393837, 0x363534333231303f, \ - 0x2e2d2c2b2a292827, 0x262524232221202f, \ - 0x1e1d1c1b1a191817, 0x161514131211101f, \ - 0x0e0d0c0b0a090807, 0x060504030201000f ) ) - -// Rotate 128 bit lanes by c bytes. -#define mm512_bror_128( v, c ) \ +// Rotate 128 bit lanes by c bytes, faster than building that monstrous +// constant above. +#define mm512_ror128_8( v, c ) \ _mm512_or_si512( _mm512_bsrli_epi128( v, c ), \ _mm512_bslli_epi128( v, 16-(c) ) ) -#define mm512_brol_128( v, c ) \ +#define mm512_rol128_8( v, c ) \ _mm512_or_si512( _mm512_bslli_epi128( v, c ), \ _mm512_bsrli_epi128( v, 16-(c) ) ) @@ -502,75 +490,23 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Rotate elements within 64 bit lanes. +#define mm512_rol64_x8( v, c ) _mm512_rol_epi64( v, ((c)<<3) ) +#define mm512_ror64_x8( v, c ) _mm512_ror_epi64( v, ((c)<<3) ) + // Swap 32 bit elements in each 64 bit lane -#define mm512_swap32_64( v ) _mm512_shuffle_epi32( v, 0xb1 ) +#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) // Rotate each 64 bit lane by one 16 bit element. -#define mm512_ror1x16_64( v ) _mm512_ror_epi64( v, 16 ) -#define mm512_rol1x16_64( v ) _mm512_rol_epi64( v, 16 ) -#define mm512_ror1x8_64( v ) _mm512_ror_epi64( v, 8 ) -#define mm512_rol1x8_64( v ) _mm512_rol_epi64( v, 8 ) - -/* -#define mm512_ror1x16_64( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x001c001f001e001d, 0x0018001b001a0019, \ - 0x0014001700160015, 0x0010001300120011, \ - 0x000c000f000e000d, 0x0008000b000a0009, \ - 0x0004000700060005, 0x0000000300020001, v ) - -#define mm512_rol1x16_64( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x001e001d001c001f, 0x001a00190018001b, \ - 0x0016001500140017, 0x0012001100100013, \ - 0x000e000d000c000f, 0x000a00090008000b, \ - 0x0006000500040007, 0x0002000100000003, v ) - -// Rotate each 64 bit lane by one byte. 
-#define mm512_ror1x8_64( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x383F3E3D3C3B3A39, 0x3037363534333231, \ - 0x282F2E2D2C2B2A29, 0x2027262524232221, \ - 0x181F1E1D1C1B1A19, 0x1017161514131211, \ - 0x080F0E0D0C0B0A09, 0x0007060504030201 ) ) -#define mm512_rol1x8_64( v ) \ - _mm512_shuffle( v, m512_const_64( \ - 0x3E3D3C3B3A39383F, 0x3635343332313037, \ - 0x2E2D2C2B2A29282F, 0x2625242322212027, \ - 0x1E1D1C1B1A19181F, 0x1615141312111017, \ - 0x0E0D0C0B0A09080F, 0x0605040302010007 ) ) -*/ +#define mm512_ror64_16( v ) _mm512_ror_epi64( v, 16 ) +#define mm512_rol64_16( v ) _mm512_rol_epi64( v, 16 ) +#define mm512_ror64_8( v ) _mm512_ror_epi64( v, 8 ) +#define mm512_rol64_8( v ) _mm512_rol_epi64( v, 8 ) // // Rotate elements within 32 bit lanes. -#define mm512_swap16_32( v ) _mm512_ror_epi32( v, 16 ) -#define mm512_ror1x8_32( v ) _mm512_ror_epi32( v, 8 ) -#define mm512_rol1x8_32( v ) _mm512_rol_epi32( v, 8 ) - -/* -#define mm512_swap16_32( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x001e001f001c001d, 0x001a001b00180019, \ - 0x0016001700140015, 0x0012001300100011, \ - 0x000e000f000c000d, 0x000a000b00080009, \ - 0x0006000700040005, 0x0002000300000001 ), v ) - -#define mm512_ror1x8_32( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x3C3F3E3D383B3A39, 0x3437363530333231, \ - 0x2C2F2E2D282B2A29, 0x2427262520232221, \ - 0x1C1F1E1D181B1A19, 0x1417161510131211, \ - 0x0C0F0E0D080B0A09, 0x0407060500030201 )) - -#define mm512_rol1x8_32( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x3E3D3C3F3A39383B, 0x3635343732313033, \ - 0x2E2D2C2F2A29282B, 0x2625242722212023, \ - 0x1E1D1C1F1A19181B, 0x1615141712111013, \ - 0x0E0D0C0F0A09080B, 0x0605040702010003 ) ) -*/ - +#define mm512_rol32_x8( v, c ) _mm512_rol_epi32( v, ((c)<<2) ) +#define mm512_ror32_x8( v, c ) _mm512_ror_epi32( v, ((c)<<2) ) // @@ -579,61 +515,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // These can all be done with 2 permutex2var instructions but they are // slower than either xor or alignr and require AVX512VBMI. 
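/* Editorial sketch, not part of the patch: the mm512_swap1024_512 macro
   renamed below exchanges two 512 bit registers with three xors rather
   than a temporary; the comment above notes that a permutex2var based
   alternative would be slower and needs AVX512VBMI.  For comparison, this
   hypothetical helper writes the same exchange with an explicit
   temporary. */

#include <immintrin.h>

#if defined(__AVX512F__)
static inline void swap512_with_temp( __m512i *a, __m512i *b )
{
   __m512i t = *a;   /* plain three-move swap */
   *a = *b;
   *b = t;
}
#endif
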
-#define mm512_swap512_1024(v1, v2) \ +#define mm512_swap1024_512(v1, v2) \ v1 = _mm512_xor_si512(v1, v2); \ v2 = _mm512_xor_si512(v1, v2); \ v1 = _mm512_xor_si512(v1, v2); -#define mm512_ror1x256_1024( v1, v2 ) \ +#define mm512_ror1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v1 = _mm512_alignr_epi64( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm512_rol1x256_1024( v1, v2 ) \ +#define mm512_rol1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v2 = _mm512_alignr_epi64( v2, v1, 4 ); \ v1 = t; \ } while(0) -#define mm512_ror1x128_1024( v1, v2 ) \ +#define mm512_ror1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \ v1 = _mm512_alignr_epi64( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm512_rol1x128_1024( v1, v2 ) \ +#define mm512_rol1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \ v2 = _mm512_alignr_epi64( v2, v1, 6 ); \ v1 = t; \ } while(0) -#define mm512_ror1x64_1024( v1, v2 ) \ +#define mm512_ror1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \ v1 = _mm512_alignr_epi64( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1x64_1024( v1, v2 ) \ +#define mm512_rol1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \ v2 = _mm512_alignr_epi64( v2, v1, 7 ); \ v1 = t; \ } while(0) -#define mm512_ror1x32_1024( v1, v2 ) \ +#define mm512_ror1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \ v1 = _mm512_alignr_epi32( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1x32_1024( v1, v2 ) \ +#define mm512_rol1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \ v2 = _mm512_alignr_epi32( v2, v1, 15 ); \