v3.10.6

v3.10.5
2025-09-17 23:44:27 +00:00 · 2019-12-25 01:26:26 -05:00 · 2019-12-21 13:19:29 -05:00
71 changed files with 9778 additions and 880 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -84,6 +84,7 @@ cpuminer_SOURCES = \
  algo/cubehash/cubehash_sse2.c\
  algo/cubehash/cube-hash-2way.c \
  algo/echo/sph_echo.c \
+  algo/echo/echo-hash-4way.c \
  algo/echo/aes_ni/hash.c\
  algo/gost/sph_gost.c \
  algo/groestl/sph_groestl.c \
@@ -125,6 +126,7 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
  algo/lyra2/sponge-2way.c \
+  algo/lyra2/lyra2-hash-2way.c \
  algo/lyra2/lyra2-gate.c \
  algo/lyra2/lyra2rev2.c \
  algo/lyra2/lyra2rev2-4way.c \
--- a/README.md
+++ b/README.md
@@ -126,11 +126,11 @@ Supported Algorithms
                          x16rv2        Ravencoin (RVN)
                          x16rt         Gincoin (GIN)
                          x16rt-veil    Veil (VEIL)
-                          x16s          
+                          x16s          Pigeoncoin (PGN)
                          x17
-                          x21s          Pigeoncoin (PGN)
-                          x22i          
-                          x25x          Sinovative (SIN)
+                          x21s
+                          x22i
+                          x25x
                          xevan         Bitsend (BSD)
                          yescrypt      Globalboost-Y (BSTY)
                          yescryptr8    BitZeny (ZNY)
--- a/29
+++ b/29
@@ -1,13 +1,17 @@
 cpuminer-opt is a console program run from the command line using the
 keyboard, not the mouse.

+See also README.md for the list of supported algorithms.
+
 Security warning
 ----------------

 Miner programs are often flagged as malware by antivirus programs. This is
-a false positive, they are flagged simply because they are cryptocurrency 
-miners. The source code is open for anyone to inspect. If you don't trust 
-the software, don't use it.
+usually a false positive, they are flagged simply because they are
+cryptocurrency miners. However, some malware has been spread using the
+cover that miners are known to be subject to false positives. Always be on
+alert. The source code of cpuminer-opt is open for anyone to inspect.
+If you don't trust the software, don't download it.

 The cryptographic hashing code has been taken from trusted sources but has been
 modified for speed at the expense of accepted security practices. This
@@ -31,6 +35,25 @@ not supported. FreeBSD YMMV.
 Change Log
 ----------

+v3.10.6
+
+Added support for SSL stratum: stratum+tcps://
+
+Added job id reporting again, but leaner, suppressed with --quiet.
+
+AVX512 for x21s, x22i, lyra2z, allium
+
+Fixed share overflow warnings mining lbry with Ryzen (SHA).
+
+v3.10.5
+
+AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2. 
+Faster hmq1725 AVX2.
+
+v3.10.4
+
+AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).
+
 v3.10.3

 AVX512 for x12, x13, x14, x15.
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -463,6 +463,38 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
   return 0;
 }

+// One-shot update and final; inlen (bytes per lane) must be a non-zero multiple of 64
+int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
+                              const void *input, uint64_t inlen )
+{
+    __m256i *in = (__m256i*)input;
+    __m256i *buf = (__m256i*)S->buf;
+
+    while( inlen > BLAKE2S_BLOCKBYTES )
+    {
+       memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
+       S->buflen = BLAKE2S_BLOCKBYTES;
+       inlen -= BLAKE2S_BLOCKBYTES;
+       S->t[0] += BLAKE2S_BLOCKBYTES;
+       S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
+       blake2s_8way_compress( S, buf );
+       S->buflen = 0;
+       in += ( BLAKE2S_BLOCKBYTES >> 2 );
+    }
+
+    // last block: always copied as a full block and compressed with f[0] set
+    memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
+    S->buflen = BLAKE2S_BLOCKBYTES;
+    S->t[0] += S->buflen;
+    S->t[1] += ( S->t[0] < S->buflen );
+    if ( S->last_node )  S->f[1] = ~0U;
+    S->f[0] = ~0U;
+    blake2s_8way_compress( S, buf );
+
+    for ( int i = 0; i < 8; ++i )
+      casti_m256i( out, i ) = S->h[ i ];
+    return 0;
+}

 #endif // __AVX2__

--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -95,8 +95,8 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
 int blake2s_8way_update( blake2s_8way_state *S, const void *in,
                         uint64_t inlen );
 int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
-//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
-//                              const void *input, uint64_t inlen );
+int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
+                              const void *input, uint64_t inlen );

 #endif

--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -874,6 +874,57 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
                 mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
                 mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

+#define DH1L( m, sl, sr, a, b, c ) \
+   _mm256_add_epi32( \
+               _mm256_xor_si256( M[m], \
+                  _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
+                                    _mm256_srli_epi32( qt[a], sr ) ) ), \
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+
+#define DH1R( m, sl, sr, a, b, c ) \
+   _mm256_add_epi32( \
+               _mm256_xor_si256( M[m], \
+                  _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
+                                    _mm256_slli_epi32( qt[a], sr ) ) ), \
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+
+#define DH2L( m, rl, sl, h, a, b, c ) \
+   _mm256_add_epi32( _mm256_add_epi32( \
+       mm256_rol_32( dH[h], rl ), \
+          _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
+                 _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
+                                   _mm256_xor_si256( qt[b], qt[c] ) ) )
+
+#define DH2R( m, rl, sr, h, a, b, c ) \
+   _mm256_add_epi32( _mm256_add_epi32( \
+       mm256_rol_32( dH[h], rl ), \
+          _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
+                 _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
+                                   _mm256_xor_si256( qt[b], qt[c] ) ) )
+
+   dH[ 0] = DH1L(  0,  5,  5, 16, 24, 0 );
+   dH[ 1] = DH1R(  1,  7,  8, 17, 25, 1 );
+   dH[ 2] = DH1R(  2,  5,  5, 18, 26, 2 );
+   dH[ 3] = DH1R(  3,  1,  5, 19, 27, 3 );
+   dH[ 4] = DH1R(  4,  3,  0, 20, 28, 4 );
+   dH[ 5] = DH1L(  5,  6,  6, 21, 29, 5 );
+   dH[ 6] = DH1R(  6,  4,  6, 22, 30, 6 );
+   dH[ 7] = DH1R(  7, 11,  2, 23, 31, 7 );
+   dH[ 8] = DH2L(  8,  9,  8,  4, 24, 23,  8 );
+   dH[ 9] = DH2R(  9, 10,  6,  5, 25, 16,  9 );
+   dH[10] = DH2L( 10, 11,  6,  6, 26, 17, 10 );
+   dH[11] = DH2L( 11, 12,  4,  7, 27, 18, 11 );
+   dH[12] = DH2R( 12, 13,  3,  0, 28, 19, 12 );
+   dH[13] = DH2R( 13, 14,  4,  1, 29, 20, 13 );
+   dH[14] = DH2R( 14, 15,  7,  2, 30, 21, 14 );
+   dH[15] = DH2R( 15, 16,  2,  3, 31, 22, 15 );
+
+#undef DH1L
+#undef DH1R
+#undef DH2L
+#undef DH2R
+
+/*   
   dH[ 0] = _mm256_add_epi32(
                 _mm256_xor_si256( M[0],
                      _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
@@ -954,6 +1005,7 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
                 _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
+*/
 }

 static const __m256i final_s8[16] =
--- a/algo/echo/echo-hash-4way.c
+++ b/algo/echo/echo-hash-4way.c
@@ -0,0 +1,559 @@
+#if defined(__AVX512VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#include "simd-utils.h"
+#include "echo-hash-4way.h"
+
+/*
+#include <memory.h>
+#include "miner.h"
+#include "hash_api.h"
+//#include "vperm.h"
+#include <immintrin.h>
+*/
+/*
+#ifndef NO_AES_NI
+#include <wmmintrin.h>
+#else
+#include <tmmintrin.h>
+#endif
+*/
+
+// not used
+/*
+const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
+const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
+const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
+const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
+const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
+const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
+const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
+const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
+const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
+const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
+const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
+const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
+const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
+const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
+const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
+const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
+const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
+const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
+const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
+const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
+const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
+*/
+
+/*
+MYALIGN const unsigned int 	const1[]		= {0x00000001, 0x00000000, 0x00000000, 0x00000000};
+MYALIGN const unsigned int	mul2mask[]		= {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
+MYALIGN const unsigned int	lsbmask[]		= {0x01010101, 0x01010101, 0x01010101, 0x01010101};
+MYALIGN const unsigned int	invshiftrows[]	= {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
+MYALIGN const unsigned int	zero[]			= {0x00000000, 0x00000000, 0x00000000, 0x00000000};
+*/
+
+MYALIGN const unsigned int	mul2ipt[]		= {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
+
+// do these need to be reversed?
+
+#define mul2mask \
+   m512_const4_32( 0x00001b00, 0, 0, 0 ) 
+
+#define lsbmask    m512_const1_32( 0x01010101 ) 
+
+#define ECHO_SUBBYTES( state, i, j ) \
+	state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
+	state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \
+	k1 = _mm512_add_epi32( k1, m512_one_32 )
+
+#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
+{ \
+   const int j1 = ( j+1 ) & 3; \
+   const int j2 = ( j+2 ) & 3; \
+   const int j3 = ( j+3 ) & 3; \
+   s2 = _mm512_add_epi8( state1[ 0 ][ j ], state1[ 0 ][ j ] ); \
+	t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
+	t1 = _mm512_and_si512( t1, lsbmask );\
+	t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
+	s2 = _mm512_xor_si512( s2, t2 ); \
+	state2[ 0 ][ j ] = s2; \
+	state2[ 1 ][ j ] = state1[ 0 ][ j ]; \
+	state2[ 2 ][ j ] = state1[ 0 ][ j ]; \
+	state2[ 3 ][ j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\
+	s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
+	t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
+	t1 = _mm512_and_si512( t1, lsbmask ); \
+	t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
+	s2 = _mm512_xor_si512( s2, t2 );\
+	state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
+                            _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
+	state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
+	state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
+	state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
+	s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
+	t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
+	t1 = _mm512_and_si512( t1, lsbmask ); \
+	t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
+	s2 = _mm512_xor_si512( s2, t2 ); \
+	state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
+	state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
+                            _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
+	state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
+	state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 2 ][ j2 ] ); \
+	s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
+	t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
+	t1 = _mm512_and_si512( t1, lsbmask ); \
+	t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
+	s2 = _mm512_xor_si512( s2, t2 ); \
+	state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
+	state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
+	state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
+                            _mm512_xor_si512( s2, state1[ 3 ][ j3 ] ) ); \
+	state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
+} while(0)
+
+#define ECHO_ROUND_UNROLL2 \
+	ECHO_SUBBYTES(_state, 0, 0);\
+	ECHO_SUBBYTES(_state, 1, 0);\
+	ECHO_SUBBYTES(_state, 2, 0);\
+	ECHO_SUBBYTES(_state, 3, 0);\
+	ECHO_SUBBYTES(_state, 0, 1);\
+	ECHO_SUBBYTES(_state, 1, 1);\
+	ECHO_SUBBYTES(_state, 2, 1);\
+	ECHO_SUBBYTES(_state, 3, 1);\
+	ECHO_SUBBYTES(_state, 0, 2);\
+	ECHO_SUBBYTES(_state, 1, 2);\
+	ECHO_SUBBYTES(_state, 2, 2);\
+	ECHO_SUBBYTES(_state, 3, 2);\
+	ECHO_SUBBYTES(_state, 0, 3);\
+	ECHO_SUBBYTES(_state, 1, 3);\
+	ECHO_SUBBYTES(_state, 2, 3);\
+	ECHO_SUBBYTES(_state, 3, 3);\
+	ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
+	ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
+	ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
+	ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
+	ECHO_SUBBYTES(_state2, 0, 0);\
+	ECHO_SUBBYTES(_state2, 1, 0);\
+	ECHO_SUBBYTES(_state2, 2, 0);\
+	ECHO_SUBBYTES(_state2, 3, 0);\
+	ECHO_SUBBYTES(_state2, 0, 1);\
+	ECHO_SUBBYTES(_state2, 1, 1);\
+	ECHO_SUBBYTES(_state2, 2, 1);\
+	ECHO_SUBBYTES(_state2, 3, 1);\
+	ECHO_SUBBYTES(_state2, 0, 2);\
+	ECHO_SUBBYTES(_state2, 1, 2);\
+	ECHO_SUBBYTES(_state2, 2, 2);\
+	ECHO_SUBBYTES(_state2, 3, 2);\
+	ECHO_SUBBYTES(_state2, 0, 3);\
+	ECHO_SUBBYTES(_state2, 1, 3);\
+	ECHO_SUBBYTES(_state2, 2, 3);\
+	ECHO_SUBBYTES(_state2, 3, 3);\
+	ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
+	ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
+	ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
+	ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+
+
+
+#define SAVESTATE(dst, src)\
+	dst[0][0] = src[0][0];\
+	dst[0][1] = src[0][1];\
+	dst[0][2] = src[0][2];\
+	dst[0][3] = src[0][3];\
+	dst[1][0] = src[1][0];\
+	dst[1][1] = src[1][1];\
+	dst[1][2] = src[1][2];\
+	dst[1][3] = src[1][3];\
+	dst[2][0] = src[2][0];\
+	dst[2][1] = src[2][1];\
+	dst[2][2] = src[2][2];\
+	dst[2][3] = src[2][3];\
+	dst[3][0] = src[3][0];\
+	dst[3][1] = src[3][1];\
+	dst[3][2] = src[3][2];\
+	dst[3][3] = src[3][3]
+
+
+void echo_4way_compress( echo_4way_context *ctx, const unsigned char *pmsg,
+               unsigned int uBlockCount )
+{
+  unsigned int r, b, i, j;
+  __m512i t1, t2, s2, k1;
+  __m512i _state[4][4], _state2[4][4], _statebackup[4][4]; 
+
+// unroll   
+  for ( i = 0; i < 4; i++ )
+  for ( j = 0; j < ctx->uHashSize / 256; j++ )
+	 _state[ i ][ j ] = ctx->state[ i ][ j ];
+
+  for ( b = 0; b < uBlockCount; b++ )
+  {
+    ctx->k = _mm512_add_epi64( ctx->k, ctx->const1536 );
+
+    // load message, make aligned, remove loadu
+    for( j = ctx->uHashSize / 256; j < 4; j++ )
+    {
+      for ( i = 0; i < 4; i++ )
+	   {
+        _state[ i ][ j ] = _mm512_loadu_si512( 
+                     (__m512i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
+	   }
+	 }
+
+    // save state
+	 SAVESTATE( _statebackup, _state );
+
+	 k1 = ctx->k;
+
+	 for ( r = 0; r < ctx->uRounds / 2; r++ )
+	 {
+		ECHO_ROUND_UNROLL2;
+	 }
+		
+	 if ( ctx->uHashSize == 256 )
+	 {
+	   for ( i = 0; i < 4; i++ )
+	   {
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _state[ i ][ 1 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _state[ i ][ 2 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _state[ i ][ 3 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _statebackup[ i ][ 0 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _statebackup[ i ][ 1 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _statebackup[ i ][ 2 ] ) ;
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _statebackup[ i ][ 3 ] );
+	   }
+	 }
+	 else
+	 {
+	   for ( i = 0; i < 4; i++ )
+	   {
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _state[ i ][ 2 ] );
+		   _state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
+                                              _state[ i ][ 3 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _statebackup[ i ][ 0 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ] [0 ],
+                                              _statebackup[ i ][ 2 ] );
+		   _state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
+                                              _statebackup[ i ][ 1 ] );
+		   _state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
+                                              _statebackup[ i ][ 3 ] );
+      }
+	 }
+    pmsg += ctx->uBlockLength;
+  }
+  SAVESTATE(ctx->state, _state);
+
+}
+
+
+
+int echo_4way_init( echo_4way_context *ctx, int nHashSize )
+{
+	int i, j;
+
+   ctx->k = m512_zero; 
+	ctx->processed_bits = 0;
+	ctx->uBufferBytes = 0;
+
+	switch( nHashSize )
+	{
+		case 256:
+			ctx->uHashSize = 256;
+			ctx->uBlockLength = 192;
+			ctx->uRounds = 8;
+			ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
+			ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
+			break;
+
+		case 512:
+			ctx->uHashSize = 512;
+			ctx->uBlockLength = 128;
+			ctx->uRounds = 10;
+			ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
+			ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400 );
+			break;
+
+		default:
+			return BAD_HASHBITLEN;
+	}
+
+
+	for( i = 0; i < 4; i++ )
+		for( j = 0; j < nHashSize / 256; j++ )
+			ctx->state[ i ][ j ] = ctx->hashsize;
+
+	for( i = 0; i < 4; i++ )
+		for( j = nHashSize / 256; j < 4; j++ )
+			ctx->state[ i ][ j ] = m512_zero;
+
+	return SUCCESS;
+}
+
+int echo_4way_update( echo_4way_context *state, const BitSequence *data, DataLength databitlen )
+{
+	unsigned int uByteLength, uBlockCount, uRemainingBytes;
+
+	uByteLength = (unsigned int)(databitlen / 8);
+
+	if ( ( state->uBufferBytes + uByteLength ) >= state->uBlockLength )
+	{
+		if ( state->uBufferBytes != 0 )
+		{
+			// Fill the buffer
+			memcpy( state->buffer + state->uBufferBytes,
+               (void*)data, state->uBlockLength - state->uBufferBytes );
+
+			// Process buffer
+			echo_4way_compress( state, state->buffer, 1 );
+			state->processed_bits += state->uBlockLength * 8;
+
+			data += state->uBlockLength - state->uBufferBytes;
+			uByteLength -= state->uBlockLength - state->uBufferBytes;
+		}
+
+		// buffer now does not contain any unprocessed bytes
+
+		uBlockCount = uByteLength / state->uBlockLength;
+		uRemainingBytes = uByteLength % state->uBlockLength;
+
+		if ( uBlockCount > 0 )
+		{
+			echo_4way_compress( state, data, uBlockCount );
+
+			state->processed_bits += uBlockCount * state->uBlockLength * 8;
+			data += uBlockCount * state->uBlockLength;
+		}
+
+		if ( uRemainingBytes > 0 )
+		{
+			memcpy( state->buffer, (void*)data, uRemainingBytes );
+		}
+
+		state->uBufferBytes = uRemainingBytes;
+	}
+	else
+	{
+		memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
+		state->uBufferBytes += uByteLength;
+	}
+
+	return 0;
+}
+
+int echo_4way_close( echo_4way_context *state, BitSequence *hashval )
+{
+	__m512i remainingbits;
+
+	// Add remaining bytes in the buffer
+	state->processed_bits += state->uBufferBytes * 8;
+
+	remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 );
+
+	// Pad with 0x80
+	state->buffer[ state->uBufferBytes++ ] = 0x80;
+	
+	// Enough buffer space for padding in this block?
+	if ( ( state->uBlockLength - state->uBufferBytes ) >= 18)
+	{
+		// Pad with zeros
+		memset( state->buffer + state->uBufferBytes, 0,
+                         state->uBlockLength - ( state->uBufferBytes + 18 ) );
+
+		// Hash size
+		*( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) )
+           = state->uHashSize;
+
+		// Processed bits
+		*( ( DataLength*)( state->buffer + state->uBlockLength - 16 ) )
+           = state->processed_bits;
+		*( ( DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0;
+
+		// Last block contains message bits?
+		if ( state->uBufferBytes == 1 )
+		{
+			state->k = _mm512_xor_si512( state->k, state->k );
+			state->k = _mm512_sub_epi64( state->k, state->const1536 );
+		}
+		else
+		{
+			state->k = _mm512_add_epi64( state->k, remainingbits );
+			state->k = _mm512_sub_epi64( state->k, state->const1536 );
+		}
+
+		// Compress
+		echo_4way_compress( state, state->buffer, 1 );
+	}
+	else
+	{
+		// Fill with zero and compress
+		memset( state->buffer + state->uBufferBytes, 0,
+                        state->uBlockLength - state->uBufferBytes );
+		state->k = _mm512_add_epi64( state->k, remainingbits );
+		state->k = _mm512_sub_epi64( state->k, state->const1536 );
+		echo_4way_compress( state, state->buffer, 1 );
+
+		// Last block
+		memset( state->buffer, 0, state->uBlockLength - 18 );
+
+		// Hash size
+		*( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) )
+            = state->uHashSize;
+
+		// Processed bits
+		*( (DataLength*)( state->buffer + state->uBlockLength - 16 ) )
+            = state->processed_bits;
+		*( (DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0;
+
+		// Compress the last block
+		state->k = _mm512_xor_si512(state->k, state->k);
+		state->k = _mm512_sub_epi64(state->k, state->const1536);
+		echo_4way_compress(state, state->buffer, 1);
+	}
+
+	// Store the hash value
+	_mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0][ 0 ]);
+	_mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1][ 0 ]);
+
+	if ( state->uHashSize == 512 )
+	{
+		_mm512_storeu_si512((__m512i*)hashval + 2, state->state[ 2 ][ 0 ]);
+		_mm512_storeu_si512((__m512i*)hashval + 3, state->state[ 3 ][ 0 ]);
+	}
+
+	return 0;
+}
+
+int echo_4way_update_close( echo_4way_context *state, BitSequence *hashval,
+                              const BitSequence *data, DataLength databitlen )
+{
+  unsigned int uByteLength, uBlockCount, uRemainingBytes;
+
+  uByteLength = (unsigned int)(databitlen / 8);
+
+  if ( (state->uBufferBytes + uByteLength) >= state->uBlockLength )
+  {
+     if ( state->uBufferBytes != 0 )
+     {
+        // Fill the buffer
+        memcpy( state->buffer + state->uBufferBytes,
+                   (void*)data, state->uBlockLength - state->uBufferBytes );
+
+        // Process buffer
+        echo_4way_compress( state, state->buffer, 1 );
+        state->processed_bits += state->uBlockLength * 8;
+
+        data += state->uBlockLength - state->uBufferBytes;
+        uByteLength -= state->uBlockLength - state->uBufferBytes;
+     }
+
+     // buffer now does not contain any unprocessed bytes
+
+     uBlockCount = uByteLength / state->uBlockLength;
+     uRemainingBytes = uByteLength % state->uBlockLength;
+
+     if ( uBlockCount > 0 )
+     {
+        echo_4way_compress( state, data, uBlockCount );
+        state->processed_bits += uBlockCount * state->uBlockLength * 8;
+        data += uBlockCount * state->uBlockLength;
+     }
+
+     if ( uRemainingBytes > 0 )
+        memcpy( state->buffer, (void*)data, uRemainingBytes );
+     state->uBufferBytes = uRemainingBytes;
+  }
+  else
+  {
+     memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
+     state->uBufferBytes += uByteLength;
+  } 
+
+  __m512i remainingbits;
+
+  // Add remaining bytes in the buffer
+  state->processed_bits += state->uBufferBytes * 8;
+
+  remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 );
+
+  // Pad with 0x80
+  state->buffer[ state->uBufferBytes++ ] = 0x80;
+  // Enough buffer space for padding in this block?
+  if ( (state->uBlockLength - state->uBufferBytes) >= 18 )
+   {
+     // Pad with zeros
+     memset( state->buffer + state->uBufferBytes, 0,
+                        state->uBlockLength - (state->uBufferBytes + 18) );
+
+     // Hash size
+     *( (unsigned short*)(state->buffer + state->uBlockLength - 18) )
+                   = state->uHashSize;
+
+     // Processed bits
+     *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
+                   state->processed_bits;
+     *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
+
+     // Last block contains message bits?
+     if( state->uBufferBytes == 1 )
+     {
+        state->k = _mm512_xor_si512( state->k, state->k );
+        state->k = _mm512_sub_epi64( state->k, state->const1536 );
+     }
+     else
+     {
+        state->k = _mm512_add_epi64( state->k, remainingbits );
+        state->k = _mm512_sub_epi64( state->k, state->const1536 );
+     }
+
+     // Compress
+     echo_4way_compress( state, state->buffer, 1 );
+  }
+  else
+  {
+     // Fill with zero and compress
+     memset( state->buffer + state->uBufferBytes, 0,
+                state->uBlockLength - state->uBufferBytes );
+     state->k = _mm512_add_epi64( state->k, remainingbits );
+     state->k = _mm512_sub_epi64( state->k, state->const1536 );
+     echo_4way_compress( state, state->buffer, 1 );
+
+     // Last block
+     memset( state->buffer, 0, state->uBlockLength - 18 );
+
+     // Hash size
+     *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
+                 state->uHashSize;
+
+     // Processed bits
+     *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
+                  state->processed_bits;
+     *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
+     // Compress the last block
+     state->k = _mm512_xor_si512( state->k, state->k );
+     state->k = _mm512_sub_epi64( state->k, state->const1536 );
+     echo_4way_compress( state, state->buffer, 1) ;
+  }
+
+  // Store the hash value
+  _mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] );
+  _mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] );
+
+  if ( state->uHashSize == 512 )
+  {
+     _mm512_storeu_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
+     _mm512_storeu_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
+
+  }
+  return 0;
+}
+
+#endif
--- a/algo/echo/echo-hash-4way.h
+++ b/algo/echo/echo-hash-4way.h
@@ -0,0 +1,36 @@
+#if !defined(ECHO_HASH_4WAY_H__)
+#define ECHO_HASH_4WAY_H__ 1
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#include "simd-utils.h"
+
+typedef struct
+{
+   __m512i    state[4][4];
+   __m512i    buffer[ 4 * 192 / 16 ];  // 4x128 interleaved 192 bytes
+   __m512i    k;
+   __m512i    hashsize;
+   __m512i    const1536;
+
+   unsigned int   uRounds;
+   unsigned int   uHashSize;
+   unsigned int   uBlockLength;
+   unsigned int   uBufferBytes;
+   unsigned int   processed_bits;
+
+} echo_4way_context __attribute__ ((aligned (64)));
+
+int echo_4way_init( echo_4way_context *state, int hashbitlen );
+
+
+int echo_4way_update( echo_4way_context *state, const void *data,
+    unsigned int databitlen);
+
+int echo_4way_close( echo_4way_context *state, void *hashval );
+
+int echo_4way_update_close( echo_4way_context *state, void *hashval,
+                              const void *data, int databitlen );
+
+#endif 
+#endif
--- a/algo/groestl/aes_ni/groestl-version.h
+++ b/algo/groestl/aes_ni/groestl-version.h
@@ -9,6 +9,7 @@

 //#ifndef NO_AES_NI

+// Not to be confused with AVX512VAES
 #define VAES
 // #define VAVX
 // #define VVPERM
--- a/algo/haval/haval-8way-helper.c
+++ b/algo/haval/haval-8way-helper.c
@@ -0,0 +1,115 @@
+/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
+/*
+ * Helper code, included (three times !) by HAVAL implementation.
+ *
+ * TODO: try to merge this with md_helper.c.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#undef SPH_XCAT
+#define SPH_XCAT(a, b)    SPH_XCAT_(a, b)
+#undef SPH_XCAT_
+#define SPH_XCAT_(a, b)   a ## b
+
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_update)
+( haval_8way_context *sc, const void *data, size_t len )
+{
+   __m256i *vdata = (__m256i*)data;
+   unsigned current;
+
+   current = (unsigned)sc->count_low & 127U;
+   while ( len > 0 )
+   {
+      unsigned clen;
+      uint32_t clow, clow2;
+
+      clen = 128U - current;
+      if ( clen > len )
+         clen = len;
+      memcpy_256( sc->buf + (current>>2), vdata, clen>>2 );
+      vdata += clen>>2;
+      current += clen;
+      len -= clen;
+      if ( current == 128U )
+      {
+         DSTATE_8W;
+         IN_PREPARE_8W(sc->buf);
+         RSTATE_8W;
+         SPH_XCAT(CORE_8W, PASSES)(INW_8W);
+         WSTATE_8W;
+         current = 0;
+      }
+      clow = sc->count_low;
+      clow2 = clow + clen;
+      sc->count_low = clow2;
+      if ( clow2 < clow )
+         sc->count_high ++;
+   }
+}
+
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
+                                                void *dst)
+{
+   unsigned current;
+   DSTATE_8W;
+
+   current = (unsigned)sc->count_low & 127UL;
+
+   sc->buf[ current>>2 ] = m256_one_32;
+   current += 4;   
+   RSTATE_8W;
+   if ( current > 116UL )
+   {
+      memset_zero_256( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
+      do
+      {
+         IN_PREPARE_8W(sc->buf);
+         SPH_XCAT(CORE_8W, PASSES)(INW_8W);
+      } while (0);
+      current = 0;
+   }
+
+   uint32_t t1, t2;
+   memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
+   t1 = 0x01 | (PASSES << 3);
+   t2 = sc->olen << 3;
+   sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
+   sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
+   sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
+                                     | (sc->count_low >> 29) );
+   do
+   {
+      IN_PREPARE_8W(sc->buf);
+      SPH_XCAT(CORE_8W, PASSES)(INW_8W);
+   } while (0);
+   WSTATE_8W;
+   haval_8way_out( sc, dst );
+}
--- a/algo/haval/haval-hash-4way.c
+++ b/algo/haval/haval-hash-4way.c
@@ -40,7 +40,7 @@
 #include <string.h>
 #include "haval-hash-4way.h"

-// won't compile with sse4.2
+// won't compile with sse4.2, not a problem, it's only used with AVX2 4 way.
 //#if defined (__SSE4_2__)
 #if defined(__AVX__)

@@ -518,6 +518,301 @@ do { \

 #define INMSG(i)   msg[i]

+#if defined(__AVX2__)
+
+// Haval-256 8 way 32 bit avx2
+
+#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
+   _mm256_xor_si256( x0, \
+       _mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
+                      _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
+                                     _mm256_and_si256( x3, x6 ) ) ) ) \
+
+#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
+   _mm256_xor_si256( \
+      _mm256_and_si256( x2, \
+         _mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
+                        _mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
+                                       _mm256_xor_si256( x6, x0 ) ) ) ), \
+         _mm256_xor_si256( \
+             _mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
+             _mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
+
+#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
+  _mm256_xor_si256( \
+    _mm256_and_si256( x3, \
+      _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
+                     _mm256_xor_si256( x6, x0 ) ) ), \
+      _mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
+                                   _mm256_and_si256( x2, x5 ) ), x0 ) )
+
+#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
+  _mm256_xor_si256( \
+     _mm256_xor_si256( \
+        _mm256_and_si256( x3, \
+           _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
+                                         _mm256_or_si256( x4, x6 ) ), x5 ) ), \
+        _mm256_and_si256( x4, \
+           _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
+                          _mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
+     _mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
+
+
+#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
+   _mm256_xor_si256( \
+       _mm256_and_si256( x0, \
+            mm256_not( _mm256_xor_si256( \
+                    _mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
+      _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
+                                    _mm256_and_si256( x2, x5 ) ), \
+                                    _mm256_and_si256( x3, x6 ) ) )
+
+#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F1_8W(x1, x0, x3, x5, x6, x2, x4)
+#define FP3_2_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F2_8W(x4, x2, x1, x0, x5, x3, x6)
+#define FP3_3_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F3_8W(x6, x1, x2, x3, x4, x5, x0)
+
+#define FP4_1_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F1_8W(x2, x6, x1, x4, x5, x3, x0)
+#define FP4_2_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F2_8W(x3, x5, x2, x0, x1, x6, x4)
+#define FP4_3_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F3_8W(x1, x4, x3, x6, x0, x2, x5)
+#define FP4_4_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F4_8W(x6, x4, x0, x5, x2, x1, x3)
+
+#define FP5_1_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F1_8W(x3, x4, x1, x0, x5, x2, x6)
+#define FP5_2_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F2_8W(x6, x2, x1, x0, x3, x4, x5)
+#define FP5_3_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F3_8W(x2, x6, x0, x4, x3, x1, x5)
+#define FP5_4_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F4_8W(x1, x5, x3, x2, x0, x4, x6)
+#define FP5_5_8W(x6, x5, x4, x3, x2, x1, x0) \
+   F5_8W(x2, x5, x0, x6, x4, x3, x1)
+
+#define STEP_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
+do { \
+   __m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
+   x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
+                                      mm256_ror_32( x7, 11 ) ), \
+                       _mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
+} while (0)
+
+#define PASS1_8W(n, in)   do { \
+      unsigned pass_count; \
+      for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+         STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
+            in(pass_count + 0), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
+            in(pass_count + 1), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
+            in(pass_count + 2), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
+            in(pass_count + 3), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
+            in(pass_count + 4), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
+            in(pass_count + 5), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
+            in(pass_count + 6), SPH_C32(0x00000000)); \
+         STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
+            in(pass_count + 7), SPH_C32(0x00000000)); \
+         } \
+   } while (0)
+
+#define PASSG_8W(p, n, in)   do { \
+      unsigned pass_count; \
+      for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+         STEP_8W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
+            in(MP ## p[pass_count + 0]), \
+            RK ## p[pass_count + 0]); \
+         STEP_8W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
+            in(MP ## p[pass_count + 1]), \
+            RK ## p[pass_count + 1]); \
+         STEP_8W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
+            in(MP ## p[pass_count + 2]), \
+            RK ## p[pass_count + 2]); \
+         STEP_8W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
+            in(MP ## p[pass_count + 3]), \
+            RK ## p[pass_count + 3]); \
+         STEP_8W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
+            in(MP ## p[pass_count + 4]), \
+            RK ## p[pass_count + 4]); \
+         STEP_8W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
+            in(MP ## p[pass_count + 5]), \
+            RK ## p[pass_count + 5]); \
+         STEP_8W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
+            in(MP ## p[pass_count + 6]), \
+            RK ## p[pass_count + 6]); \
+         STEP_8W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
+            in(MP ## p[pass_count + 7]), \
+            RK ## p[pass_count + 7]); \
+         } \
+   } while (0)
+
+#define PASS2_8W(n, in)    PASSG_8W(2, n, in)
+#define PASS3_8W(n, in)    PASSG_8W(3, n, in)
+#define PASS4_8W(n, in)    PASSG_8W(4, n, in)
+#define PASS5_8W(n, in)    PASSG_8W(5, n, in)
+
+#define SAVE_STATE_8W \
+   __m256i u0, u1, u2, u3, u4, u5, u6, u7; \
+   do { \
+      u0 = s0; \
+      u1 = s1; \
+      u2 = s2; \
+      u3 = s3; \
+      u4 = s4; \
+      u5 = s5; \
+      u6 = s6; \
+      u7 = s7; \
+   } while (0)
+
+#define UPDATE_STATE_8W \
+do { \
+   s0 = _mm256_add_epi32( s0, u0 ); \
+   s1 = _mm256_add_epi32( s1, u1 ); \
+   s2 = _mm256_add_epi32( s2, u2 ); \
+   s3 = _mm256_add_epi32( s3, u3 ); \
+   s4 = _mm256_add_epi32( s4, u4 ); \
+   s5 = _mm256_add_epi32( s5, u5 ); \
+   s6 = _mm256_add_epi32( s6, u6 ); \
+   s7 = _mm256_add_epi32( s7, u7 ); \
+} while (0)
+
+#define CORE_8W5(in)  do { \
+      SAVE_STATE_8W; \
+      PASS1_8W(5, in); \
+      PASS2_8W(5, in); \
+      PASS3_8W(5, in); \
+      PASS4_8W(5, in); \
+      PASS5_8W(5, in); \
+      UPDATE_STATE_8W; \
+   } while (0)
+
+#define DSTATE_8W   __m256i s0, s1, s2, s3, s4, s5, s6, s7
+
+#define RSTATE_8W \
+do { \
+   s0 = sc->s0; \
+   s1 = sc->s1; \
+   s2 = sc->s2; \
+   s3 = sc->s3; \
+   s4 = sc->s4; \
+   s5 = sc->s5; \
+   s6 = sc->s6; \
+   s7 = sc->s7; \
+} while (0)
+
+#define WSTATE_8W \
+do { \
+   sc->s0 = s0; \
+   sc->s1 = s1; \
+   sc->s2 = s2; \
+   sc->s3 = s3; \
+   sc->s4 = s4; \
+   sc->s5 = s5; \
+   sc->s6 = s6; \
+   sc->s7 = s7; \
+} while (0)
+
+// Reset the context: initial chaining words, output length, zero byte count.
+static void
+haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
+{
+   sc->s0 = m256_const1_32( 0x243F6A88UL );   // presumably the same word in all lanes
+   sc->s1 = m256_const1_32( 0x85A308D3UL );
+   sc->s2 = m256_const1_32( 0x13198A2EUL );
+   sc->s3 = m256_const1_32( 0x03707344UL );
+   sc->s4 = m256_const1_32( 0xA4093822UL );
+   sc->s5 = m256_const1_32( 0x299F31D0UL );
+   sc->s6 = m256_const1_32( 0x082EFA98UL );
+   sc->s7 = m256_const1_32( 0xEC4E6C89UL );
+   sc->olen = olen;       // API_8W passes xxx >> 5, i.e. 8 for haval256
+   sc->passes = passes;   // stored only; the close code uses the PASSES macro
+   sc->count_high = 0;
+   sc->count_low = 0;
+}
+#define IN_PREPARE_8W(indata) const __m256i *const load_ptr_8w = (indata)
+
+#define INW_8W(i)   load_ptr_8w[ i ] 
+
+// Copy the eight state vectors to dst as 8 consecutive __m256i (256 bytes).
+static void
+haval_8way_out( haval_8way_context *sc, void *dst )
+{
+   __m256i *buf = (__m256i*)dst;   // dst is written with whole __m256i stores
+   DSTATE_8W;
+   RSTATE_8W;
+   buf[0] = s0;
+   buf[1] = s1;
+   buf[2] = s2;
+   buf[3] = s3;
+   buf[4] = s4;
+   buf[5] = s5;
+   buf[6] = s6;
+   buf[7] = s7;
+}
+
+#undef PASSES
+#define PASSES   5
+#include "haval-8way-helper.c"
+
+#define API_8W(xxx, y) \
+void \
+haval ## xxx ## _ ## y ## _8way_init(void *cc) \
+{ \
+   haval_8way_init(cc, xxx >> 5, y); \
+} \
+ \
+void \
+haval ## xxx ## _ ## y ## _8way_update (void *cc, const void *data, size_t len) \
+{ \
+   haval ## y ## _8way_update(cc, data, len); \
+} \
+ \
+void \
+haval ## xxx ## _ ## y ## _8way_close(void *cc, void *dst) \
+{ \
+   haval ## y ## _8way_close(cc, dst); \
+} \
+
+API_8W(256, 5)
+
+#define RVAL_8W \
+do { \
+   s0 = val[0]; \
+   s1 = val[1]; \
+   s2 = val[2]; \
+   s3 = val[3]; \
+   s4 = val[4]; \
+   s5 = val[5]; \
+   s6 = val[6]; \
+   s7 = val[7]; \
+} while (0)
+
+#define WVAL_8W \
+do { \
+   val[0] = s0; \
+   val[1] = s1; \
+   val[2] = s2; \
+   val[3] = s3; \
+   val[4] = s4; \
+   val[5] = s5; \
+   val[6] = s6; \
+   val[7] = s7; \
+} while (0)
+
+#define INMSG_8W(i)   msg[i]
+
+
+
+#endif // AVX2
+
 #ifdef __cplusplus
 }
 #endif	
--- a/algo/haval/haval-hash-4way.h
+++ b/algo/haval/haval-hash-4way.h
@@ -59,7 +59,7 @@
 */

 #ifndef HAVAL_HASH_4WAY_H__
-#define HAVAL_HASH_4WAY_H__
+#define HAVAL_HASH_4WAY_H__ 1

 #if defined(__AVX__)

@@ -84,10 +84,30 @@ typedef haval_4way_context haval256_5_4way_context;

 void haval256_5_4way_init( void *cc );

-void haval256_5_4way( void *cc, const void *data, size_t len );
+void haval256_5_4way_update( void *cc, const void *data, size_t len );
+#define haval256_5_4way haval256_5_4way_update

 void haval256_5_4way_close( void *cc, void *dst );

+#if defined(__AVX2__)
+
+typedef struct {
+   __m256i buf[32];                         // pending 128 byte block of each lane, 32 words x 8 lanes
+   __m256i s0, s1, s2, s3, s4, s5, s6, s7;  // chaining state, one vector per state word
+   unsigned olen, passes;                   // copied from the init arguments
+   uint32_t count_high, count_low;          // 64 bit count of bytes absorbed, split in two words
+} haval_8way_context __attribute__ ((aligned (64)));
+
+typedef haval_8way_context haval256_5_8way_context;
+
+void haval256_5_8way_init( void *cc );
+
+void haval256_5_8way_update( void *cc, const void *data, size_t len );
+
+void haval256_5_8way_close( void *cc, void *dst );
+
+#endif // AVX2
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -1,15 +1,206 @@
 #include "lyra2-gate.h"
 #include <memory.h>
 #include <mm_malloc.h>
-
-#if defined (ALLIUM_4WAY)	
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/groestl/aes_ni/hash-groestl256.h"

+#if defined (ALLIUM_8WAY)  
+
+typedef struct {
+   blake256_8way_context     blake;    // initialised per work in scanhash_allium_8way
+   keccak256_8way_context    keccak;
+   cube_4way_context          cube;    // 4 way only, used twice per 8 way hash
+   skein256_8way_context     skein;
+   hashState_groestl256      groestl;  // applied to one lane at a time
+} allium_8way_ctx_holder;
+
+static __thread allium_8way_ctx_holder allium_8way_ctx;
+
+bool init_allium_8way_ctx()   // registered as miner_thread_init for ALLIUM_8WAY
+{
+   keccak256_8way_init( &allium_8way_ctx.keccak );
+   cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
+   skein256_8way_init( &allium_8way_ctx.skein );
+   init_groestl256( &allium_8way_ctx.groestl, 32 );
+   return true;   // .blake is not touched here, scanhash_allium_8way sets it up
+}
+
+// Allium for 8 nonces at once: blake256, keccak256, lyra2re, cubehash256,
+// lyra2re, skein256 and groestl256, in that order.
+//
+// input  8x32 interleaved block headers. Only the 16 bytes per lane starting
+//        at input + (64<<3) are hashed here, the blake context copied from
+//        allium_8way_ctx must already hold the first 64 bytes.
+// state  receives the 8 results as consecutive 32 byte hashes, lane 0 first.
+void allium_8way_hash( void *state, const void *input )
+{
+   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
+   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
+   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (64)));
+   uint32_t hash2[8] __attribute__ ((aligned (64)));
+   uint32_t hash3[8] __attribute__ ((aligned (64)));
+   uint32_t hash4[8] __attribute__ ((aligned (64)));
+   uint32_t hash5[8] __attribute__ ((aligned (64)));
+   uint32_t hash6[8] __attribute__ ((aligned (64)));
+   uint32_t hash7[8] __attribute__ ((aligned (64)));
+   allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
+
+   // Work on a copy of the per thread template so its blake midstate and
+   // initialised contexts stay intact for the next call.
+   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
+   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
+   blake256_8way_close( &ctx.blake, vhash );
+
+   // Keccak takes 8x64 interleaved input.
+   rintrlv_8x32_8x64( vhashA, vhash, 256 );
+   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
+   keccak256_8way_close( &ctx.keccak, vhash );
+
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 vhash, 256 );
+
+   // LYRA2RE_2WAY handles two lanes per call, 2x256 interleaved, in place.
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+
+   // Cubehash runs 4 lanes per call, 4x128 interleaved. The context is
+   // initialised again between the two halves.
+   intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
+
+   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+
+   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
+
+   // Second Lyra2 stage, same lane pairing as the first.
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+
+   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                hash7, 256 );
+
+   skein256_8way_update( &ctx.skein, vhash, 32 );
+   skein256_8way_close( &ctx.skein, vhash );
+
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 vhash, 256 );
+
+   // Groestl is run one lane at a time, restoring the initialised context
+   // between lanes. Nothing is restored after the last lane since ctx is a
+   // local copy that is dropped on return.
+   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+
+   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+
+   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+
+   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+
+   update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+
+   update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+
+   update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+
+   update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
+}
+
+// Scan nonces 8 at a time from pdata[19] while n < max_nonce - 8 and no work
+// restart is flagged; hits go to submit_lane_solution. Always returns 0.
+int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 8;
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   // Apply the benchmark target before caching it so the lane filter sees it.
+   if ( opt_benchmark ) ptarget[7] = 0x0000ff;
+   const uint32_t Htarg = ptarget[7];
+
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
+   blake256_8way_init( &allium_8way_ctx.blake );
+   blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
+
+   do {
+     *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
+                                                 n+3, n+2, n+1, n ) );
+     allium_8way_hash( hash, vdata );
+     pdata[19] = n;
+     for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
+     {
+        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n + lane;
+           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
+        }
+     }
+     n += 8;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+
+#elif defined (ALLIUM_4WAY)  
+
+
 typedef struct {
   blake256_4way_context     blake;
   keccak256_4way_context    keccak;
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -78,8 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev3;
  gate->hash      = (void*)&lyra2rev3_hash;
 #endif
-//  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
-  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
  gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
  opt_target_factor = 256.0;
  return true;
@@ -95,10 +94,14 @@ bool lyra2rev2_thread_init()
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
+#if defined (LYRA2REV2_8WAY)
+   l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 );   // 2 way
+   init_lyra2rev2_8way_ctx();;
+#elif defined (LYRA2REV2_4WAY)
   l2v2_wholeMatrix = _mm_malloc( size, 64 );
-#if defined (LYRA2REV2_4WAY)
   init_lyra2rev2_4way_ctx();;
 #else
+   l2v2_wholeMatrix = _mm_malloc( size, 64 );
   init_lyra2rev2_ctx();
 #endif
   return l2v2_wholeMatrix;
@@ -106,14 +109,17 @@ bool lyra2rev2_thread_init()

 bool register_lyra2rev2_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV2_4WAY)
+#if defined (LYRA2REV2_8WAY)
+  gate->scanhash  = (void*)&scanhash_lyra2rev2_8way;
+  gate->hash      = (void*)&lyra2rev2_8way_hash;
+#elif defined (LYRA2REV2_4WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev2_4way;
  gate->hash      = (void*)&lyra2rev2_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_lyra2rev2;
  gate->hash      = (void*)&lyra2rev2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  opt_target_factor = 256.0;
  return true;
@@ -123,7 +129,11 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )

 bool register_lyra2z_algo( algo_gate_t* gate )
 {
-#if defined(LYRA2Z_8WAY)
+#if defined(LYRA2Z_16WAY)
+  gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2z_16way;
+  gate->hash       = (void*)&lyra2z_16way_hash;
+#elif defined(LYRA2Z_8WAY)
  gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_8way;
  gate->hash       = (void*)&lyra2z_8way_hash;
@@ -136,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2z;
  gate->hash       = (void*)&lyra2z_hash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
  opt_target_factor = 256.0;
  return true;
 };
@@ -164,7 +174,11 @@ bool register_lyra2h_algo( algo_gate_t* gate )

 bool register_allium_algo( algo_gate_t* gate )
 {
-#if defined (ALLIUM_4WAY)
+#if defined (ALLIUM_8WAY)
+  gate->miner_thread_init = (void*)&init_allium_8way_ctx;
+  gate->scanhash  = (void*)&scanhash_allium_8way;
+  gate->hash      = (void*)&allium_8way_hash;
+#elif defined (ALLIUM_4WAY)
  gate->miner_thread_init = (void*)&init_allium_4way_ctx;
  gate->scanhash  = (void*)&scanhash_allium_4way;
  gate->hash      = (void*)&allium_4way_hash;
@@ -173,7 +187,7 @@ bool register_allium_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_allium;
  gate->hash      = (void*)&allium_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
  opt_target_factor = 256.0;
  return true;
 };
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -5,10 +5,10 @@
 #include <stdint.h>
 #include "lyra2.h"

-//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-//  #define LYRA2REV3_16WAY 1
-//#elif defined(__AVX2__)
-#if defined(__AVX2__)
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define LYRA2REV3_16WAY 1
+#elif defined(__AVX2__)
  #define LYRA2REV3_8WAY 1
 #elif defined(__SSE2__)
  #define LYRA2REV3_4WAY 1
@@ -50,15 +50,24 @@ bool init_lyra2rev3_ctx();

 //////////////////////////////////

-#if defined(__AVX2__)
-  #define LYRA2REV2_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define LYRA2REV2_8WAY 1
+#elif defined(__AVX2__)
+  #define LYRA2REV2_4WAY 1
 #endif

 extern __thread uint64_t* l2v2_wholeMatrix;

 bool register_lyra2rev2_algo( algo_gate_t* gate );

-#if defined(LYRA2REV2_4WAY)
+#if defined(LYRA2REV2_8WAY)
+
+void lyra2rev2_8way_hash( void *state, const void *input );
+int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr );
+bool init_lyra2rev2_8way_ctx();
+
+#elif defined(LYRA2REV2_4WAY)

 void lyra2rev2_4way_hash( void *state, const void *input );
 int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
@@ -76,17 +85,25 @@ bool init_lyra2rev2_ctx();

 /////////////////////////

-#if defined(__SSE2__)
-  #define LYRA2Z_4WAY
-#endif
-#if defined(__AVX2__)
-  #define LYRA2Z_8WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define LYRA2Z_16WAY 1
+#elif defined(__AVX2__)
+  #define LYRA2Z_8WAY 1
+#elif defined(__SSE2__)
+  #define LYRA2Z_4WAY 1
 #endif


 #define LYRA2Z_MATRIX_SIZE  BLOCK_LEN_INT64 * 8 * 8 * 8

-#if defined(LYRA2Z_8WAY)
+#if defined(LYRA2Z_16WAY)
+
+void lyra2z_16way_hash( void *state, const void *input );
+int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+bool lyra2z_16way_thread_init();
+
+#elif defined(LYRA2Z_8WAY)

 void lyra2z_8way_hash( void *state, const void *input );
 int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
@@ -135,13 +152,22 @@ bool lyra2h_thread_init();

 //////////////////////////////////

-#if defined(__AVX2__) && defined(__AES__)
-  #define ALLIUM_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define ALLIUM_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define ALLIUM_4WAY 1
 #endif

 bool register_allium_algo( algo_gate_t* gate );

-#if defined(ALLIUM_4WAY)
+#if defined(ALLIUM_8WAY)
+
+void allium_8way_hash( void *state, const void *input );
+int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+bool init_allium_8way_ctx();
+
+#elif defined(ALLIUM_4WAY)

 void allium_4way_hash( void *state, const void *input );
 int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
--- a/algo/lyra2/lyra2-hash-2way.c
+++ b/algo/lyra2/lyra2-hash-2way.c
@@ -0,0 +1,578 @@
+/**
+ * Implementation of the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <mm_malloc.h>
+#include "compat.h"
+#include "lyra2.h"
+#include "sponge.h"
+
+//  LYRA2RE 8 cols 8 rows used by lyra2re, allium, phi2, x22i, x25x, 
+//  dynamic matrix allocation.
+//
+//  LYRA2REV2 4 cols 4 rows used by lyra2rev2 and x21s, static matrix
+//  allocation.
+//
+//  LYRA2REV3 4 cols 4 rows with an extra twist in calculating
+//  rowa in the wandering phase. Used by lyra2rev3. Static matrix
+//  allocation.
+// 
+//  LYRA2Z various cols & rows and supports 80 byte input. Used by lyra2z,
+//  lyra2z330, lyra2h, 
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+/**
+ * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
+ * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
+ * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
+ * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
+ * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
+ *
+ * @param K The derived key to be output by the algorithm
+ * @param kLen Desired key length
+ * @param pwd User password
+ * @param pwdlen Password length
+ * @param salt Salt (not an argument of the 2 way functions, pwd is used)
+ * @param saltlen Salt length (not an argument, pwdlen is used)
+ * @param timeCost Parameter to determine the processing time (T)
+ * @param nRows Number or rows of the memory matrix (R)
+ * @param nCols Number of columns of the memory matrix (C)
+ *
+ * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
+ */
+
+// For lyra2rev3.
+// convert a simple offset to an index into 2x4 u64 interleaved data.
+// good for state and 4 row matrix. 
+// index = ( int( ( off mod 16 ) / 4 ) * 8 ) + ( off mod 4 )
+
+#define offset_to_index( o ) \
+   ( ( ( (uint64_t)( (o) & 0xf) / 4 ) * 8 ) + ( (o) % 4 ) )
+
+
+// Lyra2REv2 on two inputs at once; data is presumably 2x256 bit interleaved
+// like LYRA2REV3_2WAY. pwd carries both lanes (2*pwdlen bytes) and is also
+// used as the salt. wholeMatrix is caller supplied working memory, K is
+// filled by squeeze_2way( state, K, kLen ). Always returns 0.
+int LYRA2REV2_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
+             const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
+             const uint64_t nRows, const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[32];
+   int64_t row = 2;
+   int64_t prev = 1;
+   int64_t rowa0 = 0;
+   int64_t rowa1 = 0;
+   int64_t tau; 
+   int64_t step = 1;
+   int64_t window = 2;
+   int64_t gap = 1;
+   //====================================================================/
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+   uint64_t *ptrWord = wholeMatrix;
+
+   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   uint64_t *ptr = wholeMatrix;
+   uint64_t *pw = (uint64_t*)pwd;
+
+   memcpy( ptr, pw, 2*pwdlen ); // password, both lanes
+   ptr += pwdlen>>2;            // 2*pwdlen bytes counted in 64 bit words
+   memcpy( ptr, pw, 2*pwdlen ); // salt, same data as the password
+   ptr += pwdlen>>2;
+
+   // now build the rest interleaving on the fly, every value is stored twice, 4 words apart.
+   ptr[0] = ptr[ 4] = kLen;
+   ptr[1] = ptr[ 5] = pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
+   ptr[3] = ptr[ 7] = timeCost;
+   ptr[8] = ptr[12] = nRows;
+   ptr[9] = ptr[13] = nCols;
+   ptr[10] = ptr[14] = 0x80;
+   ptr[11] = ptr[15] = 0x0100000000000000;
+
+   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
+
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
+
+   reducedDuplexRow1_2way( state, &wholeMatrix[0],
+                           &wholeMatrix[ 2 * ROW_LEN_INT64 ],  nCols );
+
+   do
+   {
+     //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+     reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
+                                        &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
+                                        &wholeMatrix[ 2* row*ROW_LEN_INT64],
+                                        nCols );
+
+     rowa0 = (rowa0 + step) & (window - 1);
+
+     prev = row;
+     row++;
+
+     if ( rowa0 == 0 )
+     {
+        step = window + gap;
+        window *= 2; 
+        gap = -gap;
+     }
+   } while ( row < nRows );
+
+   //===================== Wandering Phase =============================//
+   row = 0;
+   for ( tau = 1; tau <= timeCost; tau++ )
+   {
+      step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
+      do
+      {
+        rowa0 = state[ 0 ] & (unsigned int)(nRows-1);   // presumably lane 0
+        rowa1 = state[ 4 ] & (unsigned int)(nRows-1);   // presumably lane 1
+        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
+                                      nCols );
+         prev = row;
+
+         row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+
+      } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
+                            &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
+   //Squeezes the key
+   squeeze_2way( state, K, (unsigned int) kLen );
+
+   return 0;
+}
+
+// This version is currently only used by REv3 and has some hard coding
+// specific to v3 such as input data size of 32 bytes.
+//
+// Similarly with REv2. The difference with REv3 isn't clear and maybe
+// they can be merged.
+//
+// RE is used by RE, allium. The main difference between RE and REv2
+// is the matrix size.
+//
+// Z also needs to support 80 byte input as well as 32 byte, and odd
+// matrix sizes like 330 rows. It is used by lyra2z330, lyra2z, lyra2h.
+
+
+/////////////////////////////////////////////////
+
+// LYRA2REV3_2WAY: 2 way 256 version of LYRA2REV3, two lanes per call.
+// The salt and salt len arguments are dropped (the password doubles as the
+// salt) and some other values are hard coded. Data is interleaved 2x256.
+// The key is squeezed into K; the function always returns 0.
+int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
+                    const void *pwd, uint64_t pwdlen, uint64_t timeCost,
+                    uint64_t nRows, uint64_t nCols )
+
+// hard coded for 32 byte input as well as matrix size.
+// Other required versions include 80 byte input and different block
+// sizes.
+
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[32];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa0 = 0; //row* index taken from state[] (lane 0 words)
+   int64_t rowa1 = 0; //row* index taken from state+4 (lane 1 words)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   uint64_t instance0 = 0; //chained state index, read from state[]
+   uint64_t instance1 = 0; //chained state index, read from state+4
+   //====================================================================/
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
+
+   uint64_t *ptrWord = wholeMatrix;
+
+//  2 way 256 rewrite. Salt always == password, and data is interleaved,
+//  need to build in parallel as pw is already interleaved.
+
+   
+//  {   password,    (64 or 80 bytes)
+//      salt,        (64 or 80 bytes) =  same as password
+//      Klen,        (u64)  = 32 bytes
+//      pwdlen,      (u64)
+//      saltlen,     (u64)
+//      timecost,    (u64)
+//      nrows,       (u64)
+//      ncols,       (u64)
+//      0x80,        (byte)
+//      { 0 .. 0 },
+//      1            (byte)
+//   }
+   
+// input is usually 32 maybe 64, both are aligned to 256 bit vector.
+// 80 byte input is not aligned, complicating matters for lyra2z.
+
+   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+   
+   uint64_t *ptr = wholeMatrix;
+   uint64_t *pw = (uint64_t*)pwd;
+
+   memcpy( ptr, pw, 2*pwdlen ); // password, both lanes (pwd is already interleaved)
+   ptr += pwdlen>>2; // 2*pwdlen bytes == pwdlen>>2 uint64_t words
+   memcpy( ptr, pw, 2*pwdlen ); // salt == password, both lanes
+   ptr += pwdlen>>2;
+ 
+   // now build the rest interleaving on the fly.
+   // Each value is stored twice, 4 words apart, once per lane.
+   ptr[0] = ptr[ 4] = kLen;
+   ptr[1] = ptr[ 5] = pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
+   ptr[3] = ptr[ 7] = timeCost;
+   ptr[8] = ptr[12] = nRows;
+   ptr[9] = ptr[13] = nCols;
+   ptr[10] = ptr[14] = 0x80;
+   ptr[11] = ptr[15] = 0x0100000000000000; // the trailing 1 byte of the layout above (top byte of the word)
+
+   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
+
+   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
+
+   reducedDuplexRow1_2way( state, &wholeMatrix[0],
+                           &wholeMatrix[2*ROW_LEN_INT64],  nCols );
+   // Setup phase: rows 2 .. nRows-1. Row offsets are doubled for the 2 lanes.
+   do
+   {
+
+      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
+                                         nCols );
+
+      rowa0 = (rowa0 + step) & (window - 1);
+
+      prev = row;
+      row++;
+
+      if (rowa0 == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+   // Wandering phase: timeCost passes, each starting and ending at row 0.
+   row = 0;
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+      step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
+      do
+      {
+        instance0 = state[ offset_to_index( instance0 ) ]; // chain the index through state
+        instance1 = (&state[4])[ offset_to_index( instance1 ) ]; // same, relative to state+4
+
+        rowa0 = state[ offset_to_index( instance0 )  ]
+                & (unsigned int)(nRows-1);
+        rowa1 = (state+4)[ offset_to_index( instance1 ) ]
+                & (unsigned int)(nRows-1);
+
+        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
+                                      nCols );
+
+        prev = row;
+        row = (row + step) & (unsigned int)(nRows-1); //(masking requires nRows to be a power of 2)
+
+       } while ( row != 0 );
+   }
+   // Wrap-up phase: absorbs the rows last selected by rowa0 and rowa1.
+   absorbBlock_2way( state, &wholeMatrix[2*rowa0*ROW_LEN_INT64],
+                            &wholeMatrix[2*rowa1*ROW_LEN_INT64] );
+   // Squeezes the key
+   squeeze_2way( state, K, (unsigned int) kLen );
+
+   return 0;
+}
+
+//////////////////////////////////////////////////
+// LYRA2Z_2WAY: 2 way Lyra2Z on a caller supplied wholeMatrix; reduces row indexes with % nRows. Always returns 0.
+int LYRA2Z_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
+               const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
+               const uint64_t nRows, const uint64_t nCols )
+{
+    //========================== Basic variables ============================//
+    uint64_t _ALIGN(256) state[32];
+    int64_t row = 2; //index of row to be processed
+    int64_t prev = 1; //index of prev (last row ever computed/modified)
+    int64_t rowa0 = 0; //row* index taken from state[0]
+    int64_t rowa1 = 0; //row* index taken from state[4]
+    int64_t tau; //Time Loop iterator
+    int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+    int64_t window = 2; //Visitation window used during Setup
+    int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+    //=======================================================================/
+
+    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+
+    //Number of blocks holding the password, salt, basil and padding (nothing is zeroed here)
+    uint64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 *
+                       sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   uint64_t *ptr = wholeMatrix;
+   uint64_t *pw = (uint64_t*)pwd;
+
+   memcpy( ptr, pw, 2*pwdlen ); // password, 2*pwdlen bytes (presumably both lanes interleaved; confirm with caller)
+   ptr += pwdlen>>2; // 2*pwdlen bytes == pwdlen>>2 uint64_t words
+   memcpy( ptr, pw, 2*pwdlen ); // second copy of the same data, used as the salt
+   ptr += pwdlen>>2;
+
+   // now build the rest interleaving on the fly.
+   ptr[0] = ptr[ 4] = kLen;
+   ptr[1] = ptr[ 5] = pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
+   ptr[3] = ptr[ 7] = timeCost;
+   ptr[8] = ptr[12] = nRows;
+   ptr[9] = ptr[13] = nCols;
+   ptr[10] = ptr[14] = 0x80;
+   ptr[11] = ptr[15] = 0x0100000000000000;
+
+   uint64_t *ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput,
+                               BLOCK_LEN_BLAKE2_SAFE_INT64 );
+
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
+
+   reducedDuplexRow1_2way( state, &wholeMatrix[0],
+                           &wholeMatrix[ 2 * ROW_LEN_INT64 ],  nCols );
+   // Setup phase: rows 2 .. nRows-1. Row offsets are doubled for the 2 lanes.
+   do
+   {
+     //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+     reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
+                                        &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
+                                        &wholeMatrix[ 2* row*ROW_LEN_INT64],
+                                        nCols );
+
+     rowa0 = (rowa0 + step) & (window - 1);
+     prev = row;
+     row++;
+
+     if ( rowa0 == 0 )
+     {
+        step = window + gap; //changes the step: approximately doubles its value
+        window *= 2; //doubles the size of the re-visitation window
+        gap = -gap; //inverts the modifier to the step
+     }
+   } while ( row < nRows );
+   // Wandering phase: timeCost passes, each starting and ending at row 0.
+   row = 0;
+   for ( tau = 1; tau <= timeCost; tau++ )
+   {
+        step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+      do
+      {
+        rowa0 = state[ 0 ] % nRows;
+        rowa1 = state[ 4 ] % nRows;
+
+        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
+                                      nCols );
+
+        prev = row;
+        row = (row + step) % nRows; // NOTE(review): nRows is unsigned, so a negative row+step is converted before % - confirm intended
+
+      } while (row != 0);
+   }
+   // Wrap-up phase: absorbs the rows last selected by rowa0 and rowa1.
+   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
+                            &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
+
+   //Squeezes the key
+   squeeze_2way( state, K, (unsigned int) kLen );
+
+   return 0;
+}
+
+////////////////////////////////////////////////////
+// LYRA2RE_2WAY: 2 way Lyra2RE. Allocates, zeroes and frees its own matrix on every call; returns -1 if that allocation fails, else 0.
+// Lyra2RE doesn't like the new wholeMatrix implementation
+int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
+                  const uint64_t pwdlen, const uint64_t timeCost,
+                  const uint64_t nRows, const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[32];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa0 = 0; //row* index taken from state[0]
+   int64_t rowa1 = 0; //row* index taken from state[4]
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t i; //byte count used to size the matrix (not a loop counter here)
+   //====================================================================/
+
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //Tries to allocate enough space for the whole memory matrix
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+
+   i = (int64_t)ROW_LEN_BYTES * nRows; // nRows rows of ROW_LEN_BYTES; twice this is allocated below
+   uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
+   if (wholeMatrix == NULL)
+      return -1;
+   // presumably the count is in 64 byte vectors: 2*i bytes / 64 == i>>5 (confirm against memset_zero_512)
+   memset_zero_512( (__m512i*)wholeMatrix, i>>5 );
+
+   uint64_t *ptrWord = wholeMatrix;
+   uint64_t *pw = (uint64_t*)pwd;
+
+   //Number of blocks needed for the password, salt, basil and padding
+   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   uint64_t *ptr = wholeMatrix;
+
+   memcpy( ptr, pw, 2*pwdlen ); // password, 2*pwdlen bytes (presumably both lanes interleaved; confirm with caller)
+   ptr += pwdlen>>2; // 2*pwdlen bytes == pwdlen>>2 uint64_t words
+   memcpy( ptr, pw, 2*pwdlen ); // second copy of the same data, used as the salt
+   ptr += pwdlen>>2;
+
+   // now build the rest interleaving on the fly.
+
+   ptr[0] = ptr[ 4] = kLen;
+   ptr[1] = ptr[ 5] = pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
+   ptr[3] = ptr[ 7] = timeCost;
+   ptr[8] = ptr[12] = nRows;
+   ptr[9] = ptr[13] = nCols;
+   ptr[10] = ptr[14] = 0x80;
+   ptr[11] = ptr[15] = 0x0100000000000000;
+
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
+
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+
+   reducedDuplexRow1_2way( state, &wholeMatrix[0],
+                                  &wholeMatrix[ 2 * ROW_LEN_INT64], nCols );
+   //===================== Setup Phase (rows 2 .. nRows-1) =============//
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
+                                         nCols );
+
+      //updates the value of row* (deterministically picked during Setup)
+      rowa0 = (rowa0 + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa0 == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+      step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
+      do
+      {
+        rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
+        rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
+
+        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
+                                      nCols );
+
+           //update prev: it now points to the last row ever computed
+           prev = row;
+
+           //updates row: goes to the next row to be computed
+           //----------------------------------------------------
+           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //----------------------------------------------------
+
+       } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
+                            &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
+   //Squeezes the key
+   squeeze_2way( state, K, (unsigned int) kLen );
+
+   //================== Freeing the memory =============================//
+   _mm_free(wholeMatrix);
+
+   return 0;
+}
+
+#endif
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -327,7 +327,6 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,

   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
                      nCols);
-
   do
   {
      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -62,12 +62,17 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
-               uint64_t pwdlen, const void *salt, uint64_t saltlen,
-               uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+                  uint64_t timeCost, uint64_t nRows, uint64_t nCols );

-//int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
-//        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
+        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
+int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
+        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
+int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
+          uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );

 #endif

--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -1,13 +1,150 @@
 #include "lyra2-gate.h"
 #include <memory.h>
-
-#if defined (LYRA2REV2_4WAY)	
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 
+#include "algo/cubehash/cube-hash-2way.h"
+
+#if defined (LYRA2REV2_8WAY)
+// Hash contexts used by one lyra2rev2_8way_hash() call, grouped so they can be copied with a single memcpy.
+typedef struct {
+   blake256_8way_context     blake;
+   keccak256_8way_context    keccak;
+   cube_4way_context          cube;   // 4 way context; the hash re-initialises it for each group of 4 lanes
+   skein256_8way_context     skein;
+   bmw256_8way_context          bmw;
+} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
+// Prepared contexts: filled by init_lyra2rev2_8way_ctx(); the blake member is set in scanhash_lyra2rev2_8way().
+static lyra2v2_8way_ctx_holder l2v2_8way_ctx; // NOTE(review): not __thread, unlike l2v3_16way_ctx in lyra2rev3-4way.c - confirm
+// Initialises the keccak, cube, skein and bmw members of l2v2_8way_ctx (blake is not touched here). Always returns true.
+bool init_lyra2rev2_8way_ctx()
+{
+   keccak256_8way_init( &l2v2_8way_ctx.keccak );
+   cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 );
+   skein256_8way_init( &l2v2_8way_ctx.skein );
+   bmw256_8way_init( &l2v2_8way_ctx.bmw );
+   return true;
+}
+// Hashes 8 lanes: blake256, keccak256, cubehash, Lyra2REv2, skein256, cubehash, bmw256. The bmw256 output is written to state.
+void lyra2rev2_8way_hash( void *state, const void *input )
+{
+   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
+   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
+   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (64)));
+   uint32_t hash2[8] __attribute__ ((aligned (64)));
+   uint32_t hash3[8] __attribute__ ((aligned (64)));
+   uint32_t hash4[8] __attribute__ ((aligned (64)));
+   uint32_t hash5[8] __attribute__ ((aligned (64)));
+   uint32_t hash6[8] __attribute__ ((aligned (64)));
+   uint32_t hash7[8] __attribute__ ((aligned (64)));
+   lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) ); // local copy of the prepared contexts
+   // blake256: scanhash already fed 64 bytes per lane into the copied context; add 16 more from offset 64*8
+   blake256_8way( &ctx.blake, input + (64<<3), 16 ); // NOTE(review): scanhash calls blake256_8way_update; confirm blake256_8way is the same function
+   blake256_8way_close( &ctx.blake, vhash );
+   // keccak256, after re-interleaving 8x32 -> 8x64
+   rintrlv_8x32_8x64( vhashA, vhash, 256 );
+
+   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
+   keccak256_8way_close( &ctx.keccak, vhash );
+   // cubehash on two 4x128 halves (vhashA, vhashB); ctx.cube is re-initialised before the second half
+   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
+
+   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+
+   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
+   // Lyra2REv2 two lanes per call: interleave a pair 2x256, hash in place in vhash, de-interleave
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   // skein256 on all 8 lanes, interleaved 8x64
+   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                hash7, 256 );
+
+   skein256_8way_update( &ctx.skein, vhash, 32 );
+   skein256_8way_close( &ctx.skein, vhash );
+   // second cubehash pass; ctx.cube was used above so it is re-initialised before each half
+   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
+
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+   
+   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
+   // bmw256 on all 8 lanes, interleaved 8x32; its output goes straight to the caller's buffer
+   intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, 
+                hash7, 256 );
+
+   bmw256_8way_update( &ctx.bmw, vhash, 32 );
+   bmw256_8way_close( &ctx.bmw, state );
+}
+// Scans nonces 8 at a time starting at work->data[19], submitting every lane that passes fulltest. Always returns 0.
+int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<3]);   // word 7 of each lane's hash (8x32 interleaved)
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;   // NOTE(review): wraps if max_nonce < 8 - confirm callers never pass that
+   uint32_t n = first_nonce;
+   const uint32_t Htarg = ptarget[7];
+   __m256i *noncev = (__m256i*)vdata + 19;   // aligned; vector 19 holds word 19 (the nonce) of all 8 lanes
+   int thr_id = mythr->id; 
+
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   // byte swap and interleave the 80 byte header for 8 lanes
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
+   // blake midstate over the first 64 bytes, stored in the file scope context that the hash function copies
+   blake256_8way_init( &l2v2_8way_ctx.blake );
+   blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
+
+   do
+   {
+      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
+                                                  n+3, n+2, n+1, n ) );
+
+      lyra2rev2_8way_hash( hash, vdata );
+      pdata[19] = n;
+      // cheap test on word 7 first; only candidate lanes are extracted and fully tested
+      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+            pdata[19] = n + lane;
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 8;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (LYRA2REV2_4WAY)

 typedef struct {
   blake256_4way_context     blake;
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -4,8 +4,180 @@
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 
+#include "algo/cubehash/cube-hash-2way.h"

-#if defined (LYRA2REV3_8WAY)
+#if defined (LYRA2REV3_16WAY)
+// Hash contexts used by one lyra2rev3_16way_hash() call, grouped so they can be copied with a single memcpy.
+typedef struct {
+   blake256_16way_context     blake;
+   cube_4way_context          cube;   // 4 way context; the hash re-initialises it for each later group of 4 lanes
+   bmw256_16way_context       bmw;
+} lyra2v3_16way_ctx_holder;
+// Per thread prepared contexts: set by init_lyra2rev3_16way_ctx(); blake is redone in scanhash_lyra2rev3_16way().
+static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
+// Initialises the blake, cube and bmw members of the calling thread's l2v3_16way_ctx. Always returns true.
+bool init_lyra2rev3_16way_ctx()
+{
+   blake256_16way_init( &l2v3_16way_ctx.blake );
+   cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
+   bmw256_16way_init( &l2v3_16way_ctx.bmw );
+   return true;
+}
+// Hashes 16 lanes: blake256, Lyra2REv3, cubehash, Lyra2REv3, bmw256. The bmw256 output is written to state.
+void lyra2rev3_16way_hash( void *state, const void *input )
+{
+   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (64)));
+   uint32_t hash2[8] __attribute__ ((aligned (64)));
+   uint32_t hash3[8] __attribute__ ((aligned (64)));
+   uint32_t hash4[8] __attribute__ ((aligned (64)));
+   uint32_t hash5[8] __attribute__ ((aligned (64)));
+   uint32_t hash6[8] __attribute__ ((aligned (64)));
+   uint32_t hash7[8] __attribute__ ((aligned (64)));
+   uint32_t hash8[8] __attribute__ ((aligned (64)));
+   uint32_t hash9[8] __attribute__ ((aligned (64)));
+   uint32_t hash10[8] __attribute__ ((aligned (64)));
+   uint32_t hash11[8] __attribute__ ((aligned (64)));
+   uint32_t hash12[8] __attribute__ ((aligned (64)));
+   uint32_t hash13[8] __attribute__ ((aligned (64)));
+   uint32_t hash14[8] __attribute__ ((aligned (64)));
+   uint32_t hash15[8] __attribute__ ((aligned (64)));
+   lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) ); // local copy of the prepared contexts
+   // blake256: scanhash already fed 64 bytes per lane into the copied context; add 16 more from offset 64*16
+   blake256_16way_update( &ctx.blake, input + (64*16), 16 );
+   blake256_16way_close( &ctx.blake, vhash );
+   // split the 16x32 interleaved result into one 256 bit hash per lane
+   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+           hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
+           vhash, 256 );
+   // first Lyra2REv3 pass, two lanes per call: interleave a pair 2x256, hash in place, de-interleave
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   intrlv_2x256( vhash, hash8, hash9, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash8, hash9, vhash, 256 );
+   intrlv_2x256( vhash, hash10, hash11, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash10, hash11, vhash, 256 );
+   intrlv_2x256( vhash, hash12, hash13, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash12, hash13, vhash, 256 );
+   intrlv_2x256( vhash, hash14, hash15, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash14, hash15, vhash, 256 );
+   // cubehash, four lanes per call; the copied ctx.cube serves the first group and is re-initialised for the rest
+   intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
+   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
+   intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
+   intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
+   intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );
+   // second Lyra2REv3 pass, same pairing as the first
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   intrlv_2x256( vhash, hash8, hash9, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash8, hash9, vhash, 256 );
+   intrlv_2x256( vhash, hash10, hash11, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash10, hash11, vhash, 256 );
+   intrlv_2x256( vhash, hash12, hash13, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash12, hash13, vhash, 256 );
+   intrlv_2x256( vhash, hash14, hash15, 256 );
+   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash14, hash15, vhash, 256 );
+   // bmw256 on all 16 lanes, interleaved 16x32; its output goes straight to the caller's buffer
+   intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+             hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
+             hash15, 256 );
+
+   bmw256_16way_update( &ctx.bmw, vhash, 32 );
+   bmw256_16way_close( &ctx.bmw, state );
+}
+
+// Scans nonces 16 at a time starting at work->data[19], submitting every lane that passes fulltest. Always returns 0.
+int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &hash[7<<4];   // word 7 of each lane's hash (16x32 interleaved)
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 16;   // NOTE(review): wraps if max_nonce < 16 - confirm callers never pass that
+   const uint32_t Htarg = ptarget[7];
+   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned; vector 19 holds word 19 (the nonce) of all 16 lanes
+   const int thr_id = mythr->id;
+   // benchmark mode overwrites word 7 of the target through a cast that drops const
+   if ( opt_benchmark )  ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   // byte swap and interleave the 80 byte header for 16 lanes
+   mm512_bswap32_intrlv80_16x32( vdata, pdata );
+   // blake midstate over the first 64 bytes, stored in the thread local context that the hash function copies
+   blake256_16way_init( &l2v3_16way_ctx.blake );
+   blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
+
+   do
+   {
+      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
+                                                  n+11, n+10, n+ 9, n+ 8,
+                                                  n+ 7, n+ 6, n+ 5, n+ 4,
+                                                  n+ 3, n+ 2, n+ 1, n ) );
+
+      lyra2rev3_16way_hash( hash, vdata );
+      pdata[19] = n;
+      // cheap test on word 7 first; only candidate lanes are extracted and fully tested
+      for ( int lane = 0; lane < 16; lane++ )
+      if ( unlikely( hash7[lane] <= Htarg ) )
+      {
+         extr_lane_16x32( lane_hash, hash, lane, 256 );
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         {
+             pdata[19] = n + lane;
+             submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 16;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (LYRA2REV3_8WAY)

 typedef struct {
   blake256_8way_context     blake;
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -1,13 +1,240 @@
 #include "lyra2-gate.h"
-
-#ifdef LYRA2Z_4WAY
-
 #include <memory.h>
 #include <mm_malloc.h>
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"
 #include "algo/blake/blake-hash-4way.h"

+#if defined(LYRA2Z_16WAY)
+
+// Per-thread Lyra2 matrix used by lyra2z_16way_hash().
+__thread uint64_t* lyra2z_16way_matrix;
+
+// Allocates this thread's matrix: 2*LYRA2Z_MATRIX_SIZE bytes, 64-byte aligned.
+// Returns true when the allocation succeeded, false otherwise.
+bool lyra2z_16way_thread_init()
+{
+   uint64_t *matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 );
+
+   lyra2z_16way_matrix = matrix;
+   return matrix != NULL;
+}
+
+// Thread-local BLAKE-256 context that lyra2z_16way_hash() copies from.
+static __thread blake256_16way_context l2z_16way_blake_mid;
+
+// Re-initialises the thread-local midstate context and feeds it `input`
+// with a length argument of 64.
+void lyra2z_16way_midstate( const void* input )
+{
+   blake256_16way_context *mid = &l2z_16way_blake_mid;
+
+   blake256_16way_init( mid );
+   blake256_16way_update( mid, input, 64 );
+}
+
+// Computes 16 lyra2z hashes in one call.
+//   state - out: 16 consecutive 32-byte hashes, lane k at byte offset 32*k.
+//   input - interleaved input; the data at byte offset 64*16 is fed to BLAKE
+//           on top of the saved midstate.
+//           NOTE(review): presumably the 16x32 interleaved 80-byte header
+//           built by the scanhash routine - confirm against callers.
+// Requires lyra2z_16way_midstate() to have filled l2z_16way_blake_mid and
+// lyra2z_16way_thread_init() to have allocated lyra2z_16way_matrix.
+void lyra2z_16way_hash( void *state, const void *input )
+{
+    uint32_t vhash[8*16] __attribute__ ((aligned (128)));
+    uint32_t hash0[8] __attribute__ ((aligned (64)));
+    uint32_t hash1[8] __attribute__ ((aligned (64)));
+    uint32_t hash2[8] __attribute__ ((aligned (64)));
+    uint32_t hash3[8] __attribute__ ((aligned (64)));
+    uint32_t hash4[8] __attribute__ ((aligned (64)));
+    uint32_t hash5[8] __attribute__ ((aligned (64)));
+    uint32_t hash6[8] __attribute__ ((aligned (64)));
+    uint32_t hash7[8] __attribute__ ((aligned (64)));
+    uint32_t hash8[8] __attribute__ ((aligned (64)));
+    uint32_t hash9[8] __attribute__ ((aligned (64)));
+    uint32_t hash10[8] __attribute__ ((aligned (64)));
+    uint32_t hash11[8] __attribute__ ((aligned (64)));
+    uint32_t hash12[8] __attribute__ ((aligned (64)));
+    uint32_t hash13[8] __attribute__ ((aligned (64)));
+    uint32_t hash14[8] __attribute__ ((aligned (64)));
+    uint32_t hash15[8] __attribute__ ((aligned (64)));
+    blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
+
+    // Resume from the saved midstate, feed the remaining data, finalise.
+    memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
+    blake256_16way_update( &ctx_blake, input + (64*16), 16 );
+    blake256_16way_close( &ctx_blake, vhash );
+
+    // Split the 16-way interleaved BLAKE output into one buffer per lane.
+    dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+              hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
+               vhash, 256 );
+
+    // Lyra2 runs two lanes at a time: interleave a pair into vhash, hash it
+    // in place with the shared per-thread matrix, then split the pair again.
+    intrlv_2x256( vhash, hash0, hash1, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash0, hash1, vhash, 256 );
+    intrlv_2x256( vhash, hash2, hash3, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash2, hash3, vhash, 256 );
+    intrlv_2x256( vhash, hash4, hash5, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash4, hash5, vhash, 256 );
+    intrlv_2x256( vhash, hash6, hash7, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash6, hash7, vhash, 256 );
+    intrlv_2x256( vhash, hash8, hash9, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash8, hash9, vhash, 256 );
+    intrlv_2x256( vhash, hash10, hash11, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash10, hash11, vhash, 256 );
+    intrlv_2x256( vhash, hash12, hash13, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash12, hash13, vhash, 256 );
+    intrlv_2x256( vhash, hash14, hash15, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash14, hash15, vhash, 256 );
+   
+    // Output is linear, not interleaved: 32 bytes per lane, back to back.
+    memcpy( state,     hash0, 32 );
+    memcpy( state+ 32, hash1, 32 );
+    memcpy( state+ 64, hash2, 32 );
+    memcpy( state+ 96, hash3, 32 );
+    memcpy( state+128, hash4, 32 );
+    memcpy( state+160, hash5, 32 );
+    memcpy( state+192, hash6, 32 );
+    memcpy( state+224, hash7, 32 );
+    memcpy( state+256, hash8, 32 );
+    memcpy( state+288, hash9, 32 );
+    memcpy( state+320, hash10, 32 );
+    memcpy( state+352, hash11, 32 );
+    memcpy( state+384, hash12, 32 );
+    memcpy( state+416, hash13, 32 );
+    memcpy( state+448, hash14, 32 );
+    memcpy( state+480, hash15, 32 );
+}
+
+// Scans nonces for lyra2z, 16 at a time.
+//   work        - job being scanned; work->data[19] supplies the first nonce
+//                 and is updated with the nonce under test, work->target is
+//                 the target compared against.
+//   max_nonce   - the loop continues while the nonce counter is below
+//                 max_nonce - 16.
+//   hashes_done - out: exact number of nonces hashed by this call.
+//   mythr       - calling thread; its id selects the work_restart flag polled.
+// Always returns 0; hashes that pass fulltest() are handed to
+// submit_lane_solution().
+int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+
+   // Interleave the header for 16 lanes and absorb its first 64 bytes once;
+   // only the nonce row (row 19) changes between passes.
+   mm512_bswap32_intrlv80_16x32( vdata, pdata );
+   lyra2z_16way_midstate( vdata );
+
+   do {
+      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
+                                                  n+11, n+10, n+ 9, n+ 8,
+                                                  n+ 7, n+ 6, n+ 5, n+ 4,
+                                                  n+ 3, n+ 2, n+ 1, n ) );
+      lyra2z_16way_hash( hash, vdata );
+      pdata[19] = n;
+
+      // lyra2z_16way_hash() writes lane i's 8-word hash at hash + 8*i.
+      for ( int i = 0; i < 16; i++ )
+      {
+         uint32_t *lane_hash = hash + (i<<3);
+
+         if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget )
+              && !opt_benchmark )
+         {
+             pdata[19] = n+i;
+             submit_lane_solution( work, lane_hash, mythr, i );
+         }
+      }
+      n += 16;
+   } while ( (n < max_nonce-16) && !work_restart[thr_id].restart);
+
+   // n advances by exactly 16 per pass, so the difference is already the
+   // exact hash count; adding 1 would over-report by one hash per call.
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined(LYRA2Z_8WAY)
+
+// Per-thread Lyra2 matrix used by lyra2z_8way_hash().
+__thread uint64_t* lyra2z_8way_matrix;
+
+// Allocates this thread's matrix: LYRA2Z_MATRIX_SIZE bytes, 64-byte aligned.
+// Returns true when the allocation succeeded, false otherwise.
+bool lyra2z_8way_thread_init()
+{
+   uint64_t *matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 );
+
+   lyra2z_8way_matrix = matrix;
+   return matrix != NULL;
+}
+
+// Thread-local BLAKE-256 context that lyra2z_8way_hash() copies from.
+static __thread blake256_8way_context l2z_8way_blake_mid;
+
+// Re-initialises the thread-local midstate context and feeds it `input`
+// with a length argument of 64.
+void lyra2z_8way_midstate( const void* input )
+{
+   blake256_8way_context *mid = &l2z_8way_blake_mid;
+
+   blake256_8way_init( mid );
+   blake256_8way( mid, input, 64 );
+}
+
+// Computes 8 lyra2z hashes in one call.
+//   state - out: 8 consecutive 32-byte hashes, lane k at byte offset 32*k.
+//   input - interleaved input; the data at byte offset 64*8 is fed to BLAKE
+//           on top of the saved midstate.
+//           NOTE(review): presumably the 8x32 interleaved 80-byte header
+//           built by the scanhash routine - confirm against callers.
+// Requires lyra2z_8way_midstate() to have filled l2z_8way_blake_mid and
+// lyra2z_8way_thread_init() to have allocated lyra2z_8way_matrix.
+void lyra2z_8way_hash( void *state, const void *input )
+{
+     uint32_t hash0[8] __attribute__ ((aligned (64)));
+     uint32_t hash1[8] __attribute__ ((aligned (64)));
+     uint32_t hash2[8] __attribute__ ((aligned (64)));
+     uint32_t hash3[8] __attribute__ ((aligned (64)));
+     uint32_t hash4[8] __attribute__ ((aligned (64)));
+     uint32_t hash5[8] __attribute__ ((aligned (64)));
+     uint32_t hash6[8] __attribute__ ((aligned (64)));
+     uint32_t hash7[8] __attribute__ ((aligned (64)));
+     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
+     blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
+
+     // Resume from the saved midstate, feed the remaining data, finalise.
+     memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
+     blake256_8way( &ctx_blake, input + (64*8), 16 );
+     blake256_8way_close( &ctx_blake, vhash );
+
+     // Split the 8-way interleaved BLAKE output into one buffer per lane.
+     dintrlv_8x32( hash0, hash1, hash2, hash3,
+                   hash4, hash5, hash6, hash7, vhash, 256 );
+
+     // Lyra2 runs one lane at a time here, hashing each buffer in place and
+     // reusing the same per-thread matrix for every lane.
+     LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
+
+
+     // Output is linear, not interleaved: 32 bytes per lane, back to back.
+     memcpy( state,     hash0, 32 );
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+// Scans nonces for lyra2z, 8 at a time.
+//   work        - job being scanned; work->data[19] supplies the first nonce
+//                 and is updated with the nonce under test, work->target is
+//                 the target compared against.
+//   max_nonce   - the loop continues while the nonce counter is below
+//                 max_nonce - 8.
+//   hashes_done - out: exact number of nonces hashed by this call.
+//   mythr       - calling thread; its id selects the work_restart flag polled.
+// Always returns 0; hashes that pass fulltest() are handed to
+// submit_lane_solution().
+int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+
+   // Interleave the header for 8 lanes and absorb its first 64 bytes once;
+   // only the nonce row (row 19) changes between passes.
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
+   lyra2z_8way_midstate( vdata );
+
+   do {
+      *noncev = mm256_bswap_32(
+                 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
+      lyra2z_8way_hash( hash, vdata );
+      pdata[19] = n;
+
+      // lyra2z_8way_hash() writes lane i's 8-word hash at hash + 8*i.
+      for ( int i = 0; i < 8; i++ )
+      {
+         uint32_t *lane_hash = hash + (i<<3);
+
+         if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget )
+              && !opt_benchmark )
+         {
+             pdata[19] = n+i;
+             submit_lane_solution( work, lane_hash, mythr, i );
+         }
+      }
+      n += 8;
+   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
+
+   // n advances by exactly 8 per pass, so the difference is already the
+   // exact hash count; adding 1 would over-report by one hash per call.
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+
+#elif defined(LYRA2Z_4WAY)
+
+
 __thread uint64_t* lyra2z_4way_matrix;

 bool lyra2z_4way_thread_init()
@@ -85,100 +312,3 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,

 #endif

-#if defined(LYRA2Z_8WAY)
-
-__thread uint64_t* lyra2z_8way_matrix;
-
-bool lyra2z_8way_thread_init()
-{
- return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
-}
-
-static __thread blake256_8way_context l2z_8way_blake_mid;
-
-void lyra2z_8way_midstate( const void* input )
-{
-       blake256_8way_init( &l2z_8way_blake_mid );
-       blake256_8way( &l2z_8way_blake_mid, input, 64 );
-}
-
-void lyra2z_8way_hash( void *state, const void *input )
-{
-     uint32_t hash0[8] __attribute__ ((aligned (64)));
-     uint32_t hash1[8] __attribute__ ((aligned (64)));
-     uint32_t hash2[8] __attribute__ ((aligned (64)));
-     uint32_t hash3[8] __attribute__ ((aligned (64)));
-     uint32_t hash4[8] __attribute__ ((aligned (64)));
-     uint32_t hash5[8] __attribute__ ((aligned (64)));
-     uint32_t hash6[8] __attribute__ ((aligned (64)));
-     uint32_t hash7[8] __attribute__ ((aligned (64)));
-     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
-     blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
-
-     memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
-     blake256_8way( &ctx_blake, input + (64*8), 16 );
-     blake256_8way_close( &ctx_blake, vhash );
-
-     dintrlv_8x32( hash0, hash1, hash2, hash3,
-                   hash4, hash5, hash6, hash7, vhash, 256 );
-
-     LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
-
-     memcpy( state,     hash0, 32 );
-     memcpy( state+ 32, hash1, 32 );
-     memcpy( state+ 64, hash2, 32 );
-     memcpy( state+ 96, hash3, 32 );
-     memcpy( state+128, hash4, 32 );
-     memcpy( state+160, hash5, 32 );
-     memcpy( state+192, hash6, 32 );
-     memcpy( state+224, hash7, 32 );
-}
-
-int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t hash[8*8] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   uint32_t n = first_nonce;
-   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-   if ( opt_benchmark )
-      ptarget[7] = 0x0000ff;
-
-   mm256_bswap32_intrlv80_8x32( vdata, pdata );
-   lyra2z_8way_midstate( vdata );
-
-   do {
-      *noncev = mm256_bswap_32(
-                 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
-      lyra2z_8way_hash( hash, vdata );
-      pdata[19] = n;
-
-      for ( int i = 0; i < 8; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
-           && !opt_benchmark )
-      {
-          pdata[19] = n+i;         
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
-      }
-      n += 8;
-   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
-
-   *hashes_done = n - first_nonce + 1;
-   return 0;
-}
-
-
-#endif
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -19,7 +19,7 @@
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-#include "algo-gate-api.h"
+//#include "algo-gate.h"
 #include <string.h>
 #include <stdio.h>
 #include <time.h>
@@ -27,8 +27,7 @@
 #include "sponge.h"
 #include "lyra2.h"

-#if 0
-//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
 {
@@ -41,19 +40,26 @@ inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
    //Squeezes full blocks
    for ( i = 0; i < fullBlocks; i++ )
    {
-       memcpy_512( out, state, BLOCK_LEN_M256I*2 );
-       LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] );
-       out += BLOCK_LEN_M256I*2;
+       memcpy_512( out, state, BLOCK_LEN_M256I );
+       LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
+       out += BLOCK_LEN_M256I;
    }
    //Squeezes remaining bytes
-    memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
+    memcpy_512( out, state, len_m256i % BLOCK_LEN_M256I );
 }

-inline void absorbBlock_2way( uint64_t *State, const uint64_t *In ) 
+inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
+                                               const uint64_t *In1 ) 
 {
    register __m512i state0, state1, state2, state3;
-    __m512i *in = (__m512i*)In;
-
+    __m512i in[3];
+    casti_m256i( in, 0 ) = casti_m256i( In0, 0 );
+    casti_m256i( in, 1 ) = casti_m256i( In1, 1 );
+    casti_m256i( in, 2 ) = casti_m256i( In0, 2 );
+    casti_m256i( in, 3 ) = casti_m256i( In1, 3 );
+    casti_m256i( in, 4 ) = casti_m256i( In0, 4 );
+    casti_m256i( in, 5 ) = casti_m256i( In1, 5 );
+    
    state0 = _mm512_load_si512( (__m512i*)State     );
    state1 = _mm512_load_si512( (__m512i*)State + 1 );
    state2 = _mm512_load_si512( (__m512i*)State + 2 );
@@ -91,7 +97,7 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
    state1 = _mm512_xor_si512( state1, in[1] );

    LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
-    In += block_len * 2;
+    In += block_len*2;
  }

  _mm512_store_si512( (__m512i*)State,     state0 );
@@ -110,7 +116,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,


    register __m512i state0, state1, state2, state3;
-    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
+    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );

    state0 = _mm512_load_si512( (__m512i*)State     );
    state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -127,13 +133,13 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
    {
       _mm_prefetch( out -  9, _MM_HINT_T0 );
       _mm_prefetch( out - 11, _MM_HINT_T0 );
-                   
+
       out[0] = state0;
       out[1] = state1;
       out[2] = state2;

       //Goes to next block (column) that will receive the squeezed data
-       out -= BLOCK_LEN_M256I * 2;
+       out -= BLOCK_LEN_M256I;

       LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
    }
@@ -144,15 +150,14 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
    _mm512_store_si512( (__m512i*)State + 3, state3 );
 }

-// This function has to deal with gathering 2 256 bit rowin vectors from
-// non-contiguous memory. Extra work and performance penalty.

 inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
                 uint64_t *rowOut, uint64_t nCols )
 {
    int i;
    register __m512i state0, state1, state2, state3;
-    __m512i *in = (__m256i*)rowIn;
+    __m512i *in = (__m512i*)rowIn;
+    __m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );

    state0 = _mm512_load_si512( (__m512i*)State     );
    state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -172,28 +177,25 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
         out[2] = _mm512_xor_si512( state2, in[2] );

         //Input: next column (i.e., next block in sequence)
-         in0 += BLOCK_LEN_M256I;
-         in1 += BLOCK_LEN_M256I;
+         in += BLOCK_LEN_M256I;
         //Output: goes to previous column
-         out -= BLOCK_LEN_M256I * 2;
+         out -= BLOCK_LEN_M256I;
    }

-    _mm512_store_si256( (__m512i*)State,     state0 );
-    _mm512_store_si256( (__m512i*)State + 1, state1 );
-    _mm512_store_si256( (__m512i*)State + 2, state2 );
-    _mm512_store_si256( (__m512i*)State + 3, state3 );
-   }
+    _mm512_store_si512( (__m512i*)State,     state0 );
+    _mm512_store_si512( (__m512i*)State + 1, state1 );
+    _mm512_store_si512( (__m512i*)State + 2, state2 );
+    _mm512_store_si512( (__m512i*)State + 3, state3 );
 }

 inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
                       uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
 {
    int i;
-
    register __m512i state0, state1, state2, state3;
    __m512i* in    = (__m512i*)rowIn;
    __m512i* inout = (__m512i*)rowInOut;
-    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
+    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
    __m512i  t0, t1, t2;

    state0 = _mm512_load_si512( (__m512i*)State     );
@@ -210,7 +212,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
       state2 = _mm512_xor_si512( state2,
                                  _mm512_add_epi64( in[2], inout[2] ) );

-       LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 );
+       LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );

       out[0] = _mm512_xor_si512( state0, in[0] );
       out[1] = _mm512_xor_si512( state1, in[1] );
@@ -222,17 +224,18 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
       t2 = _mm512_permutex_epi64( state2, 0x93 );

       inout[0] = _mm512_xor_si512( inout[0],
-                                 _mm512_mask_blend_epi32( t0, t2, 0x03 ) );
+                                 _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
       inout[1] = _mm512_xor_si512( inout[1],
-                                 _mm512_mask_blend_epi32( t1, t0, 0x03 ) );
+                                 _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
       inout[2] = _mm512_xor_si512( inout[2],
-                                 _mm512_mask_blend_epi32( t2, t1, 0x03 ) );
+                                 _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
+

       //Inputs: next column (i.e., next block in sequence)
-       in    += BLOCK_LEN_M256I * 2;
-       inout += BLOCK_LEN_M256I * 2;
+       in    += BLOCK_LEN_M256I;
+       inout += BLOCK_LEN_M256I;
       //Output: goes to previous column
-       out   -= BLOCK_LEN_M256I * 2;
+       out   -= BLOCK_LEN_M256I;
    }

    _mm512_store_si512( (__m512i*)State,     state0 );
@@ -241,49 +244,61 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
    _mm512_store_si512( (__m512i*)State + 3, state3 );
 }

-inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
-                uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
-                uint64_t nCols )
+// Big ugly workaround for pointer aliasing: use a union of pointers.
+// Access the matrix using m512i for in and out, m256i for inout.
+
+inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
+                            uint64_t *rowInOut0, uint64_t *rowInOut1,
+                            uint64_t *rowOut, uint64_t nCols)
 {
   int i;
-
   register __m512i state0, state1, state2, state3;
-    __m256i *in0 = (__m256i*)rowIn0;
-    __m256i *in0 = (__m256i*)rowIn0;
-    __m2512* in    = (__m512i*)rowIn;
-    __m2512* inout = (__m512i*)rowInOut;
-    __m512i* out   = (__m512i*)rowOut;
-    __m512i  t0, t1, t2;
+   __m512i *in = (__m512i*)rowIn;
+   __m256i *inout0 = (__m256i*)rowInOut0;
+   __m256i *inout1 = (__m256i*)rowInOut1;
+   __m512i *out = (__m512i*)rowOut;
+   __m512i io[3];
+   povly inout;
+   inout.v512 = &io[0];
+    __m512i t0, t1, t2;

-    _mm_prefetch( in0,     _MM_HINT_T0 );
-    _mm_prefetch( in1,     _MM_HINT_T0 );
-    _mm_prefetch( in0 + 2, _MM_HINT_T0 );
-    _mm_prefetch( in1 + 2, _MM_HINT_T0 );
-    _mm_prefetch( in0 + 4, _MM_HINT_T0 );
-    _mm_prefetch( in1 + 4, _MM_HINT_T0 );
-    _mm_prefetch( in0 + 6, _MM_HINT_T0 );
-    _mm_prefetch( in1 + 6, _MM_HINT_T0 );
-   
   state0 = _mm512_load_si512( (__m512i*)State     );
   state1 = _mm512_load_si512( (__m512i*)State + 1 );
   state2 = _mm512_load_si512( (__m512i*)State + 2 );
   state3 = _mm512_load_si512( (__m512i*)State + 3 );
+    
+    _mm_prefetch( in,     _MM_HINT_T0 );
+    _mm_prefetch( inout0,     _MM_HINT_T0 );
+    _mm_prefetch( inout1,     _MM_HINT_T0 );
+    _mm_prefetch( in     + 2, _MM_HINT_T0 );
+    _mm_prefetch( inout0 + 2, _MM_HINT_T0 );
+    _mm_prefetch( inout1 + 2, _MM_HINT_T0 );
+    _mm_prefetch( in     + 4, _MM_HINT_T0 );
+    _mm_prefetch( inout0 + 4, _MM_HINT_T0 );
+    _mm_prefetch( inout1 + 4, _MM_HINT_T0 );
+    _mm_prefetch( in     + 6, _MM_HINT_T0 );
+    _mm_prefetch( inout0 + 6, _MM_HINT_T0 );
+    _mm_prefetch( inout1 + 6, _MM_HINT_T0 );
+
+    
+    for ( i = 0; i < nCols; i++ )
+    {

      //Absorbing "M[prev] [+] M[row*]"
+      inout.v256[0] = inout0[0];
+      inout.v256[1] = inout1[1];
+      inout.v256[2] = inout0[2];
+      inout.v256[3] = inout1[3];
+      inout.v256[4] = inout0[4];
+      inout.v256[5] = inout1[5];

-//         state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] );
-//         state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] );
-//         state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] );
-      t0 = mm512_concat_256( in1[0], in0[0] );
-      t1 = mm512_concat_256( in1[1], in0[1] );
-      t2 = mm512_concat_256( in1[2], in0[2] );
-      
      state0 = _mm512_xor_si512( state0,
-                                     _mm512_add_epi64( t0, inout[0] ) );
+                                 _mm512_add_epi64( in[0], inout.v512[0] ) );
      state1 = _mm512_xor_si512( state1,
-                                     _mm512_add_epi64( t1, inout[1] ) );
+                                 _mm512_add_epi64( in[1], inout.v512[1] ) );
      state2 = _mm512_xor_si512( state2,
-                                     _mm512_add_epi64( t2, inout[2] ) );
+                                 _mm512_add_epi64( in[2], inout.v512[2] ) );
+

      //Applies the reduced-round transformation f to the sponge's state
      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
@@ -293,22 +308,44 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
      out[1] = _mm512_xor_si512( out[1], state1 );
      out[2] = _mm512_xor_si512( out[2], state2 );

+      // if inout is the same row as out it was just overwritten, reload.
+      if ( rowOut == rowInOut0 )
+      {
+         inout.v256[0] = inout0[0];
+         inout.v256[2] = inout0[2];
+         inout.v256[4] = inout0[4];
+      }
+      if ( rowOut == rowInOut1 )
+      {
+         inout.v256[1] = inout1[1];
+         inout.v256[3] = inout1[3];
+         inout.v256[5] = inout1[5];
+      }
+
      //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
      t0 = _mm512_permutex_epi64( state0, 0x93 );
      t1 = _mm512_permutex_epi64( state1, 0x93 );
      t2 = _mm512_permutex_epi64( state2, 0x93 );

-      inout[0] = _mm512_xor_si512( inout[0],
-                                   _mm512_mask_blend_epi32( t0, t2, 0x03 ) );
-      inout[1] = _mm512_xor_si512( inout[1],
-                                   _mm512_mask_blend_epi32( t1, t0, 0x03 ) );
-      inout[2] = _mm512_xor_si512( inout[2],
-                                   _mm512_mask_blend_epi32( t2, t1, 0x03 ) );
+      inout.v512[0] = _mm512_xor_si512( inout.v512[0],
+                                   _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
+      inout.v512[1] = _mm512_xor_si512( inout.v512[1],
+                                   _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
+      inout.v512[2] = _mm512_xor_si512( inout.v512[2],
+                                   _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
+      
+      inout0[0] = inout.v256[0];
+      inout1[1] = inout.v256[1];
+      inout0[2] = inout.v256[2];
+      inout1[3] = inout.v256[3];
+      inout0[4] = inout.v256[4];
+      inout1[5] = inout.v256[5];

       //Goes to next block
-       in    += BLOCK_LEN_M256I * 2;
-       out   += BLOCK_LEN_M256I * 2;
-       inout += BLOCK_LEN_M256I * 2;
+       in     += BLOCK_LEN_M256I;
+       inout0 += BLOCK_LEN_M256I * 2;
+       inout1 += BLOCK_LEN_M256I * 2;
+       out    += BLOCK_LEN_M256I;
   }

   _mm512_store_si512( (__m512i*)State,     state0 );
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -375,7 +375,10 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
    {
       _mm_prefetch( out -  9, _MM_HINT_T0 );
       _mm_prefetch( out - 11, _MM_HINT_T0 );
-                   
+
+//printf("S RSR0 col= %d, out= %x\n",i,out);
+
+
       out[0] = state0;
       out[1] = state1;
       out[2] = state2;
@@ -706,11 +709,34 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
       out[1] = _mm256_xor_si256( state1, in[1] );
       out[2] = _mm256_xor_si256( state2, in[2] );

+/*
+printf("s duplexsetup col= %d\n",i); 
+uint64_t * o = (uint64_t*)out;
+printf("S out %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
+printf("S out %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
+printf("S out %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
+printf("S out %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
+printf("S out %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
+printf("S out %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
+*/
+
       //M[row*][col] = M[row*][col] XOR rotW(rand)
       t0 = _mm256_permute4x64_epi64( state0, 0x93 );
       t1 = _mm256_permute4x64_epi64( state1, 0x93 );
       t2 = _mm256_permute4x64_epi64( state2, 0x93 );

+/*
+uint64_t *t = (uint64_t*)&t0;
+printf("S t0 %016lx %016lx %016lx %016lx\n",t[0],t[1],t[2],t[3]);
+
+o = (uint64_t*)inout;
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
+printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
+*/       
       inout[0] = _mm256_xor_si256( inout[0],
                                    _mm256_blend_epi32( t0, t2, 0x03 ) );
       inout[1] = _mm256_xor_si256( inout[1],
@@ -718,7 +744,17 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
       inout[2] = _mm256_xor_si256( inout[2],
                                    _mm256_blend_epi32( t2, t1, 0x03 ) );

-       //Inputs: next column (i.e., next block in sequence)
+/*
+o = (uint64_t*)inout;
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
+printf("S inout1 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
+*/
+
+//Inputs: next column (i.e., next block in sequence)
       in    += BLOCK_LEN_M256I;
       inout += BLOCK_LEN_M256I;
       //Output: goes to previous column
@@ -949,6 +985,22 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
      _mm_prefetch( inout +  9, _MM_HINT_T0 );
      _mm_prefetch( inout + 11, _MM_HINT_T0 );

+/*
+uint64_t *io = (uint64_t*)inout;
+uint64_t *ii = (uint64_t*)in;
+
+printf("RDRS1 col= %d\n", i);
+printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
+printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
+printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
+printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
+printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]);
+printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]);
+printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]);
+printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]);
+*/
+
+
      //Absorbing "M[prev] [+] M[row*]"
      state0 = _mm256_xor_si256( state0,
                                     _mm256_add_epi64( in[0], inout[0] ) );
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -203,24 +203,36 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+// Pointer overlay: a single stored address that can be read back as a
+// pointer to 512-bit vectors, 256-bit vectors or 64-bit words.
+typedef union _povly
+{
+   __m512i *v512;
+   __m256i *v256;
+   uint64_t *u64;
+} povly;
+
 //---- Housekeeping
-void initState_2way( uint64_t state[/*16*/] );
+void initState_2way( uint64_t State[/*16*/] );

 //---- Squeezes
-void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len );
+void squeeze_2way( uint64_t *State, unsigned char *out, unsigned int len );
 void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );

 //---- Absorbs
-void absorbBlock_2way( uint64_t *state, const uint64_t *in );
-void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in,
+void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
+                       const uint64_t *In1 );
+void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
                            const uint64_t nBlocks, const uint64_t block_len );

 //---- Duplexes
-void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
+void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
                             uint64_t *rowOut, uint64_t nCols);
-void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
+void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
                    uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
-void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
+
+void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
+                            uint64_t *rowInOut0, uint64_t *rowInOut1,
+                            uint64_t *rowOut, uint64_t nCols);

 #endif

--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
--- a/algo/quark/hmq1725-gate.c
+++ b/algo/quark/hmq1725-gate.c
@@ -2,7 +2,10 @@

 bool register_hmq1725_algo( algo_gate_t* gate )
 {
-#if defined(HMQ1725_4WAY)
+#if defined(HMQ1725_8WAY)
+  gate->scanhash  = (void*)&scanhash_hmq1725_8way;
+  gate->hash      = (void*)&hmq1725_8way_hash;
+#elif defined(HMQ1725_4WAY)
  gate->scanhash  = (void*)&scanhash_hmq1725_4way;
  gate->hash      = (void*)&hmq1725_4way_hash;
 #else
@@ -10,7 +13,7 @@ bool register_hmq1725_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_hmq1725;
  gate->hash      = (void*)&hmq1725hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  opt_target_factor = 65536.0;
  return true;
 };
--- a/algo/quark/hmq1725-gate.h
+++ b/algo/quark/hmq1725-gate.h
@@ -4,13 +4,21 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-//  #define HMQ1725_4WAY 1
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define HMQ1725_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define HMQ1725_4WAY 1
 #endif

 bool register_hmq1725_algo( algo_gate_t* gate );

-#if defined(HMQ1725_4WAY)
+#if defined(HMQ1725_8WAY)
+// 8 lane implementation, registered when HMQ1725_8WAY is defined
+void hmq1725_8way_hash( void *state, const void *input );
+int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(HMQ1725_4WAY)

 void hmq1725_4way_hash( void *state, const void *input );
 int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -333,6 +333,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFFFFF)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -346,6 +347,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFFFF0)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -359,6 +361,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFFF00)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -372,6 +375,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFF000)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -386,6 +390,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFF0000)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -399,6 +404,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			hmq1725hash(hash64, endiandata);
 			if (fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
+            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -63,20 +63,6 @@ void quark_8way_hash( void *state, const void *input )
    bmw512_8way_update( &ctx.bmw, vhash, 64 );
    bmw512_8way_close( &ctx.bmw, vhash );

-// AVX 512 cmpeq returns a bit mask instead of a vector mask.
-// This should simplify things but the logic doesn't seem to be working.
-// The problem appears to be related to the test to skip a hash if it isn't
-// to be used. Skipping the test for all 8 way hashes seems to have
-// fixed it. The hash selection blending works if the hash is produced
-// but the hash wasn't being produced when it should.
-// Both decisions are based on the same data, the __mmask8. It works
-// as a blend mask but not in a logical comparison, maybe the type is the
-// problem. Maybe a cast to int or movm is needed to make it work.
-// It's now moot because the hash can only be skipped 1 in 256 iterations
-// when hashing parallel 8 ways.
-// The performance impact of the workaround should be negligible.
-// It's a problem for another day.
-
    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
                                       zero );

--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -10,7 +10,140 @@
 #define LBRY_MIDSTATE    64
 #define LBRY_TAIL (LBRY_INPUT_SIZE) - (LBRY_MIDSTATE)

-#if defined(LBRY_8WAY)
+#if defined(LBRY_16WAY)
+
+static __thread sha256_16way_context sha256_16w_mid;
+// 16 lane LBRY hash: sha256d, sha512, ripemd160 of both halves, sha256d.
+void lbry_16way_hash( void* output, const void* input )
+{
+   uint32_t _ALIGN(128) vhashA[16<<4];
+   uint32_t _ALIGN(64) vhashB[16<<4];
+   uint32_t _ALIGN(64) vhashC[16<<4];
+   uint32_t _ALIGN(64) h0[32];
+   uint32_t _ALIGN(64) h1[32];
+   uint32_t _ALIGN(64) h2[32];
+   uint32_t _ALIGN(64) h3[32];
+   uint32_t _ALIGN(64) h4[32];
+   uint32_t _ALIGN(64) h5[32];
+   uint32_t _ALIGN(64) h6[32];
+   uint32_t _ALIGN(64) h7[32];
+   uint32_t _ALIGN(64) h8[32];
+   uint32_t _ALIGN(64) h9[32];
+   uint32_t _ALIGN(64) h10[32];
+   uint32_t _ALIGN(64) h11[32];
+   uint32_t _ALIGN(64) h12[32];
+   uint32_t _ALIGN(64) h13[32];
+   uint32_t _ALIGN(64) h14[32];
+   uint32_t _ALIGN(64) h15[32];
+   sha256_16way_context     ctx_sha256 __attribute__ ((aligned (64)));
+   sha512_8way_context     ctx_sha512;
+   ripemd160_16way_context  ctx_ripemd;
+   // resume from the per-thread midstate, then hash the remaining tail
+   memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
+   sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
+   sha256_16way_close( &ctx_sha256, vhashA );
+
+   sha256_16way_init( &ctx_sha256 );
+   sha256_16way_update( &ctx_sha256, vhashA, 32 );
+   sha256_16way_close( &ctx_sha256, vhashA );
+
+   // reinterleave as two 8x64 groups to do sha512 8-way 64 bit twice.
+   dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
+                  h8, h9, h10, h11, h12, h13, h14, h15, vhashA, 256 );
+   intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
+   intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );
+
+   sha512_8way_init( &ctx_sha512 );
+   sha512_8way_update( &ctx_sha512, vhashA, 32 );
+   sha512_8way_close( &ctx_sha512, vhashA );
+
+   sha512_8way_init( &ctx_sha512 );
+   sha512_8way_update( &ctx_sha512, vhashB, 32 );
+   sha512_8way_close( &ctx_sha512, vhashB );
+
+   // back to 16-way 32 bit
+   dintrlv_8x64( h0, h1, h2, h3,h4, h5, h6, h7, vhashA, 512 );
+   dintrlv_8x64( h8, h9, h10, h11, h12, h13, h14, h15, vhashB, 512 );
+   intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
+                         h8, h9, h10, h11, h12, h13, h14, h15, 512 );
+   // ripemd160 of the first 32 bytes of each 64 byte sha512 digest
+   ripemd160_16way_init( &ctx_ripemd );
+   ripemd160_16way_update( &ctx_ripemd, vhashA, 32 );
+   ripemd160_16way_close( &ctx_ripemd, vhashB );
+   // and of the second 32 bytes, 8 rows of 16 lanes further on
+   ripemd160_16way_init( &ctx_ripemd );
+   ripemd160_16way_update( &ctx_ripemd, vhashA+(8<<4), 32 );
+   ripemd160_16way_close( &ctx_ripemd, vhashC );
+   // sha256d over the two 20 byte ripemd160 digests
+   sha256_16way_init( &ctx_sha256 );
+   sha256_16way_update( &ctx_sha256, vhashB, 20 );
+   sha256_16way_update( &ctx_sha256, vhashC, 20 );
+   sha256_16way_close( &ctx_sha256, vhashA );
+
+   sha256_16way_init( &ctx_sha256 );
+   sha256_16way_update( &ctx_sha256, vhashA, 32 );
+   sha256_16way_close( &ctx_sha256, output );
+}
+// Scan 16 nonces per pass and submit each lane whose hash meets the target.
+int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[32*16] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<4]);    // row 7: word 7 of all 16 lanes
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[27];
+   const uint32_t first_nonce = pdata[27];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t edata[32] __attribute__ ((aligned (64)));
+   __m512i  *noncev = (__m512i*)vdata + 27;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   // we need bigendian data...
+   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
+   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
+   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
+   intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
+        edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
+   // hash the first LBRY_MIDSTATE bytes once, every nonce resumes from there
+   sha256_16way_init( &sha256_16w_mid );
+   sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
+
+   do
+   {
+      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
+                                                  n+11, n+10, n+ 9, n+ 8,
+                                                  n+ 7, n+ 6, n+ 5, n+ 4,
+                                                  n+ 3, n+ 2, n+ 1, n ) );
+      lbry_16way_hash( hash, vdata );
+      // compare word 7 of each lane with the target first, before fulltest
+      for ( int i = 0; i < 16; i++ )
+      if ( unlikely( hash7[ i ] <= Htarg ) )
+      {
+         // deinterleave hash for lane
+         extr_lane_16x32( lane_hash, hash, i, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+            pdata[27] = n + i;
+            submit_lane_solution( work, lane_hash, mythr, i );
+         }
+      }
+      n += 16;
+   } while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+
+
+#elif defined(LBRY_8WAY)

 static __thread sha256_8way_context sha256_8w_mid;

@@ -91,11 +224,6 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
   __m256i  *noncev = (__m256i*)vdata + 27;   // aligned
   int thr_id = mythr->id;  // thr_id arg is deprecated

-   uint64_t htmax[] = {          0,        0xF,       0xFF,
-                             0xFFF,     0xFFFF, 0x10000000 };
-   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
-                        0xFFFFF000, 0xFFFF0000,          0 };
-
   // we need bigendian data...
   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
@@ -106,33 +234,30 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
   intrlv_8x32( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 1024 );
+                       edata, edata, edata, edata, 1024 );
+
   sha256_8way_init( &sha256_8w_mid );
   sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );

-   for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] )
+   do
   {
-      uint32_t mask = masks[m];
-      do
-      {
-        *noncev = mm256_bswap_32( _mm256_set_epi32(
-                                          n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
-         lbry_8way_hash( hash, vdata );
+      *noncev = mm256_bswap_32( _mm256_set_epi32(
+                                       n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
+      lbry_8way_hash( hash, vdata );

-         for ( int i = 0; i < 8; i++ )  if ( !( hash7[ i ] & mask ) )
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( hash7[ i ] <= Htarg ) )
+      {
+         // deinterleave hash for lane
+         extr_lane_8x32( lane_hash, hash, i, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
-            // deinterleave hash for lane
-            extr_lane_8x32( lane_hash, hash, i, 256 );
-            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-            {
-              pdata[27] = n + i;
-              submit_lane_solution( work, lane_hash, mythr, i );
-            }
+            pdata[27] = n + i;
+            submit_lane_solution( work, lane_hash, mythr, i );
         }
-         n += 8;
-      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
-      break;
-   }
+      }
+      n += 8;
+   } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -98,16 +98,23 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }

 bool register_lbry_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT | SHA_OPT;
-#if defined (LBRY_8WAY)
+  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
+#if defined (LBRY_16WAY)
+  gate->scanhash              = (void*)&scanhash_lbry_16way;
+  gate->hash                  = (void*)&lbry_16way_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
+#elif defined (LBRY_8WAY)
  gate->scanhash              = (void*)&scanhash_lbry_8way;
  gate->hash                  = (void*)&lbry_8way_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
 #elif defined (LBRY_4WAY)
  gate->scanhash              = (void*)&scanhash_lbry_4way;
  gate->hash                  = (void*)&lbry_4way_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
 #else 
  gate->scanhash              = (void*)&scanhash_lbry;
  gate->hash                  = (void*)&lbry_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
 #endif
  gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
  gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
--- a/algo/ripemd/lbry-gate.h
+++ b/algo/ripemd/lbry-gate.h
@@ -4,11 +4,20 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

+
+// 16 way is disabled for now, it needs a 16 way sha256.
+//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+//  #define LBRY_16WAY
+#if defined(__AVX2__)
+  #define LBRY_8WAY
+#endif
+/*
 #if !defined(__SHA__)
 #if defined(__AVX2__)
  #define LBRY_8WAY
 #endif
 #endif
+*/

 #define LBRY_NTIME_INDEX 25
 #define LBRY_NBITS_INDEX 26
@@ -18,7 +27,12 @@

 bool register_lbry_algo( algo_gate_t* gate );

-#if defined(LBRY_8WAY)
+#if defined(LBRY_16WAY)
+// 16 lane AVX512 implementation, only declared when LBRY_16WAY is defined
+void lbry_16way_hash( void *state, const void *input );
+int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+#elif defined(LBRY_8WAY)

 void lbry_8way_hash( void *state, const void *input );
 int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
--- a/algo/ripemd/lbry.c
+++ b/algo/ripemd/lbry.c
@@ -80,9 +80,6 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
 	// we need bigendian data...
        swab32_array( endiandata, pdata, 32 );

-#ifdef DEBUG_ALGO
-	printf("[%d] Htarg=%X\n", thr_id, Htarg);
-#endif
 	for (int m=0; m < sizeof(masks); m++) {
 		if (Htarg <= htmax[m]) {
 			uint32_t mask = masks[m];
@@ -90,23 +87,11 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
 				pdata[27] = ++n;
 				be32enc(&endiandata[27], n);
 				lbry_hash(hash64, &endiandata);
-#ifndef DEBUG_ALGO
 				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
-					*hashes_done = n - first_nonce + 1;
-					return true;
+               pdata[27] = n;
+               submit_solution( work, hash64, mythr );
 				}
-#else
-				if (!(n % 0x1000) && !thr_id) printf(".");
-				if (!(hash64[7] & mask)) {
-					printf("[%d]",thr_id);
-					if (fulltest(hash64, ptarget)) {
-						*hashes_done = n - first_nonce + 1;
-						return true;
-					}
-				}
-#endif
-			} while (n < max_nonce && !work_restart[thr_id].restart);
-			// see blake.c if else to understand the loop on htmax => mask
+			} while ( (n < max_nonce -8) && !work_restart[thr_id].restart);
 			break;
 		}
 	}
--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -623,3 +623,303 @@ void ripemd160_8way_close( ripemd160_8way_context  *sc, void *dst )

 #endif // __AVX2__

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+//  RIPEMD-160 16 way
+// F16W_1..5: the five RIPEMD-160 round functions, on 16 lanes of 32 bits.
+
+#define F16W_1(x, y, z) \
+   _mm512_xor_si512( _mm512_xor_si512( x, y ), z )
+
+#define F16W_2(x, y, z) \
+   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( y, z ), x ), z )
+
+#define F16W_3(x, y, z) \
+   _mm512_xor_si512( _mm512_or_si512( x, mm512_not( y ) ), z )
+
+#define F16W_4(x, y, z) \
+   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( x, y ), z ), y )
+
+#define F16W_5(x, y, z) \
+   _mm512_xor_si512( x, _mm512_or_si512( y, mm512_not( z ) ) )
+// One step: a = rol32( a + f(b,c,d) + r + k, s ) + e;  c = rol32( c, 10 ).
+#define RR_16W(a, b, c, d, e, f, s, r, k) \
+do{ \
+   a = _mm512_add_epi32( mm512_rol_32( _mm512_add_epi32( _mm512_add_epi32( \
+                _mm512_add_epi32( a, f( b ,c, d ) ), r ), \
+                                 m512_const1_64( k ) ), s ), e ); \
+   c = mm512_rol_32( c, 10 );\
+} while (0)
+// ROUND1 uses registers A1..E1 and constant K1<k>, ROUND2 A2..E2 and K2<k>.
+#define ROUND1_16W(a, b, c, d, e, f, s, r, k)  \
+        RR_16W(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
+
+#define ROUND2_16W(a, b, c, d, e, f, s, r, k)  \
+        RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
+// Compress the 64 byte block in sc->buf into sc->val, for all 16 lanes.
+static void ripemd160_16way_round( ripemd160_16way_context *sc )
+{
+   const __m512i *in = (__m512i*)sc->buf;
+   __m512i *h  = (__m512i*)sc->val;
+   register __m512i A1, B1, C1, D1, E1;
+   register __m512i A2, B2, C2, D2, E2;
+   __m512i tmp;
+
+   A1 = A2 = h[0];
+   B1 = B2 = h[1];
+   C1 = C2 = h[2];
+   D1 = D2 = h[3];
+   E1 = E2 = h[4];
+   // left line: rounds 1 to 5 use F16W_1..F16W_5 with constants K11..K15
+   ROUND1_16W( A, B, C, D, E, F16W_1, 11, in[ 0], 1 );
+   ROUND1_16W( E, A, B, C, D, F16W_1, 14, in[ 1], 1 );
+   ROUND1_16W( D, E, A, B, C, F16W_1, 15, in[ 2], 1 );
+   ROUND1_16W( C, D, E, A, B, F16W_1, 12, in[ 3], 1 );
+   ROUND1_16W( B, C, D, E, A, F16W_1,  5, in[ 4], 1 );
+   ROUND1_16W( A, B, C, D, E, F16W_1,  8, in[ 5], 1 );
+   ROUND1_16W( E, A, B, C, D, F16W_1,  7, in[ 6], 1 );
+   ROUND1_16W( D, E, A, B, C, F16W_1,  9, in[ 7], 1 );
+   ROUND1_16W( C, D, E, A, B, F16W_1, 11, in[ 8], 1 );
+   ROUND1_16W( B, C, D, E, A, F16W_1, 13, in[ 9], 1 );
+   ROUND1_16W( A, B, C, D, E, F16W_1, 14, in[10], 1 );
+   ROUND1_16W( E, A, B, C, D, F16W_1, 15, in[11], 1 );
+   ROUND1_16W( D, E, A, B, C, F16W_1,  6, in[12], 1 );
+   ROUND1_16W( C, D, E, A, B, F16W_1,  7, in[13], 1 );
+   ROUND1_16W( B, C, D, E, A, F16W_1,  9, in[14], 1 );
+   ROUND1_16W( A, B, C, D, E, F16W_1,  8, in[15], 1 );
+
+   ROUND1_16W( E, A, B, C, D, F16W_2,  7, in[ 7], 2 );
+   ROUND1_16W( D, E, A, B, C, F16W_2,  6, in[ 4], 2 );
+   ROUND1_16W( C, D, E, A, B, F16W_2,  8, in[13], 2 );
+   ROUND1_16W( B, C, D, E, A, F16W_2, 13, in[ 1], 2 );
+   ROUND1_16W( A, B, C, D, E, F16W_2, 11, in[10], 2 );
+   ROUND1_16W( E, A, B, C, D, F16W_2,  9, in[ 6], 2 );
+   ROUND1_16W( D, E, A, B, C, F16W_2,  7, in[15], 2 );
+   ROUND1_16W( C, D, E, A, B, F16W_2, 15, in[ 3], 2 );
+   ROUND1_16W( B, C, D, E, A, F16W_2,  7, in[12], 2 );
+   ROUND1_16W( A, B, C, D, E, F16W_2, 12, in[ 0], 2 );
+   ROUND1_16W( E, A, B, C, D, F16W_2, 15, in[ 9], 2 );
+   ROUND1_16W( D, E, A, B, C, F16W_2,  9, in[ 5], 2 );
+   ROUND1_16W( C, D, E, A, B, F16W_2, 11, in[ 2], 2 );
+   ROUND1_16W( B, C, D, E, A, F16W_2,  7, in[14], 2 );
+   ROUND1_16W( A, B, C, D, E, F16W_2, 13, in[11], 2 );
+   ROUND1_16W( E, A, B, C, D, F16W_2, 12, in[ 8], 2 );
+
+   ROUND1_16W( D, E, A, B, C, F16W_3, 11, in[ 3], 3 );
+   ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[10], 3 );
+   ROUND1_16W( B, C, D, E, A, F16W_3,  6, in[14], 3 );
+   ROUND1_16W( A, B, C, D, E, F16W_3,  7, in[ 4], 3 );
+   ROUND1_16W( E, A, B, C, D, F16W_3, 14, in[ 9], 3 );
+   ROUND1_16W( D, E, A, B, C, F16W_3,  9, in[15], 3 );
+   ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[ 8], 3 );
+   ROUND1_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
+   ROUND1_16W( A, B, C, D, E, F16W_3, 14, in[ 2], 3 );
+   ROUND1_16W( E, A, B, C, D, F16W_3,  8, in[ 7], 3 );
+   ROUND1_16W( D, E, A, B, C, F16W_3, 13, in[ 0], 3 );
+   ROUND1_16W( C, D, E, A, B, F16W_3,  6, in[ 6], 3 );
+   ROUND1_16W( B, C, D, E, A, F16W_3,  5, in[13], 3 );
+   ROUND1_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
+   ROUND1_16W( E, A, B, C, D, F16W_3,  7, in[ 5], 3 );
+   ROUND1_16W( D, E, A, B, C, F16W_3,  5, in[12], 3 );
+
+   ROUND1_16W( C, D, E, A, B, F16W_4, 11, in[ 1], 4 );
+   ROUND1_16W( B, C, D, E, A, F16W_4, 12, in[ 9], 4 );
+   ROUND1_16W( A, B, C, D, E, F16W_4, 14, in[11], 4 );
+   ROUND1_16W( E, A, B, C, D, F16W_4, 15, in[10], 4 );
+   ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 0], 4 );
+   ROUND1_16W( C, D, E, A, B, F16W_4, 15, in[ 8], 4 );
+   ROUND1_16W( B, C, D, E, A, F16W_4,  9, in[12], 4 );
+   ROUND1_16W( A, B, C, D, E, F16W_4,  8, in[ 4], 4 );
+   ROUND1_16W( E, A, B, C, D, F16W_4,  9, in[13], 4 );
+   ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 3], 4 );
+   ROUND1_16W( C, D, E, A, B, F16W_4,  5, in[ 7], 4 );
+   ROUND1_16W( B, C, D, E, A, F16W_4,  6, in[15], 4 );
+   ROUND1_16W( A, B, C, D, E, F16W_4,  8, in[14], 4 );
+   ROUND1_16W( E, A, B, C, D, F16W_4,  6, in[ 5], 4 );
+   ROUND1_16W( D, E, A, B, C, F16W_4,  5, in[ 6], 4 );
+   ROUND1_16W( C, D, E, A, B, F16W_4, 12, in[ 2], 4 );
+
+   ROUND1_16W( B, C, D, E, A, F16W_5,  9, in[ 4], 5 );
+   ROUND1_16W( A, B, C, D, E, F16W_5, 15, in[ 0], 5 );
+   ROUND1_16W( E, A, B, C, D, F16W_5,  5, in[ 5], 5 );
+   ROUND1_16W( D, E, A, B, C, F16W_5, 11, in[ 9], 5 );
+   ROUND1_16W( C, D, E, A, B, F16W_5,  6, in[ 7], 5 );
+   ROUND1_16W( B, C, D, E, A, F16W_5,  8, in[12], 5 );
+   ROUND1_16W( A, B, C, D, E, F16W_5, 13, in[ 2], 5 );
+   ROUND1_16W( E, A, B, C, D, F16W_5, 12, in[10], 5 );
+   ROUND1_16W( D, E, A, B, C, F16W_5,  5, in[14], 5 );
+   ROUND1_16W( C, D, E, A, B, F16W_5, 12, in[ 1], 5 );
+   ROUND1_16W( B, C, D, E, A, F16W_5, 13, in[ 3], 5 );
+   ROUND1_16W( A, B, C, D, E, F16W_5, 14, in[ 8], 5 );
+   ROUND1_16W( E, A, B, C, D, F16W_5, 11, in[11], 5 );
+   ROUND1_16W( D, E, A, B, C, F16W_5,  8, in[ 6], 5 );
+   ROUND1_16W( C, D, E, A, B, F16W_5,  5, in[15], 5 );
+   ROUND1_16W( B, C, D, E, A, F16W_5,  6, in[13], 5 );
+   // right line: same block, functions in reverse order, constants K21..K25
+   ROUND2_16W( A, B, C, D, E, F16W_5,  8, in[ 5], 1 );
+   ROUND2_16W( E, A, B, C, D, F16W_5,  9, in[14], 1 );
+   ROUND2_16W( D, E, A, B, C, F16W_5,  9, in[ 7], 1 );
+   ROUND2_16W( C, D, E, A, B, F16W_5, 11, in[ 0], 1 );
+   ROUND2_16W( B, C, D, E, A, F16W_5, 13, in[ 9], 1 );
+   ROUND2_16W( A, B, C, D, E, F16W_5, 15, in[ 2], 1 );
+   ROUND2_16W( E, A, B, C, D, F16W_5, 15, in[11], 1 );
+   ROUND2_16W( D, E, A, B, C, F16W_5,  5, in[ 4], 1 );
+   ROUND2_16W( C, D, E, A, B, F16W_5,  7, in[13], 1 );
+   ROUND2_16W( B, C, D, E, A, F16W_5,  7, in[ 6], 1 );
+   ROUND2_16W( A, B, C, D, E, F16W_5,  8, in[15], 1 );
+   ROUND2_16W( E, A, B, C, D, F16W_5, 11, in[ 8], 1 );
+   ROUND2_16W( D, E, A, B, C, F16W_5, 14, in[ 1], 1 );
+   ROUND2_16W( C, D, E, A, B, F16W_5, 14, in[10], 1 );
+   ROUND2_16W( B, C, D, E, A, F16W_5, 12, in[ 3], 1 );
+   ROUND2_16W( A, B, C, D, E, F16W_5,  6, in[12], 1 );
+
+   ROUND2_16W( E, A, B, C, D, F16W_4,  9, in[ 6], 2 );
+   ROUND2_16W( D, E, A, B, C, F16W_4, 13, in[11], 2 );
+   ROUND2_16W( C, D, E, A, B, F16W_4, 15, in[ 3], 2 );
+   ROUND2_16W( B, C, D, E, A, F16W_4,  7, in[ 7], 2 );
+   ROUND2_16W( A, B, C, D, E, F16W_4, 12, in[ 0], 2 );
+   ROUND2_16W( E, A, B, C, D, F16W_4,  8, in[13], 2 );
+   ROUND2_16W( D, E, A, B, C, F16W_4,  9, in[ 5], 2 );
+   ROUND2_16W( C, D, E, A, B, F16W_4, 11, in[10], 2 );
+   ROUND2_16W( B, C, D, E, A, F16W_4,  7, in[14], 2 );
+   ROUND2_16W( A, B, C, D, E, F16W_4,  7, in[15], 2 );
+   ROUND2_16W( E, A, B, C, D, F16W_4, 12, in[ 8], 2 );
+   ROUND2_16W( D, E, A, B, C, F16W_4,  7, in[12], 2 );
+   ROUND2_16W( C, D, E, A, B, F16W_4,  6, in[ 4], 2 );
+   ROUND2_16W( B, C, D, E, A, F16W_4, 15, in[ 9], 2 );
+   ROUND2_16W( A, B, C, D, E, F16W_4, 13, in[ 1], 2 );
+   ROUND2_16W( E, A, B, C, D, F16W_4, 11, in[ 2], 2 );
+
+   ROUND2_16W( D, E, A, B, C, F16W_3,  9, in[15], 3 );
+   ROUND2_16W( C, D, E, A, B, F16W_3,  7, in[ 5], 3 );
+   ROUND2_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
+   ROUND2_16W( A, B, C, D, E, F16W_3, 11, in[ 3], 3 );
+   ROUND2_16W( E, A, B, C, D, F16W_3,  8, in[ 7], 3 );
+   ROUND2_16W( D, E, A, B, C, F16W_3,  6, in[14], 3 );
+   ROUND2_16W( C, D, E, A, B, F16W_3,  6, in[ 6], 3 );
+   ROUND2_16W( B, C, D, E, A, F16W_3, 14, in[ 9], 3 );
+   ROUND2_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
+   ROUND2_16W( E, A, B, C, D, F16W_3, 13, in[ 8], 3 );
+   ROUND2_16W( D, E, A, B, C, F16W_3,  5, in[12], 3 );
+   ROUND2_16W( C, D, E, A, B, F16W_3, 14, in[ 2], 3 );
+   ROUND2_16W( B, C, D, E, A, F16W_3, 13, in[10], 3 );
+   ROUND2_16W( A, B, C, D, E, F16W_3, 13, in[ 0], 3 );
+   ROUND2_16W( E, A, B, C, D, F16W_3,  7, in[ 4], 3 );
+   ROUND2_16W( D, E, A, B, C, F16W_3,  5, in[13], 3 );
+
+   ROUND2_16W( C, D, E, A, B, F16W_2, 15, in[ 8], 4 );
+   ROUND2_16W( B, C, D, E, A, F16W_2,  5, in[ 6], 4 );
+   ROUND2_16W( A, B, C, D, E, F16W_2,  8, in[ 4], 4 );
+   ROUND2_16W( E, A, B, C, D, F16W_2, 11, in[ 1], 4 );
+   ROUND2_16W( D, E, A, B, C, F16W_2, 14, in[ 3], 4 );
+   ROUND2_16W( C, D, E, A, B, F16W_2, 14, in[11], 4 );
+   ROUND2_16W( B, C, D, E, A, F16W_2,  6, in[15], 4 );
+   ROUND2_16W( A, B, C, D, E, F16W_2, 14, in[ 0], 4 );
+   ROUND2_16W( E, A, B, C, D, F16W_2,  6, in[ 5], 4 );
+   ROUND2_16W( D, E, A, B, C, F16W_2,  9, in[12], 4 );
+   ROUND2_16W( C, D, E, A, B, F16W_2, 12, in[ 2], 4 );
+   ROUND2_16W( B, C, D, E, A, F16W_2,  9, in[13], 4 );
+   ROUND2_16W( A, B, C, D, E, F16W_2, 12, in[ 9], 4 );
+   ROUND2_16W( E, A, B, C, D, F16W_2,  5, in[ 7], 4 );
+   ROUND2_16W( D, E, A, B, C, F16W_2, 15, in[10], 4 );
+   ROUND2_16W( C, D, E, A, B, F16W_2,  8, in[14], 4 );
+
+   ROUND2_16W( B, C, D, E, A, F16W_1,  8, in[12], 5 );
+   ROUND2_16W( A, B, C, D, E, F16W_1,  5, in[15], 5 );
+   ROUND2_16W( E, A, B, C, D, F16W_1, 12, in[10], 5 );
+   ROUND2_16W( D, E, A, B, C, F16W_1,  9, in[ 4], 5 );
+   ROUND2_16W( C, D, E, A, B, F16W_1, 12, in[ 1], 5 );
+   ROUND2_16W( B, C, D, E, A, F16W_1,  5, in[ 5], 5 );
+   ROUND2_16W( A, B, C, D, E, F16W_1, 14, in[ 8], 5 );
+   ROUND2_16W( E, A, B, C, D, F16W_1,  6, in[ 7], 5 );
+   ROUND2_16W( D, E, A, B, C, F16W_1,  8, in[ 6], 5 );
+   ROUND2_16W( C, D, E, A, B, F16W_1, 13, in[ 2], 5 );
+   ROUND2_16W( B, C, D, E, A, F16W_1,  6, in[13], 5 );
+   ROUND2_16W( A, B, C, D, E, F16W_1,  5, in[14], 5 );
+   ROUND2_16W( E, A, B, C, D, F16W_1, 15, in[ 0], 5 );
+   ROUND2_16W( D, E, A, B, C, F16W_1, 13, in[ 3], 5 );
+   ROUND2_16W( C, D, E, A, B, F16W_1, 11, in[ 9], 5 );
+   ROUND2_16W( B, C, D, E, A, F16W_1, 11, in[11], 5 );
+   // merge both lines into the state: new h[i] derives from old h[i+1]
+   tmp =  _mm512_add_epi32( _mm512_add_epi32( h[1], C1 ), D2 );
+   h[1] = _mm512_add_epi32( _mm512_add_epi32( h[2], D1 ), E2 );
+   h[2] = _mm512_add_epi32( _mm512_add_epi32( h[3], E1 ), A2 );
+   h[3] = _mm512_add_epi32( _mm512_add_epi32( h[4], A1 ), B2 );
+   h[4] = _mm512_add_epi32( _mm512_add_epi32( h[0], B1 ), C2 );
+   h[0] = tmp;
+}
+// Load the RIPEMD-160 initial state into every lane and reset the counter.
+void ripemd160_16way_init( ripemd160_16way_context *sc )
+{
+   sc->val[0] = m512_const1_64( 0x6745230167452301 );
+   sc->val[1] = m512_const1_64( 0xEFCDAB89EFCDAB89 );
+   sc->val[2] = m512_const1_64( 0x98BADCFE98BADCFE );
+   sc->val[3] = m512_const1_64( 0x1032547610325476 );
+   sc->val[4] = m512_const1_64( 0xC3D2E1F0C3D2E1F0 );
+   sc->count_high = sc->count_low = 0;
+}
+// Absorb len bytes per lane of interleaved data, compressing each full block.
+void ripemd160_16way( ripemd160_16way_context *sc, const void *data,
+                      size_t len )
+{
+   __m512i *vdata = (__m512i*)data;
+   size_t ptr;
+   const int block_size = 64;
+   // ptr is the byte offset in the current block, one buf row per 4 bytes
+   ptr = (unsigned)sc->count_low & (block_size - 1U);
+   while ( len > 0 )
+   {
+      size_t clen;
+      uint32_t clow, clow2;
+
+      clen = block_size - ptr;
+      if ( clen > len )
+         clen = len;
+      memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 );
+      vdata = vdata + (clen>>2);
+      ptr += clen;
+      len -= clen;
+      if ( ptr == block_size )
+      {
+         ripemd160_16way_round( sc );
+         ptr = 0;
+      }
+      clow = sc->count_low;
+      clow2 = clow + clen;
+      sc->count_low = clow2;
+      if ( clow2 < clow )      // low word wrapped, carry into the high word
+         sc->count_high++;
+   }
+}
+// Pad, append the bit length, run the last block(s), store 5 rows to dst.
+void ripemd160_16way_close( ripemd160_16way_context  *sc, void *dst )
+{
+   unsigned ptr, u;
+   uint32_t low, high;
+   const int block_size = 64;
+   const int pad = block_size - 8;
+   // padding marker 0x80 goes in the next 32 bit word of every lane
+   ptr = (unsigned)sc->count_low & ( block_size - 1U);
+   sc->buf[ ptr>>2 ] = m512_const1_32( 0x80 );
+   ptr += 4;
+   // no room left for the 8 byte length: flush this block, use another
+   if ( ptr > pad )
+   {
+       memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
+       ripemd160_16way_round( sc );
+       memset_zero_512( sc->buf, pad>>2 );
+   }
+   else
+       memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
+    // message length in bits, low word first
+    low = sc->count_low;
+    high = (sc->count_high << 3) | (low >> 29);
+    low = low << 3;
+    sc->buf[  pad>>2      ] = _mm512_set1_epi32( low  );
+    sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
+    ripemd160_16way_round( sc );
+    for (u = 0; u < 5; u ++)
+        casti_m512i( dst, u ) = sc->val[u];
+}
+
+#endif  // AVX512
--- a/algo/ripemd/ripemd-hash-4way.h
+++ b/algo/ripemd/ripemd-hash-4way.h
@@ -32,7 +32,21 @@ void ripemd160_8way_init( ripemd160_8way_context *sc );
 void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len );
 void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+typedef struct
+{
+   __m512i buf[64>>2];               // one 64 byte block, a row per 32 bit word
+   __m512i val[5];                   // chaining state, a row per 32 bit word
+   uint32_t count_high, count_low;   // bytes absorbed per lane, as two halves
+} __attribute__ ((aligned (128))) ripemd160_16way_context;
+#define ripemd160_16way ripemd160_16way_update    // old name stays usable
+void ripemd160_16way_init( ripemd160_16way_context *sc );
+void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
+                             size_t len );
+void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );
+
+#endif // AVX512
 #endif // __AVX2__
 #endif // __SSE4_2__
 #endif // RIPEMD_HASH_4WAY_H__
--- a/algo/sha/sha-hash-4way.h
+++ b/algo/sha/sha-hash-4way.h
@@ -56,7 +56,7 @@ typedef struct {
   __m128i val[8];
   uint32_t count_high, count_low;
   bool initialized;
-} sha256_4way_context;
+} sha256_4way_context __attribute__ ((aligned (64)));

 void sha256_4way_init( sha256_4way_context *sc );
 void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
@@ -71,10 +71,11 @@ typedef struct {
   __m256i val[8];
   uint32_t count_high, count_low;
   bool initialized;
-} sha256_8way_context;
+} sha256_8way_context __attribute__ ((aligned (128)));

 void sha256_8way_init( sha256_8way_context *sc );
-void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
+void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
+#define sha256_8way sha256_8way_update
 void sha256_8way_close( sha256_8way_context *sc, void *dst );

 //#define SPH_SIZE_sha512   512
@@ -86,30 +87,32 @@ typedef struct {
   __m256i val[8];
   uint64_t count;
   bool initialized;
-} sha512_4way_context;
+} sha512_4way_context __attribute__ ((aligned (128)));

 void sha512_4way_init( sha512_4way_context *sc);
-void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
+void sha512_4way_update( sha512_4way_context *sc, const void *data,
+                         size_t len );
+#define sha512_4way sha512_4way_update
 void sha512_4way_close( sha512_4way_context *sc, void *dst );

-// SHA-256 11 way hybrid
-// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// SHA-512 8 way
+
 typedef struct {
-   __m256i  bufx[64>>2];
-   __m256i  valx[8];
-   __m64    bufy[64>>2];
-   __m64    valy[8];
-   uint32_t bufz[64>>2];
-   uint32_t valz[8];
-   uint32_t count_high, count_low;
-} sha256_11way_context;
+   __m512i buf[128>>3];   // one 128 byte block, a row per 64 bit word
+   __m512i val[8];        // chaining state, a row per 64 bit word
+   uint64_t count;        // presumably bytes absorbed per lane - TODO confirm
+   bool initialized;
+} sha512_8way_context __attribute__ ((aligned (128)));

-void sha256_11way_init( sha256_11way_context *ctx );
-void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
-	                 const void *datay, const void *dataz, size_t len );
-void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
-	                 void *dstz  );
+void sha512_8way_init( sha512_8way_context *sc);
+void sha512_8way_update( sha512_8way_context *sc, const void *data, 
+                         size_t len );
+void sha512_8way_close( sha512_8way_context *sc, void *dst );

+
+#endif  // AVX512
 #endif  // __AVX2__
 #endif  // __SSE2__
 #endif  // SHA256_4WAY_H__
--- a/algo/sha/sha512-hash-4way.c
+++ b/algo/sha/sha512-hash-4way.c
@@ -36,8 +36,6 @@
 #include <string.h>
 #include "sha-hash-4way.h"

-// SHA-512 4 way 64 bit
-
 /*
 static const sph_u64 H512[8] = {
        SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
@@ -90,6 +88,236 @@ static const sph_u64 K512[80] = {
 	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
 };

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// SHA-512 8 way 64 bit. Macro arguments may be expanded more than once.
+// CH8W: bitwise choose, ((Y ^ Z) & X) ^ Z: Y where X is set, Z elsewhere.
+#define CH8W(X, Y, Z) \
+   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) 
+// MAJ8W: bitwise majority of X, Y, Z, written (X & Y) | ((X | Y) & Z).
+#define MAJ8W(X, Y, Z) \
+   _mm512_or_si512( _mm512_and_si512( X, Y ), \
+                    _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
+// BSG8W_5_0: XOR of three mm512_ror_64 results of x, counts 28, 34, 39.
+#define BSG8W_5_0(x) \
+   _mm512_xor_si512( _mm512_xor_si512( \
+        mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
+// BSG8W_5_1: XOR of three mm512_ror_64 results of x, counts 14, 18, 41.
+#define BSG8W_5_1(x) \
+   _mm512_xor_si512( _mm512_xor_si512( \
+        mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
+// SSG8W_5_0: mm512_ror_64 by 1 and by 8, XORed with x >> 7 (64-bit lanes).
+#define SSG8W_5_0(x) \
+   _mm512_xor_si512( _mm512_xor_si512( \
+        mm512_ror_64(x,  1), mm512_ror_64(x,  8) ), _mm512_srli_epi64(x, 7) ) 
+// SSG8W_5_1: mm512_ror_64 by 19 and by 61, XORed with x >> 6 (64-bit lanes).
+#define SSG8W_5_1(x) \
+   _mm512_xor_si512( _mm512_xor_si512( \
+        mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )
+
+// Returns, per 64-bit lane, s0(w0) + s1(w1), where s0 is the SSG8W_5_0 mix
+// (ror 1 ^ ror 8 ^ shr 7) and s1 the SSG8W_5_1 mix (ror 19 ^ ror 61 ^
+// shr 6). sha512_8way_round uses it to extend the message schedule.
+static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
+{
+   const __m512i rot0 = _mm512_xor_si512( mm512_ror_64( w0,  1 ),
+                                          mm512_ror_64( w0,  8 ) );
+   const __m512i rot1 = _mm512_xor_si512( mm512_ror_64( w1, 19 ),
+                                          mm512_ror_64( w1, 61 ) );
+   const __m512i s0 = _mm512_xor_si512( rot0, _mm512_srli_epi64( w0, 7 ) );
+   const __m512i s1 = _mm512_xor_si512( rot1, _mm512_srli_epi64( w1, 6 ) );
+
+   // Lane-wise 64-bit sum of the two mixed words.
+   return _mm512_add_epi64( s0, s1 );
+}
+
+// w0,w1 = (ror 1 ^ ror 8 ^ shr 7) of W[i-15], W[i-14]; needs W in scope.
+#define SSG8W_512x2_0( w0, w1, i ) do \
+{ \
+   __m512i X0a, X1a, X0b, X1b; \
+  X0a = mm512_ror_64( W[i-15], 1 ); \
+  X1a = mm512_ror_64( W[i-14], 1 ); \
+  X0b = mm512_ror_64( W[i-15], 8 ); \
+  X1b = mm512_ror_64( W[i-14], 8 ); \
+  X0a = _mm512_xor_si512( X0a, X0b ); \
+  X1a = _mm512_xor_si512( X1a, X1b ); \
+  X0b = _mm512_srli_epi64( W[i-15], 7 ); \
+  X1b = _mm512_srli_epi64( W[i-14], 7 ); \
+  w0  = _mm512_xor_si512( X0a, X0b ); \
+  w1  = _mm512_xor_si512( X1a, X1b ); \
+} while(0)
+// w0,w1 = (ror 19 ^ ror 61 ^ shr 6) of W[i-2], W[i-1]; needs W in scope.
+#define SSG8W_512x2_1( w0, w1, i ) do \
+{ \
+   __m512i X0a, X1a, X0b, X1b; \
+  X0a = mm512_ror_64( W[i-2],19 ); \
+  X1a = mm512_ror_64( W[i-1],19 ); \
+  X0b = mm512_ror_64( W[i-2],61 ); \
+  X1b = mm512_ror_64( W[i-1],61 ); \
+  X0a = _mm512_xor_si512( X0a, X0b ); \
+  X1a = _mm512_xor_si512( X1a, X1b ); \
+  X0b = _mm512_srli_epi64( W[i-2], 6 ); \
+  X1b = _mm512_srli_epi64( W[i-1], 6 ); \
+  w0  = _mm512_xor_si512( X0a, X0b ); \
+  w1  = _mm512_xor_si512( X1a, X1b ); \
+} while(0)
+// One step of sha512_8way_round: updates D and H; reads K512[i] and W[i].
+#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
+do { \
+  __m512i T1, T2; \
+  __m512i K = _mm512_set1_epi64( K512[ i ] ); \
+  T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
+                                           K, W[i] ) ); \
+  T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
+  D  = _mm512_add_epi64( D, T1 ); \
+  H  = _mm512_add_epi64( T1, T2 ); \
+} while (0)
+
+// Compress one block per lane into the running hash state r[0..7].
+// `in` supplies 16 vectors of message words; they go through
+// mm512_block_bswap_64 (defined elsewhere; presumably a byte swap of each
+// 64-bit word, 8 vectors per call) to become W[0..15].
+// On the first call for a context (ctx->initialized false) r[] is seeded
+// with the eight initial constants below and the flag is set, so the first
+// and later blocks share one load / feed-forward path.
+static void
+sha512_8way_round( sha512_8way_context *ctx,  __m512i *in, __m512i r[8] )
+{
+   __m512i A, B, C, D, E, F, G, H;
+   __m512i W[80];
+   int t;
+
+   // Message schedule: 16 input words followed by 64 derived words.
+   mm512_block_bswap_64( W,     in     );
+   mm512_block_bswap_64( W + 8, in + 8 );
+   for ( t = 16; t < 80; t++ )
+   {
+      const __m512i mixed  = ssg8w_512_add( W[t-15], W[t-2] );
+      const __m512i direct = _mm512_add_epi64( W[t-7], W[t-16] );
+      W[t] = _mm512_add_epi64( mixed, direct );
+   }
+
+   // Seed the state once. This happens only after `in` has been read, so
+   // the order of reads and writes matches the two-branch formulation.
+   if ( !ctx->initialized )
+   {
+      r[0] = m512_const1_64( 0x6A09E667F3BCC908 );
+      r[1] = m512_const1_64( 0xBB67AE8584CAA73B );
+      r[2] = m512_const1_64( 0x3C6EF372FE94F82B );
+      r[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
+      r[4] = m512_const1_64( 0x510E527FADE682D1 );
+      r[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
+      r[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
+      r[7] = m512_const1_64( 0x5BE0CD19137E2179 );
+      ctx->initialized = true;
+   }
+
+   // Load the working variables from the state.
+   A = r[0];
+   B = r[1];
+   C = r[2];
+   D = r[3];
+   E = r[4];
+   F = r[5];
+   G = r[6];
+   H = r[7];
+
+   // 80 steps, unrolled by 8; rotating the argument list renames the
+   // working variables instead of moving data between them.
+   for ( t = 0; t < 80; t += 8 )
+   {
+      SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, t + 0 );
+      SHA3_8WAY_STEP( H, A, B, C, D, E, F, G, t + 1 );
+      SHA3_8WAY_STEP( G, H, A, B, C, D, E, F, t + 2 );
+      SHA3_8WAY_STEP( F, G, H, A, B, C, D, E, t + 3 );
+      SHA3_8WAY_STEP( E, F, G, H, A, B, C, D, t + 4 );
+      SHA3_8WAY_STEP( D, E, F, G, H, A, B, C, t + 5 );
+      SHA3_8WAY_STEP( C, D, E, F, G, H, A, B, t + 6 );
+      SHA3_8WAY_STEP( B, C, D, E, F, G, H, A, t + 7 );
+   }
+
+   // Feed-forward: add the working variables back into the state.
+   r[0] = _mm512_add_epi64( r[0], A );
+   r[1] = _mm512_add_epi64( r[1], B );
+   r[2] = _mm512_add_epi64( r[2], C );
+   r[3] = _mm512_add_epi64( r[3], D );
+   r[4] = _mm512_add_epi64( r[4], E );
+   r[5] = _mm512_add_epi64( r[5], F );
+   r[6] = _mm512_add_epi64( r[6], G );
+   r[7] = _mm512_add_epi64( r[7], H );
+}
+
+void sha512_8way_init( sha512_8way_context *sc )
+{
+   sc->count = 0;             // no message bytes absorbed yet
+   sc->initialized = false;   // sha512_8way_round seeds the state on first use
+}
+
+// Absorb len bytes (counted per lane; one __m512i per 8 of them, hence >>3)
+// from data, running sha512_8way_round whenever the 128-byte buffer fills.
+void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len )
+{
+   const int buf_size = 128;
+   __m512i *src = (__m512i*)data;
+   size_t fill = (unsigned)sc->count & (buf_size - 1U);
+
+   while ( len > 0 )
+   {
+      // Bytes taken this pass: whatever fits in the buffer, capped at len.
+      const size_t room = buf_size - fill;
+      const size_t take = room > len ? len : room;
+      memcpy_512( sc->buf + (fill>>3), src, take>>3 );
+      src  += (take>>3);
+      fill += take;
+      len  -= take;
+      if ( fill == buf_size )
+      {
+         sha512_8way_round( sc, sc->buf, sc->val );
+         fill = 0;
+      }
+      sc->count += take;
+   }
+}
+
+// Finalize: store the 0x80 marker word, zero-fill, write the bit length into
+// the last two words, compress, then emit the state to dst via block bswap.
+void sha512_8way_close( sha512_8way_context *sc, void *dst )
+{
+    const int buf_size = 128;
+    const int pad = buf_size - 16;    // byte offset of the length field
+    const __m512i shuff_bswap64 = m512_const_64(
+                                    0x38393a3b3c3d3e3f, 0x3031323334353637,
+                                    0x28292a2b2c2d2e2f, 0x2021222324252627,
+                                    0x18191a1b1c1d1e1f, 0x1011121314151617,
+                                    0x08090a0b0c0d0e0f, 0x0001020304050607 );
+    unsigned ptr = (unsigned)sc->count & (buf_size - 1U);
+
+    sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
+    ptr += 8;
+    if ( ptr <= pad )
+         memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
+    else
+    {    // no room left for the length: flush this block, pad a fresh one
+         memset_zero_512( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
+         sha512_8way_round( sc, sc->buf, sc->val );
+         memset_zero_512( sc->buf, pad >> 3 );
+    }
+
+    sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
+                       _mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
+    sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
+                       _mm512_set1_epi64( sc->count <<  3 ), shuff_bswap64 );
+    sha512_8way_round( sc, sc->buf, sc->val );
+    mm512_block_bswap_64( dst, sc->val );
+}
+
+
+#endif   // AVX512
+
+// SHA-512 4 way 64 bit
+
+
 #define CH(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 

@@ -254,7 +482,7 @@ void sha512_4way_init( sha512_4way_context *sc )
   sc->count = 0;
 }

-void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
+void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   size_t ptr;
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -3,6 +3,12 @@

 #include <stdio.h>

+// This implementation is deprecated; it is superseded by VAES on Ice Lake,
+// which provides hardware-based 4-way AES.
+// It was created for AVX2 to eliminate interleaving between the
+// preceding and following functions.
+// This code can be removed when current users have reverted to one way.
+
 #if defined(__AVX2__)


--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -100,9 +100,20 @@ c512( sph_shavite_big_context *sc, const void *msg )
   p3 = h[3];   

   // round
+
+//  working proof of concept   
+/*
+   __m512i K = m512_const1_128( m[0] );
+   __m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
+   X = _mm512_aesenc_epi128( X, m512_zero );
+   k00 = _mm512_castsi512_si128( K );
+   x = _mm512_castsi512_si128( X );
+*/
+
   k00 = m[0];
   x = _mm_xor_si128( p1, k00 );
   x = _mm_aesenc_si128( x, zero );
+
   k01 = m[1];
   x = _mm_xor_si128( x, k01 );
   x = _mm_aesenc_si128( x, zero );
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -51,6 +51,8 @@ void init_c11_8way_ctx()
 void c11_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));     
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -107,21 +109,18 @@ void c11_8way_hash( void *state, const void *input )
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );

-     // Serial
-     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                   vhash );
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );

-     // 7 Luffa + 8 cube
-     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );

     // 9 Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -51,6 +51,8 @@ void init_x11_8way_ctx()
 void x11_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -108,20 +110,18 @@ void x11_8way_hash( void *state, const void *input )
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );

-     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                   vhash );
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );

-     // Luffa + Cube
-     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );

     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
--- a/algo/x12/x12-4way.c
+++ b/algo/x12/x12-4way.c
@@ -56,6 +56,8 @@ void init_x12_8way_ctx()
 void x12_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -73,20 +75,18 @@ void x12_8way_hash( void *state, const void *input )
     bmw512_8way_update( &ctx.bmw, vhash, 64 );
     bmw512_8way_close( &ctx.bmw, vhash );

-     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                      hash7, vhash );
-     
-     // Luffa + Cube
-     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
+
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );

     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
--- a/algo/x13/x13-4way.c
+++ b/algo/x13/x13-4way.c
@@ -58,6 +58,8 @@ void init_x13_8way_ctx()
 void x13_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -113,17 +115,18 @@ void x13_8way_hash( void *state, const void *input )
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );

-     // Luffa + Cube
-     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
+
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );

     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -62,6 +62,8 @@ void init_x14_8way_ctx()
 void x14_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -115,20 +117,18 @@ void x14_8way_hash( void *state, const void *input )
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );

-     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                   vhash );
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );

-     // Luffa + Cube
-     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+     
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );

     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -65,6 +65,9 @@ void init_x15_8way_ctx()

 void x15_8way_hash( void *state, const void *input )
 {
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
+     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -73,7 +76,6 @@ void x15_8way_hash( void *state, const void *input )
     uint64_t hash5[8] __attribute__ ((aligned (64)));
     uint64_t hash6[8] __attribute__ ((aligned (64)));
     uint64_t hash7[8] __attribute__ ((aligned (64)));
-     uint64_t vhash[8*8] __attribute__ ((aligned (64)));
     x15_8way_ctx_holder ctx;
     memcpy( &ctx, &x15_8way_ctx, sizeof(x15_8way_ctx) );

@@ -119,17 +121,18 @@ void x15_8way_hash( void *state, const void *input )
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );

-     // Luffa + Cube
-     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
+
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );

     // 9 Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -5,9 +5,6 @@
 * Optimized by JayDDee@github Jan 2018
 */
 #include "x16r-gate.h"
-
-#if defined (X16R_4WAY)
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -20,6 +17,7 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -32,6 +30,392 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

+#if defined (X16R_8WAY)
+// Scratch contexts for x16r_8way_hash; visible cases re-init before each use.
+union _x16r_8way_context_overlay
+{
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;   // reused serially, once per lane
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;     // run twice, on 4 lanes each
+    cube_4way_context       cube;      // run twice, on 4 lanes each
+    sph_shavite512_context  shavite;   // reused serially, once per lane
+    simd_4way_context       simd;      // run twice, on 4 lanes each
+    hashState_echo          echo;      // reused serially, once per lane
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;
+    shabal512_8way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_8way_context     sha512;
+} __attribute__ ((aligned (64)));
+// Convenience name for the overlay union above.
+typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
+
+void x16r_8way_hash( void* output, const void* input )
+{
+   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash0[24] __attribute__ ((aligned (64)));
+   uint32_t hash1[24] __attribute__ ((aligned (64)));
+   uint32_t hash2[24] __attribute__ ((aligned (64)));
+   uint32_t hash3[24] __attribute__ ((aligned (64)));
+   uint32_t hash4[24] __attribute__ ((aligned (64)));
+   uint32_t hash5[24] __attribute__ ((aligned (64)));
+   uint32_t hash6[24] __attribute__ ((aligned (64)));
+   uint32_t hash7[24] __attribute__ ((aligned (64)));
+   x16r_8way_context_overlay ctx;
+   void *in0 = (void*) hash0;
+   void *in1 = (void*) hash1;
+   void *in2 = (void*) hash2;
+   void *in3 = (void*) hash3;
+   void *in4 = (void*) hash4;
+   void *in5 = (void*) hash5;
+   void *in6 = (void*) hash6;
+   void *in7 = (void*) hash7;
+   int size = 80;
+
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 input, 640 );
+
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         case BLAKE:
+            blake512_8way_init( &ctx.blake );
+            if ( i == 0 )
+               blake512_8way_update( &ctx.blake, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, 
+                            size<<3 );
+               blake512_8way_update( &ctx.blake, vhash, size );
+            }
+            blake512_8way_close( &ctx.blake, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case BMW:
+            bmw512_8way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_8way_update( &ctx.bmw, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+            bmw512_8way_update( &ctx.bmw, vhash, size );
+            }
+            bmw512_8way_close( &ctx.bmw, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case GROESTL:
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                 (const char*)in0, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                 (const char*)in1, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                 (const char*)in2, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                 (const char*)in3, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash4,
+                                                 (const char*)in4, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash5,
+                                                 (const char*)in5, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash6,
+                                                 (const char*)in6, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash7,
+                                                 (const char*)in7, size<<3 );
+         break;
+         case SKEIN:
+            skein512_8way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_8way_update( &ctx.skein, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               skein512_8way_update( &ctx.skein, vhash, size );
+            }
+            skein512_8way_close( &ctx.skein, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case JH:
+            jh512_8way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_8way_update( &ctx.jh, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, 
+                            size<<3 );
+               jh512_8way_update( &ctx.jh, vhash, size );
+            }
+            jh512_8way_close( &ctx.jh, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case KECCAK:
+            keccak512_8way_init( &ctx.keccak );
+            if ( i == 0 )
+               keccak512_8way_update( &ctx.keccak, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, 
+                            size<<3 );
+               keccak512_8way_update( &ctx.keccak, vhash, size );
+            }
+            keccak512_8way_close( &ctx.keccak, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case LUFFA:
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case CUBEHASH:
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case SHAVITE:
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in0, size );
+            sph_shavite512_close( &ctx.shavite, hash0 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in1, size );
+            sph_shavite512_close( &ctx.shavite, hash1 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in2, size );
+            sph_shavite512_close( &ctx.shavite, hash2 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in3, size );
+            sph_shavite512_close( &ctx.shavite, hash3 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in4, size );
+            sph_shavite512_close( &ctx.shavite, hash4 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in5, size );
+            sph_shavite512_close( &ctx.shavite, hash5 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in6, size );
+            sph_shavite512_close( &ctx.shavite, hash6 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in7, size );
+            sph_shavite512_close( &ctx.shavite, hash7 );
+         break;
+         case SIMD:
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case ECHO:
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
+                                (const BitSequence*)in0, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
+                                (const BitSequence*)in1, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
+                                (const BitSequence*)in2, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
+                                (const BitSequence*)in3, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
+                                (const BitSequence*)in4, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
+                                (const BitSequence*)in5, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
+                                (const BitSequence*)in6, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
+                                (const BitSequence*)in7, size<<3 );
+         break;
+         case HAMSI:
+             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+
+             hamsi512_8way_init( &ctx.hamsi );
+             hamsi512_8way_update( &ctx.hamsi, vhash, size );
+             hamsi512_8way_close( &ctx.hamsi, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+             break;
+         case FUGUE:
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in0, size );
+             sph_fugue512_close( &ctx.fugue, hash0 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in1, size );
+             sph_fugue512_close( &ctx.fugue, hash1 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in2, size );
+             sph_fugue512_close( &ctx.fugue, hash2 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in3, size );
+             sph_fugue512_close( &ctx.fugue, hash3 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in4, size );
+             sph_fugue512_close( &ctx.fugue, hash4 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in5, size );
+             sph_fugue512_close( &ctx.fugue, hash5 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in6, size );
+             sph_fugue512_close( &ctx.fugue, hash6 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in7, size );
+             sph_fugue512_close( &ctx.fugue, hash7 );
+         break;
+         case SHABAL:
+             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                          size<<3 );
+             shabal512_8way_init( &ctx.shabal );
+             shabal512_8way_update( &ctx.shabal, vhash, size );
+             shabal512_8way_close( &ctx.shabal, vhash );
+             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case WHIRLPOOL:
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in0, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash0 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in1, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash1 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in2, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash2 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in3, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash3 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in4, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash4 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in5, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash5 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in6, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash6 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in7, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash7 );
+         break;
+         case SHA_512:
+             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+             sha512_8way_init( &ctx.sha512 );
+             sha512_8way_update( &ctx.sha512, vhash, size );
+             sha512_8way_close( &ctx.sha512, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+      }
+      size = 64;
+   }
+
+   memcpy( output,     hash0, 32 );
+   memcpy( output+32,  hash1, 32 );
+   memcpy( output+64,  hash2, 32 );
+   memcpy( output+96,  hash3, 32 );
+   memcpy( output+128, hash4, 32 );
+   memcpy( output+160, hash5, 32 );
+   memcpy( output+192, hash6, 32 );
+   memcpy( output+224, hash7, 32 );
+}
+
+int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{  // Hashes nonces in batches of 8 starting at pdata[19]; submits any lane that passes fulltest; always returns 0.
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));   // x16r_8way_hash output; lane i is read at hash+(i<<3)
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));   // 8-lane input buffer filled from pdata below
+   uint32_t bedata1[2] __attribute__((aligned(64)));   // byte-swapped pdata[1..2], input to the hash-order string
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];   // NOTE(review): read before the opt_benchmark override of ptarget[7] below
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;   // loop bound; NOTE(review): unsigned, wraps if max_nonce < 8
+   uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; 512-bit vector at index 9 of vdata, rewritten every batch
+   int thr_id = mythr->id;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);   // polled in the loop condition to abort the scan
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;   // benchmark mode: fixed value for target word 7 (used by fulltest, not by Htarg)
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );   // presumably byte-swaps the 80-byte header and interleaves it 8x64 - TODO confirm
+
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )   // rebuild hashOrder only when ntime differs from the cached s_ntime
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )   // log from thread 0 only
+              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(   // nonces n..n+7 go in the odd 32-bit elements, zeros in the even ones
+           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );   // presumably the blend keeps the even elements of *noncev - TODO confirm
+
+      x16r_8way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )   // cheap filter on word 7 of lane i before the full comparison
+      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )   // nothing is submitted in benchmark mode
+      {
+         pdata[19] = n+i;   // nonce of the winning lane
+         submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+
+   *hashes_done = n - first_nonce;   // number of nonces hashed in this call
+   return 0;
+}
+
+
+#elif defined (X16R_4WAY)
+
 union _x16r_4way_context_overlay
 {
    blake512_4way_context   blake;
@@ -50,16 +434,16 @@ union _x16r_4way_context_overlay
    shabal512_4way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
-};
+} __attribute__ ((aligned (64)));
 typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;

 void x16r_4way_hash( void* output, const void* input )
 {
+   uint32_t vhash[24*4] __attribute__ ((aligned (128)));
   uint32_t hash0[24] __attribute__ ((aligned (64)));
   uint32_t hash1[24] __attribute__ ((aligned (64)));
   uint32_t hash2[24] __attribute__ ((aligned (64)));
   uint32_t hash3[24] __attribute__ ((aligned (64)));
-   uint32_t vhash[24*4] __attribute__ ((aligned (64)));
   x16r_4way_context_overlay ctx;
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
@@ -86,7 +470,7 @@ void x16r_4way_hash( void* output, const void* input )
               blake512_4way( &ctx.blake, vhash, size );
            }
            blake512_4way_close( &ctx.blake, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case BMW:
            bmw512_4way_init( &ctx.bmw );
@@ -98,7 +482,7 @@ void x16r_4way_hash( void* output, const void* input )
               bmw512_4way( &ctx.bmw, vhash, size );
            }
            bmw512_4way_close( &ctx.bmw, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case GROESTL:
               init_groestl( &ctx.groestl, 64 );
@@ -124,7 +508,7 @@ void x16r_4way_hash( void* output, const void* input )
               skein512_4way( &ctx.skein, vhash, size );
            }
            skein512_4way_close( &ctx.skein, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case JH:
            jh512_4way_init( &ctx.jh );
@@ -136,7 +520,7 @@ void x16r_4way_hash( void* output, const void* input )
               jh512_4way( &ctx.jh, vhash, size );
            }
            jh512_4way_close( &ctx.jh, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case KECCAK:
            keccak512_4way_init( &ctx.keccak );
@@ -148,17 +532,17 @@ void x16r_4way_hash( void* output, const void* input )
               keccak512_4way( &ctx.keccak, vhash, size );
            }
            keccak512_4way_close( &ctx.keccak, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case LUFFA:
            intrlv_2x128( vhash, in0, in1, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
-            dintrlv_2x128( hash0, hash1, vhash, 512 );
+            dintrlv_2x128_512( hash0, hash1, vhash );
            intrlv_2x128( vhash, in2, in3, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
-            dintrlv_2x128( hash2, hash3, vhash, 512 );
+            dintrlv_2x128_512( hash2, hash3, vhash );
         break;
         case CUBEHASH:
            cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -192,11 +576,11 @@ void x16r_4way_hash( void* output, const void* input )
            intrlv_2x128( vhash, in0, in1, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128( hash0, hash1, vhash, 512 );
+            dintrlv_2x128_512( hash0, hash1, vhash );
            intrlv_2x128( vhash, in2, in3, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128( hash2, hash3, vhash, 512 );
+            dintrlv_2x128_512( hash2, hash3, vhash );
         break;
         case ECHO:
             init_echo( &ctx.echo, 512 );
@@ -217,7 +601,7 @@ void x16r_4way_hash( void* output, const void* input )
             hamsi512_4way_init( &ctx.hamsi );
             hamsi512_4way( &ctx.hamsi, vhash, size );
             hamsi512_4way_close( &ctx.hamsi, vhash );
-             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
@@ -238,7 +622,7 @@ void x16r_4way_hash( void* output, const void* input )
             shabal512_4way_init( &ctx.shabal );
             shabal512_4way( &ctx.shabal, vhash, size );
             shabal512_4way_close( &ctx.shabal, vhash );
-             dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+             dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
@@ -259,7 +643,7 @@ void x16r_4way_hash( void* output, const void* input )
             sha512_4way_init( &ctx.sha512 );
             sha512_4way( &ctx.sha512, vhash, size );
             sha512_4way_close( &ctx.sha512, vhash );
-             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
      }
      size = 64;
@@ -280,6 +664,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
   uint32_t n = first_nonce;
    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   int thr_id = mythr->id;
@@ -317,9 +702,9 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;
-   } while ( likely( ( n < max_nonce ) && !(*restart) ) );
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );

-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -34,14 +34,17 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )

 bool register_x16r_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
+  gate->scanhash  = (void*)&scanhash_x16r_8way;
+  gate->hash      = (void*)&x16r_8way_hash;
+#elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -49,14 +52,17 @@ bool register_x16r_algo( algo_gate_t* gate )

 bool register_x16rv2_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16RV2_8WAY)
+  gate->scanhash  = (void*)&scanhash_x16rv2_8way;
+  gate->hash      = (void*)&x16rv2_8way_hash;
+#elif defined (X16RV2_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_4way;
  gate->hash      = (void*)&x16rv2_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rv2;
  gate->hash      = (void*)&x16rv2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -64,14 +70,17 @@ bool register_x16rv2_algo( algo_gate_t* gate )

 bool register_x16s_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
+  gate->scanhash  = (void*)&scanhash_x16r_8way;
+  gate->hash      = (void*)&x16r_8way_hash;
+#elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -196,28 +205,34 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )

 bool register_x16rt_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16RT_8WAY)   // 8-way path; x16r-gate.h defines X16RT_8WAY when AVX512F, VL, DQ and BW are all defined
+  gate->scanhash  = (void*)&scanhash_x16rt_8way;
+  gate->hash      = (void*)&x16rt_8way_hash;
+#elif defined (X16RT_4WAY)   // 4-way path; defined for AVX2 + AES builds when the 8-way macro is not
   gate->scanhash  = (void*)&scanhash_x16rt_4way;
   gate->hash      = (void*)&x16rt_4way_hash;
 #else
   gate->scanhash  = (void*)&scanhash_x16rt;
   gate->hash      = (void*)&x16rt_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;   // AVX512_OPT added together with the 8-way path
   opt_target_factor = 256.0;
   return true;
 };

 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16RT_8WAY)
+  gate->scanhash  = (void*)&scanhash_x16rt_8way;
+  gate->hash      = (void*)&x16rt_8way_hash;
+#elif defined (X16RT_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16rt_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16rt_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  gate->build_extraheader = (void*)&veil_build_extraheader;
  opt_target_factor = 256.0;
  return true;
@@ -231,7 +246,7 @@ bool register_hex_algo( algo_gate_t* gate )
 {
  gate->scanhash        = (void*)&scanhash_hex;
  gate->hash            = (void*)&hex_hash;
-  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  opt_target_factor = 128.0;
  return true;
@@ -243,16 +258,23 @@ bool register_hex_algo( algo_gate_t* gate )

 bool register_x21s_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X21S_8WAY)
+  gate->scanhash          = (void*)&scanhash_x21s_8way;
+  gate->hash              = (void*)&x21s_8way_hash;
+  gate->miner_thread_init = (void*)&x21s_8way_thread_init;
+  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+#elif defined (X21S_4WAY)
  gate->scanhash          = (void*)&scanhash_x21s_4way;
  gate->hash              = (void*)&x21s_4way_hash;
  gate->miner_thread_init = (void*)&x21s_4way_thread_init;
+  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #else
  gate->scanhash          = (void*)&scanhash_x21s;
  gate->hash              = (void*)&x21s_hash;
  gate->miner_thread_init = (void*)&x21s_thread_init;
+  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #endif
-  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
+//  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
  x16_r_s_getAlgoString   = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -6,8 +6,28 @@
 #include <stdint.h>
 #include <unistd.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X16R_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X16R_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X16R_4WAY 1
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X16RV2_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X16RV2_4WAY 1
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X16RT_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X16RT_4WAY 1
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X21S_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X21S_4WAY 1
 #endif

 enum x16r_Algo {
@@ -44,42 +64,86 @@ bool register_x16rt_algo( algo_gate_t* gate );
 bool register_hex__algo( algo_gate_t* gate );
 bool register_x21s__algo( algo_gate_t* gate );

-#if defined(X16R_4WAY)
+// x16r, x16s
+#if defined(X16R_8WAY)
+
+void x16r_8way_hash( void *state, const void *input );
+int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+
+#elif defined(X16R_4WAY)

 void x16r_4way_hash( void *state, const void *input );
 int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

+#else
+
+void x16r_hash( void *state, const void *input );
+int scanhash_x16r( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
+// x16Rv2
+#if defined(X16RV2_8WAY)
+
+void x16rv2_8way_hash( void *state, const void *input );
+int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(X16RV2_4WAY)
+
 void x16rv2_4way_hash( void *state, const void *input );
 int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

+#else
+
+void x16rv2_hash( void *state, const void *input );
+int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
+// x16rt, veil
+#if defined(X16RT_8WAY)
+
+void x16rt_8way_hash( void *state, const void *input );
+int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(X16RT_4WAY)
+
 void x16rt_4way_hash( void *state, const void *input );
 int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

+#else
+
+void x16rt_hash( void *state, const void *input );
+int scanhash_x16rt( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
+// x21s
+#if defined(X21S_8WAY)
+
+void x21s_8way_hash( void *state, const void *input );
+int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+bool x21s_8way_thread_init();
+
+#elif defined(X21S_4WAY)
+
 void x21s_4way_hash( void *state, const void *input );
 int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_4way_thread_init();

-#endif
-
-void x16r_hash( void *state, const void *input );
-int scanhash_x16r( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr );
-
-void x16rv2_hash( void *state, const void *input );
-int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr );
-
-void x16rt_hash( void *state, const void *input );
-int scanhash_x16rt( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr );
-
-void hex_hash( void *state, const void *input );
-int scanhash_hex( struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done, struct thr_info *mythr );
+#else

 void x21s_hash( void *state, const void *input );
 int scanhash_x21s( struct work *work, uint32_t max_nonce,
@@ -88,3 +152,9 @@ bool x21s_thread_init();

 #endif

+void hex_hash( void *state, const void *input );
+int scanhash_hex( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -1,7 +1,4 @@
 #include "x16r-gate.h"
-
-#if defined (X16R_4WAY)
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -15,6 +12,7 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -26,6 +24,391 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

+#if defined (X16RT_8WAY)
+
+union _x16rt_8way_context_overlay   // one storage area shared by every hash context; x16rt_8way_hash re-initialises the member it needs each round
+{
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;   // single-lane context, run once per lane
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;   // 4-way: run twice per round, lanes 0-3 then 4-7
+    cube_4way_context       cube;   // 4-way: run twice per round, lanes 0-3 then 4-7
+    sph_shavite512_context  shavite;   // single-lane context, run once per lane
+    simd_4way_context       simd;   // 4-way: run twice per round, lanes 0-3 then 4-7
+    hashState_echo          echo;
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;
+    shabal512_8way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_8way_context     sha512;
+} __attribute__ ((aligned (64)));   // 64-byte alignment requested for the whole union
+
+typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay;
+
+void x16rt_8way_hash( void* output, const void* input )
+{
+   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash0[24] __attribute__ ((aligned (64)));
+   uint32_t hash1[24] __attribute__ ((aligned (64)));
+   uint32_t hash2[24] __attribute__ ((aligned (64)));
+   uint32_t hash3[24] __attribute__ ((aligned (64)));
+   uint32_t hash4[24] __attribute__ ((aligned (64)));
+   uint32_t hash5[24] __attribute__ ((aligned (64)));
+   uint32_t hash6[24] __attribute__ ((aligned (64)));
+   uint32_t hash7[24] __attribute__ ((aligned (64)));
+   x16rt_8way_context_overlay ctx;
+   void *in0 = (void*) hash0;
+   void *in1 = (void*) hash1;
+   void *in2 = (void*) hash2;
+   void *in3 = (void*) hash3;
+   void *in4 = (void*) hash4;
+   void *in5 = (void*) hash5;
+   void *in6 = (void*) hash6;
+   void *in7 = (void*) hash7;
+   int size = 80;
+
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 input, 640 );
+
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         case BLAKE:
+            blake512_8way_init( &ctx.blake );
+            if ( i == 0 )
+               blake512_8way_update( &ctx.blake, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               blake512_8way_update( &ctx.blake, vhash, size );
+            }
+            blake512_8way_close( &ctx.blake, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case BMW:
+            bmw512_8way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_8way_update( &ctx.bmw, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+            bmw512_8way_update( &ctx.bmw, vhash, size );
+            }
+            bmw512_8way_close( &ctx.bmw, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case GROESTL:
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                 (const char*)in0, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                 (const char*)in1, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                 (const char*)in2, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                 (const char*)in3, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash4,
+                                                 (const char*)in4, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash5,
+                                                 (const char*)in5, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash6,
+                                                 (const char*)in6, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash7,
+                                                 (const char*)in7, size<<3 );
+         break;
+         case SKEIN:
+            skein512_8way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_8way_update( &ctx.skein, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               skein512_8way_update( &ctx.skein, vhash, size );
+            }
+            skein512_8way_close( &ctx.skein, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case JH:
+            jh512_8way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_8way_update( &ctx.jh, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               jh512_8way_update( &ctx.jh, vhash, size );
+            }
+            jh512_8way_close( &ctx.jh, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case KECCAK:
+            keccak512_8way_init( &ctx.keccak );
+            if ( i == 0 )
+               keccak512_8way_update( &ctx.keccak, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               keccak512_8way_update( &ctx.keccak, vhash, size );
+            }
+            keccak512_8way_close( &ctx.keccak, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case LUFFA:
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case CUBEHASH:
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case SHAVITE:
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in0, size );
+            sph_shavite512_close( &ctx.shavite, hash0 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in1, size );
+            sph_shavite512_close( &ctx.shavite, hash1 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in2, size );
+            sph_shavite512_close( &ctx.shavite, hash2 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in3, size );
+            sph_shavite512_close( &ctx.shavite, hash3 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in4, size );
+            sph_shavite512_close( &ctx.shavite, hash4 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in5, size );
+            sph_shavite512_close( &ctx.shavite, hash5 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in6, size );
+            sph_shavite512_close( &ctx.shavite, hash6 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in7, size );
+            sph_shavite512_close( &ctx.shavite, hash7 );
+         break;
+         case SIMD:
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case ECHO:
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
+                                (const BitSequence*)in0, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
+                                (const BitSequence*)in1, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
+                                (const BitSequence*)in2, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
+                                (const BitSequence*)in3, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
+                                (const BitSequence*)in4, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
+                                (const BitSequence*)in5, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
+                                (const BitSequence*)in6, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
+                                (const BitSequence*)in7, size<<3 );
+         break;
+         case HAMSI:
+             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+
+             hamsi512_8way_init( &ctx.hamsi );
+             hamsi512_8way_update( &ctx.hamsi, vhash, size );
+             hamsi512_8way_close( &ctx.hamsi, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+             break;
+         case FUGUE:
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in0, size );
+             sph_fugue512_close( &ctx.fugue, hash0 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in1, size );
+             sph_fugue512_close( &ctx.fugue, hash1 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in2, size );
+             sph_fugue512_close( &ctx.fugue, hash2 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in3, size );
+             sph_fugue512_close( &ctx.fugue, hash3 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in4, size );
+             sph_fugue512_close( &ctx.fugue, hash4 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in5, size );
+             sph_fugue512_close( &ctx.fugue, hash5 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in6, size );
+             sph_fugue512_close( &ctx.fugue, hash6 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in7, size );
+             sph_fugue512_close( &ctx.fugue, hash7 );
+         break;
+         case SHABAL:
+             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                          size<<3 );
+             shabal512_8way_init( &ctx.shabal );
+             shabal512_8way_update( &ctx.shabal, vhash, size );
+             shabal512_8way_close( &ctx.shabal, vhash );
+             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case WHIRLPOOL:
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in0, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash0 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in1, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash1 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in2, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash2 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in3, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash3 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in4, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash4 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in5, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash5 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in6, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash6 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in7, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash7 );
+         break;
+         case SHA_512:
+             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+             sha512_8way_init( &ctx.sha512 );
+             sha512_8way_update( &ctx.sha512, vhash, size );
+             sha512_8way_close( &ctx.sha512, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+      }
+      size = 64;
+   }
+
+   memcpy( output,     hash0, 32 );
+   memcpy( output+32,  hash1, 32 );
+   memcpy( output+64,  hash2, 32 );
+   memcpy( output+96,  hash3, 32 );
+   memcpy( output+128, hash4, 32 );
+   memcpy( output+160, hash5, 32 );
+   memcpy( output+192, hash6, 32 );
+   memcpy( output+224, hash7, 32 );
+}
+
+// Scans nonces eight per pass for x16rt, starting at work->data[19] and
+// stopping at max_nonce - 8 or on restart. Sets *hashes_done, returns 0.
+int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) timeHash[8*8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];   // read before the benchmark override
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   uint32_t n = first_nonce;
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   int thr_id = mythr->id;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   if ( opt_benchmark ) ptarget[7] = 0x0cff;
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   // hashOrder is cached per thread and rebuilt only when ntime changes.
+   uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16rt_getTimeHash( ntime, &timeHash );
+      x16rt_getAlgoString( &timeHash[0], hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )   // %08x takes a word: log timeHash[0]
+          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
+                               hashOrder, ntime, timeHash[0] );
+   }
+
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+      x16rt_8way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 8; i++ )   // each lane's digest is 8 words apart
+      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
+      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
+      {
+         pdata[19] = n+i;
+         submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (X16RT_4WAY)
+
 union _x16rt_4way_context_overlay
 {
    blake512_4way_context   blake;
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -5,9 +5,6 @@
 * Optimized by JayDDee@github Jan 2018
 */
 #include "x16r-gate.h"
-
-#if defined (X16R_4WAY)
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -21,6 +18,7 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -33,6 +31,474 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

+#if defined (X16RV2_8WAY)
+
+// Scratch contexts for x16rv2_8way_hash, overlaid in one union: every round
+// initialises and closes its own context before the next round starts.
+typedef union _x16rv2_8way_context_overlay
+{
+    blake512_8way_context   blake;      // driven once per round, 8 lanes
+    bmw512_8way_context     bmw;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    hamsi512_8way_context   hamsi;
+    shabal512_8way_context  shabal;
+    sha512_8way_context     sha512;
+    luffa_4way_context      luffa;      // driven twice per round, 4 lanes
+    cube_4way_context       cube;
+    simd_4way_context       simd;
+    hashState_groestl       groestl;    // driven once per lane
+    sph_shavite512_context  shavite;
+    hashState_echo          echo;
+    sph_fugue512_context    fugue;
+    sph_whirlpool_context   whirlpool;
+    sph_tiger_context       tiger;
+} __attribute__ ((aligned (64))) x16rv2_8way_context_overlay;
+
+// Hash eight 80 byte block headers in parallel with x16rv2. input holds the
+// headers 8x64 interleaved; the first 32 bytes of each lane's final digest
+// are written to output at 32 byte intervals. The round order comes from the
+// thread local hashOrder string (16 hex digits, one algorithm per round).
+void x16rv2_8way_hash( void* output, const void* input )
+{
+   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash0[24] __attribute__ ((aligned (64)));
+   uint32_t hash1[24] __attribute__ ((aligned (64)));
+   uint32_t hash2[24] __attribute__ ((aligned (64)));
+   uint32_t hash3[24] __attribute__ ((aligned (64)));
+   uint32_t hash4[24] __attribute__ ((aligned (64)));
+   uint32_t hash5[24] __attribute__ ((aligned (64)));
+   uint32_t hash6[24] __attribute__ ((aligned (64)));
+   uint32_t hash7[24] __attribute__ ((aligned (64)));
+   x16rv2_8way_context_overlay ctx;
+   void *in0 = (void*) hash0,  *in1 = (void*) hash1;
+   void *in2 = (void*) hash2,  *in3 = (void*) hash3;
+   void *in4 = (void*) hash4,  *in5 = (void*) hash5;
+   void *in6 = (void*) hash6,  *in7 = (void*) hash7;
+   int size = 80;   // round 0 hashes the full 80 byte header
+
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 input, 640 );   // 640 bits = 80 bytes per lane
+
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];   // one hex digit selects the round
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         case BLAKE:
+            blake512_8way_init( &ctx.blake );
+            if ( i == 0 )   // round 0: input is already 8x64 interleaved
+               blake512_8way_update( &ctx.blake, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               blake512_8way_update( &ctx.blake, vhash, size );
+            }
+            blake512_8way_close( &ctx.blake, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case BMW:
+            bmw512_8way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_8way_update( &ctx.bmw, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               bmw512_8way_update( &ctx.bmw, vhash, size );
+            }
+            bmw512_8way_close( &ctx.bmw, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case GROESTL:   // one lane at a time
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                 (const char*)in0, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                 (const char*)in1, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                 (const char*)in2, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                 (const char*)in3, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash4,
+                                                 (const char*)in4, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash5,
+                                                 (const char*)in5, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash6,
+                                                 (const char*)in6, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash7,
+                                                 (const char*)in7, size<<3 );
+         break;
+         case SKEIN:
+            skein512_8way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_8way_update( &ctx.skein, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               skein512_8way_update( &ctx.skein, vhash, size );
+            }
+            skein512_8way_close( &ctx.skein, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case JH:
+            jh512_8way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_8way_update( &ctx.jh, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               jh512_8way_update( &ctx.jh, vhash, size );
+            }
+            jh512_8way_close( &ctx.jh, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case KECCAK:   // Tiger on each lane first, then 8-way Keccak
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in0, size );
+             sph_tiger_close( &ctx.tiger, hash0 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in1, size );
+             sph_tiger_close( &ctx.tiger, hash1 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in2, size );
+             sph_tiger_close( &ctx.tiger, hash2 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in3, size );
+             sph_tiger_close( &ctx.tiger, hash3 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in4, size );
+             sph_tiger_close( &ctx.tiger, hash4 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in5, size );
+             sph_tiger_close( &ctx.tiger, hash5 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in6, size );
+             sph_tiger_close( &ctx.tiger, hash6 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in7, size );
+             sph_tiger_close( &ctx.tiger, hash7 );
+
+             for ( int j = (24/4); j < (64/4); j++ )   // zero bytes 24..63
+                hash0[j] = hash1[j] = hash2[j] = hash3[j] =
+                hash4[j] = hash5[j] = hash6[j] = hash7[j] = 0;
+
+             intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                          hash6, hash7 );
+             keccak512_8way_init( &ctx.keccak );
+             keccak512_8way_update( &ctx.keccak, vhash, 64 );
+             keccak512_8way_close( &ctx.keccak, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case LUFFA:   // Tiger on each lane first, then 4-way Luffa twice
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in0, size );
+             sph_tiger_close( &ctx.tiger, hash0 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in1, size );
+             sph_tiger_close( &ctx.tiger, hash1 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in2, size );
+             sph_tiger_close( &ctx.tiger, hash2 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in3, size );
+             sph_tiger_close( &ctx.tiger, hash3 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in4, size );
+             sph_tiger_close( &ctx.tiger, hash4 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in5, size );
+             sph_tiger_close( &ctx.tiger, hash5 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in6, size );
+             sph_tiger_close( &ctx.tiger, hash6 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in7, size );
+             sph_tiger_close( &ctx.tiger, hash7 );
+
+             for ( int j = (24/4); j < (64/4); j++ )   // zero bytes 24..63
+                hash0[j] = hash1[j] = hash2[j] = hash3[j] =
+                hash4[j] = hash5[j] = hash6[j] = hash7[j] = 0;
+
+            intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3);
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7);
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case CUBEHASH:   // hash size bytes: 80 when this is round 0
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case SHAVITE:   // one lane at a time
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in0, size );
+            sph_shavite512_close( &ctx.shavite, hash0 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in1, size );
+            sph_shavite512_close( &ctx.shavite, hash1 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in2, size );
+            sph_shavite512_close( &ctx.shavite, hash2 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in3, size );
+            sph_shavite512_close( &ctx.shavite, hash3 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in4, size );
+            sph_shavite512_close( &ctx.shavite, hash4 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in5, size );
+            sph_shavite512_close( &ctx.shavite, hash5 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in6, size );
+            sph_shavite512_close( &ctx.shavite, hash6 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in7, size );
+            sph_shavite512_close( &ctx.shavite, hash7 );
+         break;
+         case SIMD:
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case ECHO:   // one lane at a time
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
+                                (const BitSequence*)in0, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
+                                (const BitSequence*)in1, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
+                                (const BitSequence*)in2, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
+                                (const BitSequence*)in3, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
+                                (const BitSequence*)in4, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
+                                (const BitSequence*)in5, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
+                                (const BitSequence*)in6, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
+                                (const BitSequence*)in7, size<<3 );
+         break;
+         case HAMSI:
+             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+
+             hamsi512_8way_init( &ctx.hamsi );
+             hamsi512_8way_update( &ctx.hamsi, vhash, size );
+             hamsi512_8way_close( &ctx.hamsi, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+             break;
+         case FUGUE:   // one lane at a time
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in0, size );
+             sph_fugue512_close( &ctx.fugue, hash0 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in1, size );
+             sph_fugue512_close( &ctx.fugue, hash1 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in2, size );
+             sph_fugue512_close( &ctx.fugue, hash2 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in3, size );
+             sph_fugue512_close( &ctx.fugue, hash3 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in4, size );
+             sph_fugue512_close( &ctx.fugue, hash4 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in5, size );
+             sph_fugue512_close( &ctx.fugue, hash5 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in6, size );
+             sph_fugue512_close( &ctx.fugue, hash6 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in7, size );
+             sph_fugue512_close( &ctx.fugue, hash7 );
+         break;
+         case SHABAL:   // the only round that uses 8x32 interleaving
+             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                          size<<3 );
+             shabal512_8way_init( &ctx.shabal );
+             shabal512_8way_update( &ctx.shabal, vhash, size );
+             shabal512_8way_close( &ctx.shabal, vhash );
+             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case WHIRLPOOL:   // one lane at a time
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in0, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash0 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in1, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash1 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in2, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash2 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in3, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash3 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in4, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash4 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in5, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash5 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in6, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash6 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in7, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash7 );
+         break;
+         case SHA_512:   // Tiger on each lane first, then 8-way SHA-512
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in0, size );
+             sph_tiger_close( &ctx.tiger, hash0 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in1, size );
+             sph_tiger_close( &ctx.tiger, hash1 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in2, size );
+             sph_tiger_close( &ctx.tiger, hash2 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in3, size );
+             sph_tiger_close( &ctx.tiger, hash3 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in4, size );
+             sph_tiger_close( &ctx.tiger, hash4 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in5, size );
+             sph_tiger_close( &ctx.tiger, hash5 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in6, size );
+             sph_tiger_close( &ctx.tiger, hash6 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in7, size );
+             sph_tiger_close( &ctx.tiger, hash7 );
+
+             for ( int j = (24/4); j < (64/4); j++ )   // zero bytes 24..63
+                hash0[j] = hash1[j] = hash2[j] = hash3[j] =
+                hash4[j] = hash5[j] = hash6[j] = hash7[j] = 0;
+
+             intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                          hash6, hash7 );
+             sha512_8way_init( &ctx.sha512 );
+             sha512_8way_update( &ctx.sha512, vhash, 64 );
+             sha512_8way_close( &ctx.sha512, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+      }
+      size = 64;   // later rounds hash the previous 64 byte result
+   }
+
+   memcpy( output,     hash0, 32 );   // 32 bytes of each lane are returned
+   memcpy( output+32,  hash1, 32 );
+   memcpy( output+64,  hash2, 32 );
+   memcpy( output+96,  hash3, 32 );
+   memcpy( output+128, hash4, 32 );
+   memcpy( output+160, hash5, 32 );
+   memcpy( output+192, hash6, 32 );
+   memcpy( output+224, hash7, 32 );
+}
+
+// Scans nonces eight per pass for x16rv2, starting at work->data[19] and
+// stopping at max_nonce - 8 or on restart. Sets *hashes_done, returns 0.
+int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t lanes[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t order_seed[2] __attribute__((aligned(64)));
+   uint32_t *const data = work->data;
+   uint32_t *const target = work->target;
+   const uint32_t target_hi = target[7];   // read before benchmark override
+   const uint32_t start = data[19], stop = max_nonce - 8;
+   const int tid = mythr->id;
+   volatile uint8_t *const stop_flag = &(work_restart[tid].restart);
+   __m512i *const nonce_slot = (__m512i*)vdata + 9;   // aligned
+   uint32_t nonce = start;
+
+   if ( opt_benchmark ) target[7] = 0x0cff;
+
+   mm512_bswap32_intrlv80_8x64( vdata, data );
+   // hashOrder is cached per thread and rebuilt only when ntime changes.
+   const uint32_t ntime = bswap_32( data[17] );
+   if ( ntime != s_ntime )
+   {
+      order_seed[0] = bswap_32( data[1] );
+      order_seed[1] = bswap_32( data[2] );
+      x16_r_s_getAlgoString( (const uint8_t*)order_seed, hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !tid )
+         applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   do
+   {
+      const __m512i fresh = mm512_bswap_32(
+           _mm512_set_epi32( nonce+7, 0, nonce+6, 0, nonce+5, 0, nonce+4, 0,
+                             nonce+3, 0, nonce+2, 0, nonce+1, 0, nonce,   0 ) );
+      *nonce_slot = mm512_intrlv_blend_32( fresh, *nonce_slot );
+      x16rv2_8way_hash( lanes, vdata );
+      data[19] = nonce;
+      for ( int lane = 0; lane < 8; lane++ )
+      {
+         uint32_t *const digest = lanes + ( lane << 3 );   // 8 words per lane
+         if ( likely( digest[7] > target_hi ) ) continue;
+         if ( !fulltest( digest, target ) || opt_benchmark ) continue;
+         data[19] = nonce + lane;
+         submit_lane_solution( work, digest, mythr, lane );
+      }
+      nonce += 8;
+   } while ( likely( ( nonce < stop ) && !(*stop_flag) ) );
+
+   *hashes_done = nonce - start;
+   return 0;
+}
+
+#elif defined (X16RV2_4WAY)
+
 union _x16rv2_4way_context_overlay
 {
    blake512_4way_context   blake;
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -1,13 +1,10 @@
 /**
- * x16r algo implementation
+ * x21s algo implementation
 *
 * Implementation by tpruvot@github Jan 2018
 * Optimized by JayDDee@github Jan 2018
 */
 #include "x16r-gate.h"
-
-#if defined (X16R_4WAY)
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -21,6 +18,7 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -38,6 +36,483 @@

 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
+
+#if defined (X21S_8WAY)
+
+static __thread uint64_t* x21s_8way_matrix;   // per-thread buffer handed to LYRA2REV2_2WAY; allocated by x21s_8way_thread_init()
+
+// Working contexts for every stage of x21s_8way_hash. The stages run one
+// after another and each one re-initialises its context before use, so the
+// contexts can share storage in a union.
+typedef union _x21s_8way_context_overlay
+{
+    // Stages run on all 8 lanes at once.
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    hamsi512_8way_context   hamsi;
+    shabal512_8way_context  shabal;
+    sha512_8way_context     sha512;
+    haval256_5_8way_context haval;
+    sha256_8way_context     sha256;
+
+    // Stages run 4 lanes at a time (two passes per round).
+    luffa_4way_context      luffa;
+    cube_4way_context       cube;
+    simd_4way_context       simd;
+
+    // Stages run one lane at a time.
+    hashState_groestl       groestl;
+    sph_shavite512_context  shavite;
+    hashState_echo          echo;
+    sph_fugue512_context    fugue;
+    sph_whirlpool_context   whirlpool;
+    sph_tiger_context       tiger;
+    sph_gost512_context     gost;
+} __attribute__ ((aligned (64))) x21s_8way_context_overlay;
+
+/**
+ * x21s hash of eight block headers in parallel.
+ *
+ * Rounds 0..15 are selected by the file-scope hashOrder string, one hex
+ * character per round. Round 0 consumes the 80 byte header, every later
+ * round consumes the 64 byte result of the round before it. The 16 rounds
+ * are followed by a fixed tail: haval256-5, tiger, lyra2rev2, gost512 and
+ * sha256.
+ *
+ * input   eight 80 byte headers, 8x64 interleaved.
+ * output  receives the result of sha256_8way_close; the scanhash function
+ *         in this file reads it back as 8x32 interleaved lanes.
+ *
+ * x21s_8way_matrix must have been allocated (x21s_8way_thread_init) before
+ * the first call on a thread.
+ */
+void x21s_8way_hash( void* output, const void* input )
+{
+   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash0[24] __attribute__ ((aligned (64)));
+   uint32_t hash1[24] __attribute__ ((aligned (64)));
+   uint32_t hash2[24] __attribute__ ((aligned (64)));
+   uint32_t hash3[24] __attribute__ ((aligned (64)));
+   uint32_t hash4[24] __attribute__ ((aligned (64)));
+   uint32_t hash5[24] __attribute__ ((aligned (64)));
+   uint32_t hash6[24] __attribute__ ((aligned (64)));
+   uint32_t hash7[24] __attribute__ ((aligned (64)));
+   // Lane table, lets the per-lane and per-group stages be written as loops.
+   uint32_t* const lane[8] = { hash0, hash1, hash2, hash3,
+                               hash4, hash5, hash6, hash7 };
+   x21s_8way_context_overlay ctx;
+   int size = 80;   // bytes consumed by the current round
+
+   // Per-lane copies of the header, for rounds that cannot take the
+   // interleaved input directly.
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 input, 640 );
+
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         // 8x64 stages: round 0 can hash the interleaved input as is, later
+         // rounds have to re-interleave the per-lane results first.
+         case BLAKE:
+            blake512_8way_init( &ctx.blake );
+            if ( i == 0 )
+               blake512_8way_update( &ctx.blake, input, size );
+            else
+            {
+               intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                            hash6, hash7, size<<3 );
+               blake512_8way_update( &ctx.blake, vhash, size );
+            }
+            blake512_8way_close( &ctx.blake, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                              hash7, vhash );
+         break;
+         case BMW:
+            bmw512_8way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_8way_update( &ctx.bmw, input, size );
+            else
+            {
+               intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                            hash6, hash7, size<<3 );
+               bmw512_8way_update( &ctx.bmw, vhash, size );
+            }
+            bmw512_8way_close( &ctx.bmw, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                              hash7, vhash );
+         break;
+         case GROESTL:
+            // One lane at a time, length given in bits.
+            for ( int l = 0; l < 8; l++ )
+            {
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)lane[l],
+                                         (const char*)lane[l], size<<3 );
+            }
+         break;
+         case SKEIN:
+            skein512_8way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_8way_update( &ctx.skein, input, size );
+            else
+            {
+               intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                            hash6, hash7, size<<3 );
+               skein512_8way_update( &ctx.skein, vhash, size );
+            }
+            skein512_8way_close( &ctx.skein, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                              hash7, vhash );
+         break;
+         case JH:
+            jh512_8way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_8way_update( &ctx.jh, input, size );
+            else
+            {
+               intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                            hash6, hash7, size<<3 );
+               jh512_8way_update( &ctx.jh, vhash, size );
+            }
+            jh512_8way_close( &ctx.jh, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                              hash7, vhash );
+         break;
+         case KECCAK:
+            keccak512_8way_init( &ctx.keccak );
+            if ( i == 0 )
+               keccak512_8way_update( &ctx.keccak, input, size );
+            else
+            {
+               intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                            hash6, hash7, size<<3 );
+               keccak512_8way_update( &ctx.keccak, vhash, size );
+            }
+            keccak512_8way_close( &ctx.keccak, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                              hash7, vhash );
+         break;
+         case LUFFA:
+            // 4x128 stage: lanes 0-3, then lanes 4-7.
+            for ( int l = 0; l < 8; l += 4 )
+            {
+               intrlv_4x128( vhash, lane[l], lane[l+1], lane[l+2], lane[l+3],
+                             size<<3 );
+               luffa_4way_init( &ctx.luffa, 512 );
+               luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
+               dintrlv_4x128_512( lane[l], lane[l+1], lane[l+2], lane[l+3],
+                                  vhash );
+            }
+         break;
+         case CUBEHASH:
+            for ( int l = 0; l < 8; l += 4 )
+            {
+               intrlv_4x128( vhash, lane[l], lane[l+1], lane[l+2], lane[l+3],
+                             size<<3 );
+               cube_4way_init( &ctx.cube, 512, 16, 32 );
+               // Length must follow the round: a fixed 64 would drop the
+               // last 16 header bytes whenever cubehash is round 0.
+               cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+               dintrlv_4x128_512( lane[l], lane[l+1], lane[l+2], lane[l+3],
+                                  vhash );
+            }
+         break;
+         case SHAVITE:
+            for ( int l = 0; l < 8; l++ )
+            {
+               sph_shavite512_init( &ctx.shavite );
+               sph_shavite512( &ctx.shavite, lane[l], size );
+               sph_shavite512_close( &ctx.shavite, lane[l] );
+            }
+         break;
+         case SIMD:
+            // Length is passed in bits here, as in the original code.
+            for ( int l = 0; l < 8; l += 4 )
+            {
+               intrlv_4x128( vhash, lane[l], lane[l+1], lane[l+2], lane[l+3],
+                             size<<3 );
+               simd_4way_init( &ctx.simd, 512 );
+               simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+               dintrlv_4x128_512( lane[l], lane[l+1], lane[l+2], lane[l+3],
+                                  vhash );
+            }
+         break;
+         case ECHO:
+            for ( int l = 0; l < 8; l++ )
+            {
+               init_echo( &ctx.echo, 512 );
+               update_final_echo ( &ctx.echo, (BitSequence *)lane[l],
+                                   (const BitSequence*)lane[l], size<<3 );
+            }
+         break;
+         case HAMSI:
+            intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                         hash6, hash7, size<<3 );
+            hamsi512_8way_init( &ctx.hamsi );
+            hamsi512_8way_update( &ctx.hamsi, vhash, size );
+            hamsi512_8way_close( &ctx.hamsi, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                              hash7, vhash );
+         break;
+         case FUGUE:
+            for ( int l = 0; l < 8; l++ )
+            {
+               sph_fugue512_init( &ctx.fugue );
+               sph_fugue512( &ctx.fugue, lane[l], size );
+               sph_fugue512_close( &ctx.fugue, lane[l] );
+            }
+         break;
+         case SHABAL:
+            // The only X16 stage that works on 8x32 interleaved data.
+            intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                         hash6, hash7, size<<3 );
+            shabal512_8way_init( &ctx.shabal );
+            shabal512_8way_update( &ctx.shabal, vhash, size );
+            shabal512_8way_close( &ctx.shabal, vhash );
+            dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                              hash7, vhash );
+         break;
+         case WHIRLPOOL:
+            for ( int l = 0; l < 8; l++ )
+            {
+               sph_whirlpool_init( &ctx.whirlpool );
+               sph_whirlpool( &ctx.whirlpool, lane[l], size );
+               sph_whirlpool_close( &ctx.whirlpool, lane[l] );
+            }
+         break;
+         case SHA_512:
+            intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
+                         hash6, hash7, size<<3 );
+            sha512_8way_init( &ctx.sha512 );
+            sha512_8way_update( &ctx.sha512, vhash, size );
+            sha512_8way_close( &ctx.sha512, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                              hash7, vhash );
+         break;
+         default:
+         break;
+      }
+      size = 64;   // every round after the first hashes a 64 byte digest
+   }
+
+   // Fixed tail, stage 17: haval256-5 on all 8 lanes (8x32 interleaved).
+   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                    hash7 );
+
+   haval256_5_8way_init( &ctx.haval );
+   haval256_5_8way_update( &ctx.haval, vhash, 64 );
+   haval256_5_8way_close( &ctx.haval, vhash );
+
+   dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                     hash7, vhash );
+
+   // Stage 18: tiger, one lane at a time, in place.
+   for ( int l = 0; l < 8; l++ )
+   {
+      sph_tiger_init( &ctx.tiger );
+      sph_tiger ( &ctx.tiger, (const void*) lane[l], 64 );
+      sph_tiger_close( &ctx.tiger, (void*) lane[l] );
+   }
+
+   // Stage 19: lyra2rev2, two lanes per call, sharing the per-thread matrix.
+   for ( int l = 0; l < 8; l += 2 )
+   {
+      intrlv_2x256( vhash, lane[l], lane[l+1], 256 );
+      LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
+      dintrlv_2x256( lane[l], lane[l+1], vhash, 256 );
+   }
+
+   // Stage 20: gost512, one lane at a time, in place.
+   for ( int l = 0; l < 8; l++ )
+   {
+      sph_gost512_init( &ctx.gost );
+      sph_gost512 ( &ctx.gost, (const void*) lane[l], 64 );
+      sph_gost512_close( &ctx.gost, (void*) lane[l] );
+   }
+
+   // Stage 21: sha256 on all 8 lanes, written straight to the caller.
+   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                    hash7 );
+   sha256_8way_init( &ctx.sha256 );
+   sha256_8way_update( &ctx.sha256, vhash, 64 );
+   sha256_8way_close( &ctx.sha256, output );
+}
+
+/**
+ * Scan a nonce range for x21s, eight nonces per iteration.
+ *
+ * work         header words are read from work->data, the target from
+ *              work->target; work->data[19] (the nonce) is updated while
+ *              scanning.
+ * max_nonce    upper limit of the range; scanning stops 16 short of it.
+ * hashes_done  receives the number of nonces tried.
+ * mythr        supplies the thread id used for the restart flag and logging.
+ *
+ * Candidates are reported through submit_lane_solution; the return value is
+ * always 0.
+ */
+int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2] __attribute__((aligned(64)));
+   // Row 7 of the 8x32 interleaved result: word 7 of every lane's hash.
+   uint32_t *hash7 = &hash[7<<3];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 16;
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   // 64-bit element 9 of the interleaved header; the loop below writes the
+   // nonces into the odd 32-bit slots of this vector.
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   // Read the quick-filter threshold only after the benchmark override so
+   // that it never holds a stale target word.
+   const uint32_t Htarg = ptarget[7];
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+   // The hash order is derived from header words 1 and 2 and cached per
+   // thread; it is recomputed whenever the ntime word changes.
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   do
+   {
+      // Insert nonces n..n+7, byte swapped, one per lane.
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                 n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+      x21s_8way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int lane = 0; lane < 8; lane++ )
+      {
+         // Cheap test on the top hash word first, full compare only on a hit.
+         if ( unlikely( hash7[lane] <= Htarg ) )
+         {
+            extr_lane_8x32( lane_hash, hash, lane, 256 );
+            if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+            {
+               pdata[19] = n + lane;
+               submit_lane_solution( work, lane_hash, mythr, lane );
+            }
+         }
+      }
+      n += 8;
+   } while ( (  n < last_nonce ) && !(*restart) );
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+// Allocates the per-thread buffer used by the LYRA2REV2_2WAY stage of
+// x21s_8way_hash. Returns true when the allocation succeeded.
+bool x21s_8way_thread_init()
+{
+   // One row is 4 columns of BLOCK_LEN_INT64 64-bit words.
+   const int64_t row_words = BLOCK_LEN_INT64 * 4;
+   const int64_t row_bytes = row_words * 8;
+   // The matrix has 4 rows.
+   const int matrix_bytes = (int64_t)row_bytes * 4;
+
+   // Twice the matrix size, 64 byte aligned.
+   x21s_8way_matrix = _mm_malloc( 2 * matrix_bytes, 64 );
+   return x21s_8way_matrix != NULL;
+}
+
+#elif defined (X21S_4WAY)
+
 static __thread uint64_t* x21s_4way_matrix;

 union _x21s_4way_context_overlay
--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
--- a/algo/x17/sonoa-gate.c
+++ b/algo/x17/sonoa-gate.c
@@ -2,8 +2,10 @@

 bool register_sonoa_algo( algo_gate_t* gate )
 {
-#if defined (SONOA_4WAY)
-//  init_sonoa_4way_ctx();
+#if defined (SONOA_8WAY)
+  gate->scanhash  = (void*)&scanhash_sonoa_8way;
+  gate->hash      = (void*)&sonoa_8way_hash;
+#elif defined (SONOA_4WAY)
  gate->scanhash  = (void*)&scanhash_sonoa_4way;
  gate->hash      = (void*)&sonoa_4way_hash;
 #else
@@ -11,7 +13,7 @@ bool register_sonoa_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_sonoa;
  gate->hash      = (void*)&sonoa_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x17/sonoa-gate.h
+++ b/algo/x17/sonoa-gate.h
@@ -4,29 +4,33 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define SONOA_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define SONOA_8WAY 1   // all four AVX512 subsets enabled: select the 8-way sonoa
+#elif defined(__AVX2__) && defined(__AES__)
+  #define SONOA_4WAY 1   // otherwise AVX2 together with AES: select the 4-way sonoa
 #endif

 bool register_sonoa_algo( algo_gate_t* gate );

-#if defined(SONOA_4WAY)
+#if defined(SONOA_8WAY)
+
+void sonoa_8way_hash( void *state, const void *input );
+int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(SONOA_4WAY)

 void sonoa_4way_hash( void *state, const void *input );
-
 int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

-//void init_sonoa_4way_ctx();
-
-#endif
+#else

 void sonoa_hash( void *state, const void *input );
-
 int scanhash_sonoa( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_sonoa_ctx();

 #endif

+#endif
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -1,7 +1,4 @@
 #include "x17-gate.h"
-
-#if defined(X17_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -14,6 +11,7 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
+#include "algo/shavite/sph_shavite.h"
 #include "algo/shavite/shavite-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -24,6 +22,309 @@
 #include "algo/haval/haval-hash-4way.h"
 #include "algo/sha/sha-hash-4way.h"

+#if defined(X17_8WAY)
+
+// Working contexts for the stages of x17_8way_hash. Each stage initialises
+// its own context before use, so they can share storage in a union.
+typedef union _x17_8way_context_overlay
+{
+    // Stages run on all 8 lanes at once.
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    hamsi512_8way_context   hamsi;
+    shabal512_8way_context  shabal;
+    sha512_8way_context     sha512;
+    haval256_5_8way_context haval;
+
+    // Stages run 4 lanes at a time.
+    luffa_4way_context      luffa;
+    cube_4way_context       cube;
+    simd_4way_context       simd;
+
+    // Stages run one lane at a time.
+    hashState_groestl       groestl;
+    sph_shavite512_context  shavite;
+    hashState_echo          echo;
+    sph_fugue512_context    fugue;
+    sph_whirlpool_context   whirlpool;
+} __attribute__ ((aligned (64))) x17_8way_context_overlay;
+
+void x17_8way_hash( void *state, const void *input )
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t vhash0[8*8] __attribute__ ((aligned (64)));
+     uint64_t vhash1[8*8] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     x17_8way_context_overlay ctx;
+
+     // 1 Blake parallel 8 way 64 bit
+     blake512_8way_init( &ctx.blake );
+     blake512_8way_update( &ctx.blake, input, 80 );
+     blake512_8way_close( &ctx.blake, vhash );
+
+     // 2 Bmw
+     bmw512_8way_init( &ctx.bmw );
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serialize
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     // 3 Groestl
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // Parallelize
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     // 4 Skein parallel 8 way 64 bit
+     skein512_8way_init( &ctx.skein );
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     // 5 JH
+     jh512_8way_init( &ctx.jh );
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     // 6 Keccak
+     keccak512_8way_init( &ctx.keccak );
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
+
+     // 7 Luffa  
+     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
+
+     // 8 Cubehash
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
+
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+
+     // 9 Shavite
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     // 10 Simd
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+
+     // 11 Echo serial
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                            (const BitSequence *) hash0, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                            (const BitSequence *) hash1, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                            (const BitSequence *) hash2, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                            (const BitSequence *) hash3, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                            (const BitSequence *) hash4, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                            (const BitSequence *) hash5, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                            (const BitSequence *) hash6, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                            (const BitSequence *) hash7, 512 );
+
+     // 12 Hamsi parallel 8 way 64 bit
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );
+
+     hamsi512_8way_init( &ctx.hamsi );
+     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
+     hamsi512_8way_close( &ctx.hamsi, vhash );
+
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                       vhash );
+
+     // 13 Fugue serial
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+
+     // 14 Shabal, parallel 8 way 32 bit
+     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );
+
+     shabal512_8way_init( &ctx.shabal );
+     shabal512_8way_update( &ctx.shabal, vhash, 64 );
+     shabal512_8way_close( &ctx.shabal, vhash );
+
+     dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                       vhash );
+
+     // 15 Whirlpool serial
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash0 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash1 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash2 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash3 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash4, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash4 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash5, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash5 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash6, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash6 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash7, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash7 );
+
+     // 16 SHA512 parallel 64 bit 
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7 );
+
+     sha512_8way_init( &ctx.sha512 );
+     sha512_8way_update( &ctx.sha512, vhash, 64 );
+     sha512_8way_close( &ctx.sha512, vhash );
+
+     // 17 Haval parallel 32 bit
+     rintrlv_8x64_8x32( vhash0, vhash,  512 );
+
+     haval256_5_8way_init( &ctx.haval );
+     haval256_5_8way_update( &ctx.haval, vhash0, 64 );
+     haval256_5_8way_close( &ctx.haval, state );
+}
+// Scan nonces 8 per pass with x17_8way_hash; submit lanes that pass fulltest.
+int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<3]);   // 8 words starting at hash[56], one per lane
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+     const uint32_t last_nonce = max_nonce - 8;   // NOTE(review): wraps if max_nonce < 8
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const uint32_t Htarg = ptarget[7];
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );   // fill vdata from pdata once, before the loop
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(   // n..n+7 sit in the odd 32-bit elements
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+      x17_8way_hash( hash, vdata );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if unlikely( ( hash7[ lane ] <= Htarg ) )   // cheap one-word filter before fulltest
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         {
+            pdata[19] = n + lane;   // nonce of the winning lane
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
+
+   *hashes_done = n - first_nonce;   // nonces tried by this call
+   return 0;   // always 0; solutions are reported through submit_lane_solution
+}
+
+#elif defined(X17_4WAY)
+
 union _x17_4way_context_overlay
 {
    blake512_4way_context   blake;
--- a/algo/x17/x17-gate.c
+++ b/algo/x17/x17-gate.c
@@ -2,14 +2,17 @@

 bool register_x17_algo( algo_gate_t* gate )
 {
-#if defined (X17_4WAY)
+#if defined (X17_8WAY)   // 8-way build takes precedence over 4-way and scalar
+  gate->scanhash  = (void*)&scanhash_x17_8way;
+  gate->hash      = (void*)&x17_8way_hash;
+#elif defined (X17_4WAY)
  gate->scanhash  = (void*)&scanhash_x17_4way;
  gate->hash      = (void*)&x17_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x17;
  gate->hash      = (void*)&x17_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;   // set for every build variant
  return true;
 };

--- a/algo/x17/x17-gate.h
+++ b/algo/x17/x17-gate.h
@@ -4,13 +4,20 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X17_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X17_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X17_4WAY 1
 #endif

 bool register_x17_algo( algo_gate_t* gate );

-#if defined(X17_4WAY)
+#if defined(X17_8WAY)
+
+void x17_8way_hash( void *state, const void *input );
+int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+#elif defined(X17_4WAY)

 void x17_4way_hash( void *state, const void *input );
 int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -1,7 +1,4 @@
 #include "xevan-gate.h"
-
-#if defined(XEVAN_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -15,6 +12,7 @@
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/shavite-hash-2way.h"
+#include "algo/shavite/sph_shavite.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -25,6 +23,515 @@
 #include "algo/sha/sha-hash-4way.h"
 #include "algo/haval/haval-hash-4way.h"

+#if defined(XEVAN_8WAY)
+// Hash contexts for xevan_8way_hash, overlaid: each stage re-inits its member before use.
+union _xevan_8way_context_overlay
+{
+   blake512_8way_context   blake;
+   bmw512_8way_context     bmw;
+   hashState_groestl       groestl;   // run once per lane (8 calls per pass)
+   skein512_8way_context   skein;
+   jh512_8way_context      jh;
+   keccak512_8way_context  keccak;
+   luffa_4way_context      luffa;     // run twice per pass: vhashA, then vhashB
+   cube_4way_context       cube;      // run twice per pass: vhashA, then vhashB
+   sph_shavite512_context  shavite;   // run once per lane
+   simd_4way_context       simd;      // run twice per pass: vhashA, then vhashB
+   hashState_echo          echo;      // run once per lane
+   hamsi512_8way_context   hamsi;
+   sph_fugue512_context    fugue;     // run once per lane
+   shabal512_8way_context  shabal;
+   sph_whirlpool_context   whirlpool; // run once per lane
+   sha512_8way_context     sha512;
+   haval256_5_8way_context haval;
+} __attribute__ ((aligned (64)));
+typedef union _xevan_8way_context_overlay xevan_8way_context_overlay;
+// Xevan, 8 lanes: runs the 17-stage chain twice; the final Haval writes to output.
+void xevan_8way_hash( void *output, const void *input )
+{
+     uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
+     uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
+     uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
+     uint64_t hash0[16] __attribute__ ((aligned (64)));
+     uint64_t hash1[16] __attribute__ ((aligned (64)));
+     uint64_t hash2[16] __attribute__ ((aligned (64)));
+     uint64_t hash3[16] __attribute__ ((aligned (64)));
+     uint64_t hash4[16] __attribute__ ((aligned (64)));
+     uint64_t hash5[16] __attribute__ ((aligned (64)));
+     uint64_t hash6[16] __attribute__ ((aligned (64)));
+     uint64_t hash7[16] __attribute__ ((aligned (64)));
+     const int dataLen = 128;   // length given to every stage after the first Blake
+     xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
+
+     blake512_8way_init( &ctx.blake );   // ---- pass 1: Blake over input, length 80
+     blake512_8way_update( &ctx.blake, input, 80 );
+     blake512_8way_close( &ctx.blake, vhash );
+     memset( &vhash[8<<3], 0, 64<<3 );   // zero the upper half of vhash
+
+     bmw512_8way_init( &ctx.bmw );   // BMW
+     bmw512_8way_update( &ctx.bmw, vhash, dataLen );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash, dataLen<<3 );   // split vhash into the eight per-lane buffers
+
+     init_groestl( &ctx.groestl, 64 );   // Groestl, one lane per call
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
+                               dataLen<<3 );
+
+     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7, dataLen<<3 );   // merge the per-lane buffers back into vhash
+
+     skein512_8way_init( &ctx.skein );   // Skein
+     skein512_8way_update( &ctx.skein, vhash, dataLen );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     jh512_8way_init( &ctx.jh );   // JH
+     jh512_8way_update( &ctx.jh, vhash, dataLen );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     keccak512_8way_init( &ctx.keccak );   // Keccak
+     keccak512_8way_update( &ctx.keccak, vhash, dataLen );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );   // regroup as two 4-lane buffers
+
+     luffa_4way_init( &ctx.luffa, 512 );   // Luffa, 4 lanes per call
+     luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
+     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
+
+     cube_4way_init( &ctx.cube, 512, 16, 32 );   // CubeHash, 4 lanes per call
+     cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
+
+     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
+     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
+
+     sph_shavite512_init( &ctx.shavite );   // Shavite, one lane per call
+     sph_shavite512( &ctx.shavite, hash0, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash1, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash2, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash3, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash4, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash5, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash6, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash7, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
+     intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
+
+     simd_4way_init( &ctx.simd, 512 );   // SIMD, 4 lanes per call; length passed as dataLen<<3
+     simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
+
+     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
+     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
+
+     init_echo( &ctx.echo, 512 );   // Echo, one lane per call
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, dataLen<<3 );
+
+     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7, dataLen<<3 );
+
+     hamsi512_8way_init( &ctx.hamsi );   // Hamsi
+     hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
+     hamsi512_8way_close( &ctx.hamsi, vhash );
+
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash, dataLen<<3 );
+
+     sph_fugue512_init( &ctx.fugue );   // Fugue, one lane per call
+     sph_fugue512( &ctx.fugue, hash0, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash4, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash5, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash6, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash7, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+
+     intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7, dataLen<<3 );   // Shabal takes the 8x32 interleave
+
+     shabal512_8way_init( &ctx.shabal );   // Shabal
+     shabal512_8way_update( &ctx.shabal, vhash, dataLen );
+     shabal512_8way_close( &ctx.shabal, vhash );
+
+     dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash, dataLen<<3 );
+
+     sph_whirlpool_init( &ctx.whirlpool );   // Whirlpool, one lane per call
+     sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash0 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash1 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash2 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash3 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash4 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash5 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash6 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash7 );
+
+     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7, dataLen<<3 );
+
+     sha512_8way_init( &ctx.sha512 );   // SHA-512
+     sha512_8way_update( &ctx.sha512, vhash, dataLen );
+     sha512_8way_close( &ctx.sha512, vhash );
+
+     rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );   // Haval reads from vhashA
+
+     haval256_5_8way_init( &ctx.haval );   // Haval; pass 1 result stays in vhashA
+     haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
+     haval256_5_8way_close( &ctx.haval, vhashA );
+
+     rintrlv_8x32_8x64( vhash, vhashA, dataLen<<3 );
+
+     memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 );   // keep vhash[0..31], zero the remaining 768 bytes
+
+     blake512_8way_init( &ctx.blake );   // ---- pass 2: same chain, starting from vhash
+     blake512_8way_update( &ctx.blake, vhash, dataLen );
+     blake512_8way_close(&ctx.blake, vhash);
+
+     bmw512_8way_init( &ctx.bmw );
+     bmw512_8way_update( &ctx.bmw, vhash, dataLen );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash, dataLen<<3 );
+
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
+                               dataLen<<3 );
+     init_groestl( &ctx.groestl, 64 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
+                               dataLen<<3 );
+
+     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7, dataLen<<3 );
+
+     skein512_8way_init( &ctx.skein );
+     skein512_8way_update( &ctx.skein, vhash, dataLen );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     jh512_8way_init( &ctx.jh );
+     jh512_8way_update( &ctx.jh, vhash, dataLen );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     keccak512_8way_init( &ctx.keccak );
+     keccak512_8way_update( &ctx.keccak, vhash, dataLen );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
+
+     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
+     luffa_4way_init( &ctx.luffa, 512 );
+     luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
+
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
+
+     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
+     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
+
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash0, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash1, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash2, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash3, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash4, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash5, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash6, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash7, dataLen );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
+     intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
+
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
+
+     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
+     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
+
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, dataLen<<3 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, dataLen<<3 );
+
+     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7, dataLen<<3 );
+
+     hamsi512_8way_init( &ctx.hamsi );
+     hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
+     hamsi512_8way_close( &ctx.hamsi, vhash );
+
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash, dataLen<<3 );
+
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash0, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash4, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash5, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash6, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash7, dataLen );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+
+     intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7, dataLen<<3 );
+
+     shabal512_8way_init( &ctx.shabal );
+     shabal512_8way_update( &ctx.shabal, vhash, dataLen );
+     shabal512_8way_close( &ctx.shabal, vhash );
+
+     dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash, dataLen<<3 );
+
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash0 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash1 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash2 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash3 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash4 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash5 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash6 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
+     sph_whirlpool_close( &ctx.whirlpool, hash7 );
+
+     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7, dataLen<<3 );
+
+     sha512_8way_init( &ctx.sha512 );
+     sha512_8way_update( &ctx.sha512, vhash, dataLen );
+     sha512_8way_close( &ctx.sha512, vhash );
+
+     rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
+
+     haval256_5_8way_init( &ctx.haval );
+     haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
+     haval256_5_8way_close( &ctx.haval, output );   // final result goes straight to the caller's buffer
+}
+// Scan nonces 8 per pass with xevan_8way_hash; submit lanes that pass fulltest.
+int scanhash_xevan_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<3]);   // 8 words starting at hash[56], one per lane
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;   // NOTE(review): wraps if max_nonce < 8
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const uint32_t Htarg = ptarget[7];
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );   // fill vdata from pdata once, before the loop
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(   // n..n+7 sit in the odd 32-bit elements
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+      xevan_8way_hash( hash, vdata );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if unlikely( ( hash7[ lane ] <= Htarg ) )   // cheap one-word filter before fulltest
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         {
+            pdata[19] = n + lane;   // nonce of the winning lane
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
+
+   *hashes_done = n - first_nonce;   // nonces tried by this call
+   return 0;   // always 0; solutions are reported through submit_lane_solution
+}
+
+#elif defined(XEVAN_4WAY)
+
 union _xevan_4way_context_overlay
 {
 	blake512_4way_context   blake;
--- a/algo/x17/xevan-gate.c
+++ b/algo/x17/xevan-gate.c
@@ -2,8 +2,10 @@

 bool register_xevan_algo( algo_gate_t* gate )
 {
-#if defined (XEVAN_4WAY)
-//  init_xevan_4way_ctx();
+#if defined (XEVAN_8WAY)
+  gate->scanhash  = (void*)&scanhash_xevan_8way;
+  gate->hash      = (void*)&xevan_8way_hash;
+#elif defined (XEVAN_4WAY)
  gate->scanhash  = (void*)&scanhash_xevan_4way;
  gate->hash      = (void*)&xevan_4way_hash;
 #else
--- a/algo/x17/xevan-gate.h
+++ b/algo/x17/xevan-gate.h
@@ -4,13 +4,21 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define XEVAN_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define XEVAN_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define XEVAN_4WAY 1
 #endif

 bool register_xevan_algo( algo_gate_t* gate );

-#if defined(XEVAN_4WAY)
+#if defined(XEVAN_8WAY)
+
+void xevan_8way_hash( void *state, const void *input );
+
+int scanhash_xevan_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+#elif defined(XEVAN_4WAY)

 void xevan_4way_hash( void *state, const void *input );

@@ -19,7 +27,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,

 //void init_xevan_4way_ctx();

-#endif
+#else

 void xevan_hash( void *state, const void *input );

@@ -30,3 +38,4 @@ void init_xevan_ctx();

 #endif

+#endif
--- a/algo/x22/x22i-4way.c
+++ b/algo/x22/x22i-4way.c
@@ -1,7 +1,4 @@
 #include "x22i-gate.h"
-
-#if defined(X22I_4WAY)
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -12,6 +9,7 @@
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/shavite-hash-2way.h"
+#include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -25,6 +23,426 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/swifftx/swifftx.h"

+
+#if defined(X22I_8WAY)
+// Hash contexts for the 8-way x22i chain, overlaid in a union because x22i_8way_hash re-initialises and uses one member at a time.
+union _x22i_8way_ctx_overlay
+{
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;
+    hashState_echo          echo;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;
+    cube_4way_context       cube;
+    sph_shavite512_context  shavite;
+    simd_4way_context       simd;
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;
+    shabal512_8way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_8way_context     sha512;
+    haval256_5_8way_context haval;
+    sph_tiger_context       tiger;
+    sph_gost512_context     gost;
+    sha256_8way_context     sha256;
+};
+typedef union _x22i_8way_ctx_overlay x22i_8way_ctx_overlay;
+// Runs the x22i hash chain on 8 lanes per call: input is fed to blake512_8way_update (80 bytes), output receives the sha256_8way_close result.
+void x22i_8way_hash( void *output, const void *input )
+{
+   uint64_t vhash[8*8] __attribute__ ((aligned (128)));   // 512-byte buffer used by the 8-way stages
+   uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
+   uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
+   uint64_t hash0[8*4] __attribute__ ((aligned (64)));    // hash0..hash7: 256 bytes per lane, used below as four 64-byte slots
+   uint64_t hash1[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash2[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash3[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash4[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash5[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash6[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash7[8*4] __attribute__ ((aligned (64)));
+
+   // hashA0..hashA7: zero-initialised 64-byte per-lane buffers, used below for the SWIFFTX and Tiger outputs.
+   unsigned char hashA0[64]    __attribute__((aligned(64))) = {0};
+   unsigned char hashA1[64]    __attribute__((aligned(32))) = {0};
+   unsigned char hashA2[64]    __attribute__((aligned(32))) = {0};
+   unsigned char hashA3[64]    __attribute__((aligned(32))) = {0};
+   unsigned char hashA4[64]    __attribute__((aligned(64))) = {0};
+   unsigned char hashA5[64]    __attribute__((aligned(32))) = {0};
+   unsigned char hashA6[64]    __attribute__((aligned(32))) = {0};
+   unsigned char hashA7[64]    __attribute__((aligned(32))) = {0};
+   x22i_8way_ctx_overlay ctx;
+
+   blake512_8way_init( &ctx.blake );   // stage 1: Blake-512 over the 80-byte input
+   blake512_8way_update( &ctx.blake, input, 80 );
+   blake512_8way_close( &ctx.blake, vhash );
+
+   bmw512_8way_init( &ctx.bmw );
+   bmw512_8way_update( &ctx.bmw, vhash, 64 );
+   bmw512_8way_close( &ctx.bmw, vhash );
+
+   dintrlv_8x64_512( hash0, hash1, hash2, hash3,          // split into per-lane buffers for the single-lane Groestl calls
+                     hash4, hash5, hash6, hash7, vhash );
+
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                  (const char*)hash0, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                  (const char*)hash1, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                  (const char*)hash2, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                  (const char*)hash3, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash4,
+                                  (const char*)hash4, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash5,
+                                  (const char*)hash5, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash6,
+                                  (const char*)hash6, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash7,
+                                  (const char*)hash7, 512 );
+
+   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
+                           hash4, hash5, hash6, hash7 );
+   // Skein, JH and Keccak run 8-way, each reading and rewriting vhash in place.
+   skein512_8way_init( &ctx.skein );
+   skein512_8way_update( &ctx.skein, vhash, 64 );
+   skein512_8way_close( &ctx.skein, vhash );
+
+   jh512_8way_init( &ctx.jh );
+   jh512_8way_update( &ctx.jh, vhash, 64 );
+   jh512_8way_close( &ctx.jh, vhash );
+
+   keccak512_8way_init( &ctx.keccak );
+   keccak512_8way_update( &ctx.keccak, vhash, 64 );
+   keccak512_8way_close( &ctx.keccak, vhash );
+
+   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );   // regroup into two buffers for the 4-way Luffa and CubeHash calls
+
+   luffa_4way_init( &ctx.luffa, 512 );
+   luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
+   luffa_4way_init( &ctx.luffa, 512 );
+   luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
+
+   cube_4way_init( &ctx.cube, 512, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
+   cube_4way_init( &ctx.cube, 512, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
+
+   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );   // per-lane buffers for the single-lane Shavite calls
+   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
+
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash0, 64 );
+   sph_shavite512_close( &ctx.shavite, hash0 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash1, 64 );
+   sph_shavite512_close( &ctx.shavite, hash1 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash2, 64 );
+   sph_shavite512_close( &ctx.shavite, hash2 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash3, 64 );
+   sph_shavite512_close( &ctx.shavite, hash3 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash4, 64 );
+   sph_shavite512_close( &ctx.shavite, hash4 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash5, 64 );
+   sph_shavite512_close( &ctx.shavite, hash5 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash6, 64 );
+   sph_shavite512_close( &ctx.shavite, hash6 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash7, 64 );
+   sph_shavite512_close( &ctx.shavite, hash7 );
+
+   intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );   // back to two 4-way buffers for SIMD
+   intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
+
+   simd_4way_init( &ctx.simd, 512 );
+   simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
+   simd_4way_init( &ctx.simd, 512 );
+   simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
+
+   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );   // per-lane buffers for the single-lane Echo calls
+   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
+
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash0,
+                            (const BitSequence*)hash0, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash1,
+                            (const BitSequence*)hash1, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash2,
+                            (const BitSequence*)hash2, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash3,
+                            (const BitSequence*)hash3, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash4,
+                            (const BitSequence*)hash4, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash5,
+                            (const BitSequence*)hash5, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash6,
+                            (const BitSequence*)hash6, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash7,
+                            (const BitSequence*)hash7, 512 );
+
+   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
+                           hash4, hash5, hash6, hash7 );
+
+   hamsi512_8way_init( &ctx.hamsi );
+   hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
+   hamsi512_8way_close( &ctx.hamsi, vhash );
+
+   dintrlv_8x64_512( hash0, hash1, hash2, hash3,
+                     hash4, hash5, hash6, hash7, vhash );
+   // Fugue runs one lane at a time; its result stays in the first 64-byte slot of each hashN.
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash0, 64 );
+   sph_fugue512_close( &ctx.fugue, hash0 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash1, 64 );
+   sph_fugue512_close( &ctx.fugue, hash1 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash2, 64 );
+   sph_fugue512_close( &ctx.fugue, hash2 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash3, 64 );
+   sph_fugue512_close( &ctx.fugue, hash3 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash4, 64 );
+   sph_fugue512_close( &ctx.fugue, hash4 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash5, 64 );
+   sph_fugue512_close( &ctx.fugue, hash5 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash6, 64 );
+   sph_fugue512_close( &ctx.fugue, hash6 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash7, 64 );
+   sph_fugue512_close( &ctx.fugue, hash7 );
+
+   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,    // Shabal is fed through the 8x32 interleave helpers
+                           hash4, hash5, hash6, hash7 );
+
+   shabal512_8way_init( &ctx.shabal );
+   shabal512_8way_update( &ctx.shabal, vhash, 64 );
+   shabal512_8way_close( &ctx.shabal, vhash );
+
+   dintrlv_8x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8],      // Shabal result -> second 64-byte slot (byte offset 64)
+                     &hash4[8], &hash5[8], &hash6[8], &hash7[8], vhash );
+
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash0[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash0[16] );   // Whirlpool result -> third 64-byte slot
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash1[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash1[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash2[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash2[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash3[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash3[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash4[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash4[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash5[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash5[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash6[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash6[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash7[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash7[16] );
+
+   intrlv_8x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16],
+                           &hash4[16], &hash5[16], &hash6[16], &hash7[16] );
+
+   sha512_8way_init( &ctx.sha512 );
+   sha512_8way_update( &ctx.sha512, vhash, 64 );
+   sha512_8way_close( &ctx.sha512, vhash );
+
+   dintrlv_8x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24],   // SHA-512 result -> fourth 64-byte slot
+                     &hash4[24], &hash5[24], &hash6[24], &hash7[24], vhash );
+   // Each hashN now holds the Fugue, Shabal, Whirlpool and SHA-512 results back to back; SWIFFTX presumably consumes all 256 bytes - TODO confirm.
+   ComputeSingleSWIFFTX((unsigned char*)hash0, (unsigned char*)hashA0);
+   ComputeSingleSWIFFTX((unsigned char*)hash1, (unsigned char*)hashA1);
+   ComputeSingleSWIFFTX((unsigned char*)hash2, (unsigned char*)hashA2);
+   ComputeSingleSWIFFTX((unsigned char*)hash3, (unsigned char*)hashA3);
+   ComputeSingleSWIFFTX((unsigned char*)hash4, (unsigned char*)hashA4);
+   ComputeSingleSWIFFTX((unsigned char*)hash5, (unsigned char*)hashA5);
+   ComputeSingleSWIFFTX((unsigned char*)hash6, (unsigned char*)hashA6);
+   ComputeSingleSWIFFTX((unsigned char*)hash7, (unsigned char*)hashA7);
+
+   intrlv_8x32_512( vhashA, hashA0, hashA1, hashA2, hashA3,
+                            hashA4, hashA5, hashA6, hashA7 );
+
+   memset( vhash, 0, 64*8 );   // cleared first: 512 bits per lane are de-interleaved from vhash after the Haval close below
+
+   haval256_5_8way_init( &ctx.haval );
+   haval256_5_8way_update( &ctx.haval, vhashA, 64 );
+   haval256_5_8way_close( &ctx.haval, vhash );
+
+   dintrlv_8x32_512( hash0, hash1, hash2, hash3,
+                     hash4, hash5, hash6, hash7, vhash );
+
+   memset( hashA0, 0, 64 );   // cleared before Tiger writes them; 256 bits of each are fed to Lyra2 below
+   memset( hashA1, 0, 64 );
+   memset( hashA2, 0, 64 );
+   memset( hashA3, 0, 64 );
+   memset( hashA4, 0, 64 );
+   memset( hashA5, 0, 64 );
+   memset( hashA6, 0, 64 );
+   memset( hashA7, 0, 64 );
+
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash0, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA0);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash1, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA1);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash2, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA2);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash3, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA3);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash4, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA4);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash5, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA5);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash6, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA6);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash7, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA7);
+
+   memset( hash0, 0, 64 );   // cleared because GOST below hashes 64 bytes of each while dintrlv_2x256 is asked for only 256 bits
+   memset( hash1, 0, 64 );
+   memset( hash2, 0, 64 );
+   memset( hash3, 0, 64 );
+   memset( hash4, 0, 64 );
+   memset( hash5, 0, 64 );
+   memset( hash6, 0, 64 );
+   memset( hash7, 0, 64 );
+
+   intrlv_2x256( vhash, hashA0, hashA1, 256 );   // Lyra2 is run on pairs of lanes, four pairs in total
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hashA2, hashA3, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hashA4, hashA5, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hashA6, hashA7, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash0 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash1 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash2, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash2 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash3, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash3 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash4, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash4 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash5, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash5 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash6, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash6 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash7, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash7 );
+
+   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,
+                           hash4, hash5, hash6, hash7 );
+
+   sha256_8way_init( &ctx.sha256 );   // final stage: the 8-way SHA-256 result is written straight to output, still interleaved
+   sha256_8way_update( &ctx.sha256, vhash, 64 );
+   sha256_8way_close( &ctx.sha256, output );
+}
+// Scans nonces 8 per iteration starting at work->data[19]; submits each lane that passes fulltest(), sets *hashes_done, always returns 0.
+int scanhash_x22i_8way( struct work* work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<3]);   // hash7[lane] is the word compared against Htarg for that lane
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 8;
+   const int thr_id = mythr->id;
+   const uint32_t Htarg = ptarget[7];
+
+   if (opt_benchmark)   // NOTE(review): Htarg was already read above, so this override does not change the quick check below
+      ((uint32_t*)ptarget)[7] = 0x08ff;
+
+   InitializeSWIFFTX();
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(   // place nonces n..n+7 in the nonce vector, one per lane
+               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                 n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+      x22i_8way_hash( hash, vdata );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if unlikely( ( hash7[ lane ] <= Htarg ) )   // cheap one-word pre-filter before the full comparison
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )   // nothing is submitted in benchmark mode
+         {
+            pdata[19] = n + lane;
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+
+#elif defined(X22I_4WAY)
+
+
 union _x22i_4way_ctx_overlay
 {
    blake512_4way_context   blake;
--- a/algo/x22/x22i-gate.c
+++ b/algo/x22/x22i-gate.c
@@ -2,27 +2,39 @@

 bool register_x22i_algo( algo_gate_t* gate )
 {
-#if defined (X22I_4WAY)
+#if defined (X22I_8WAY)   // set in x22i-gate.h when the AVX512 F/VL/DQ/BW macros are all defined
+  gate->scanhash  = (void*)&scanhash_x22i_8way;
+  gate->hash      = (void*)&x22i_8way_hash;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;   // SHA_OPT is not advertised on the 8-way path
+#elif defined (X22I_4WAY)
  gate->scanhash  = (void*)&scanhash_x22i_4way;
  gate->hash      = (void*)&x22i_4way_hash;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #else
  gate->scanhash  = (void*)&scanhash_x22i;
  gate->hash      = (void*)&x22i_hash;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
  return true;
 };

 bool register_x25x_algo( algo_gate_t* gate )
 {
-#if defined (X22I_4WAY)
+#if defined (X25X_8WAY)   // X25X_8WAY is currently never defined: its #define in x22i-gate.h is commented out
+  gate->scanhash  = (void*)&scanhash_x25x_8way;
+  gate->hash      = (void*)&x25x_8way_hash;
+//  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+#elif defined (X25X_4WAY)
  gate->scanhash  = (void*)&scanhash_x25x_4way;
  gate->hash      = (void*)&x25x_4way_hash;
+//  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #else
  gate->scanhash  = (void*)&scanhash_x25x;
  gate->hash      = (void*)&x25x_hash;
+//  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
+  // The assignment above applies to every variant; the per-variant flag lines are left commented out, so AVX512_OPT is not reported.
  return true;
 };

--- a/algo/x22/x22i-gate.h
+++ b/algo/x22/x22i-gate.h
@@ -6,30 +6,64 @@
 #include <stdint.h>
 #include <unistd.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X22I_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X22I_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X22I_4WAY 1
 #endif

-bool register_x22i__algo( algo_gate_t* gate );
+bool register_x22i_algo( algo_gate_t* gate );

-#if defined(X22I_4WAY)
+#if defined(X22I_8WAY)
+
+void x22i_8way_hash( void *state, const void *input );
+int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(X22I_4WAY)

 void x22i_4way_hash( void *state, const void *input );
 int scanhash_x22i_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

-void x25x_4way_hash( void *state, const void *input );
-int scanhash_x25x_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
+#else

 void x22i_hash( void *state, const void *input );
 int scanhash_x22i( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

+#endif
+
+
+// x25x 8-way is disabled for now: it blows up as soon as the buffer sizes
+// are increased, with no other change. It may be related to how the 2-dimensional arrays are accessed.
+
+//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+//  #define X25X_8WAY 1
+#if defined(__AVX2__) && defined(__AES__)
+  #define X25X_4WAY 1
+#endif
+
+bool register_x25x_algo( algo_gate_t* gate );   // registers the x25x gate; defined in x22i-gate.c
+
+#if defined(X25X_8WAY)
+
+void x25x_8way_hash( void *state, const void *input );
+int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(X25X_4WAY)
+
+void x25x_4way_hash( void *state, const void *input );
+int scanhash_x25x_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#else
+
 void x25x_hash( void *state, const void *input );
 int scanhash_x25x( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

+#endif
+
 #endif  // X22I_GATE_H__
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -1,6 +1,6 @@
 #include "x22i-gate.h"

-#if defined(X22I_4WAY)
+#if defined(X25X_4WAY)

 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
@@ -88,276 +88,282 @@ void x25x_4way_hash( void *output, const void *input )
   unsigned char hash2[25][64] __attribute__((aligned(64))) = {0};
   unsigned char hash3[25][64] __attribute__((aligned(64))) = {0};
   uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-   unsigned char vhashA[24][64*4] __attribute__ ((aligned (64)));
+// Doubling the size of vhashX breaks everything. It may have something
+// to do with how the array is referenced: vhashX vs vhashX[0] vs &vhashX[0].
+// Changing the notation did seem to allow the larger buffer, but still
+// resulted in problems further along.
+//   unsigned char vhashX[24][64*8] __attribute__ ((aligned (64)));
+   unsigned char vhashX[24][64*4] __attribute__ ((aligned (64)));
   x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));

   blake512_4way_init( &ctx.blake );
   blake512_4way( &ctx.blake, input, 80 );
   blake512_4way_close( &ctx.blake, vhash );
-   dintrlv_4x64_512( &hash0[0], &hash1[0], &hash2[0], &hash3[0], vhash );
+   dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );

   bmw512_4way_init( &ctx.bmw );
   bmw512_4way( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );
-   dintrlv_4x64_512( &hash0[1], &hash1[1], &hash2[1], &hash3[1], vhash );
+   dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash );

   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)&hash0[2],
-                                  (const char*)&hash0[1], 512 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash0[2],
+                                  (const char*)hash0[1], 512 );
   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)&hash1[2],
-                                  (const char*)&hash1[1], 512 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash1[2],
+                                  (const char*)hash1[1], 512 );
   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)&hash2[2],
-                                  (const char*)&hash2[1], 512 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash2[2],
+                                  (const char*)hash2[1], 512 );
   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)&hash3[2],
-                                  (const char*)&hash3[1], 512 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash3[2],
+                                  (const char*)hash3[1], 512 );
   
-   intrlv_4x64_512( vhash, &hash0[2], &hash1[2], &hash2[2], &hash3[2] );
+   intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] );

   skein512_4way_init( &ctx.skein );
   skein512_4way( &ctx.skein, vhash, 64 );
   skein512_4way_close( &ctx.skein, vhash );
-   dintrlv_4x64_512( &hash0[3], &hash1[3], &hash2[3], &hash3[3], vhash );
+   dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash );

   jh512_4way_init( &ctx.jh );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );
-   dintrlv_4x64_512( &hash0[4], &hash1[4], &hash2[4], &hash3[4], vhash );
+   dintrlv_4x64_512( hash0[4], hash1[4], hash2[4], hash3[4], vhash );

   keccak512_4way_init( &ctx.keccak );
   keccak512_4way( &ctx.keccak, vhash, 64 );
   keccak512_4way_close( &ctx.keccak, vhash );
-   dintrlv_4x64_512( &hash0[5], &hash1[5], &hash2[5], &hash3[5], vhash );
+   dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash );
   
   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash0[6],
-                                (const BitSequence*)&hash0[5], 64 );
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0[6],
+                                (const BitSequence*)hash0[5], 64 );
   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash1[6],
-                                (const BitSequence*)&hash1[5], 64 );
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1[6],
+                                (const BitSequence*)hash1[5], 64 );
   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash2[6],
-                                (const BitSequence*)&hash2[5], 64 );
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2[6],
+                                (const BitSequence*)hash2[5], 64 );
   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash3[6],
-                                (const BitSequence*)&hash3[5], 64 );
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3[6],
+                                (const BitSequence*)hash3[5], 64 );

   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) &hash0[7],
-                              (const byte*)&hash0[6], 64 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash0[7],
+                              (const byte*)hash0[6], 64 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) &hash1[7],
-                              (const byte*)&hash1[6], 64 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash1[7],
+                              (const byte*)hash1[6], 64 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) &hash2[7],
-                              (const byte*)&hash2[6], 64 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash2[7],
+                              (const byte*)hash2[6], 64 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) &hash3[7],
-                              (const byte*)&hash3[6], 64 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash3[7],
+                              (const byte*)hash3[6], 64 );

 	sph_shavite512_init(&ctx.shavite);
-	sph_shavite512(&ctx.shavite, (const void*) &hash0[7], 64);
-	sph_shavite512_close(&ctx.shavite, &hash0[8]);
+	sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64);
+	sph_shavite512_close(&ctx.shavite, hash0[8]);
   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) &hash1[7], 64);
-   sph_shavite512_close(&ctx.shavite, &hash1[8]);
+   sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash1[8]);
   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) &hash2[7], 64);
-   sph_shavite512_close(&ctx.shavite, &hash2[8]);
+   sph_shavite512(&ctx.shavite, (const void*) hash2[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash2[8]);
   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) &hash3[7], 64);
-   sph_shavite512_close(&ctx.shavite, &hash3[8]);
+   sph_shavite512(&ctx.shavite, (const void*) hash3[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash3[8]);

   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)&hash0[9],
-                         (const BitSequence*)&hash0[8], 512 );
+   update_final_sd( &ctx.simd, (BitSequence*)hash0[9],
+                         (const BitSequence*)hash0[8], 512 );
   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)&hash1[9],
-                         (const BitSequence*)&hash1[8], 512 );
+   update_final_sd( &ctx.simd, (BitSequence*)hash1[9],
+                         (const BitSequence*)hash1[8], 512 );
   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)&hash2[9],
-                         (const BitSequence*)&hash2[8], 512 );
+   update_final_sd( &ctx.simd, (BitSequence*)hash2[9],
+                         (const BitSequence*)hash2[8], 512 );
   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)&hash3[9],
-                         (const BitSequence*)&hash3[8], 512 );
+   update_final_sd( &ctx.simd, (BitSequence*)hash3[9],
+                         (const BitSequence*)hash3[8], 512 );

   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)&hash0[10],
-                            (const BitSequence*)&hash0[9], 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash0[10],
+                            (const BitSequence*)hash0[9], 512 );
   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)&hash1[10],
-                            (const BitSequence*)&hash1[9], 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash1[10],
+                            (const BitSequence*)hash1[9], 512 );
   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)&hash2[10],
-                            (const BitSequence*)&hash2[9], 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash2[10],
+                            (const BitSequence*)hash2[9], 512 );
   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)&hash3[10],
-                            (const BitSequence*)&hash3[9], 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash3[10],
+                            (const BitSequence*)hash3[9], 512 );

-   intrlv_4x64_512( vhash, &hash0[10], &hash1[10], &hash2[10], &hash3[10] );
+   intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] );

   hamsi512_4way_init( &ctx.hamsi );
   hamsi512_4way( &ctx.hamsi, vhash, 64 );
   hamsi512_4way_close( &ctx.hamsi, vhash );
-   dintrlv_4x64_512( &hash0[11], &hash1[11], &hash2[11], &hash3[11], vhash );
+   dintrlv_4x64_512( hash0[11], hash1[11], hash2[11], hash3[11], vhash );

 	sph_fugue512_init(&ctx.fugue);
-	sph_fugue512(&ctx.fugue, (const void*) &hash0[11], 64);
-	sph_fugue512_close(&ctx.fugue, &hash0[12]);
+	sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64);
+	sph_fugue512_close(&ctx.fugue, hash0[12]);
   sph_fugue512_init(&ctx.fugue);
-   sph_fugue512(&ctx.fugue, (const void*) &hash1[11], 64);
-   sph_fugue512_close(&ctx.fugue, &hash1[12]);
+   sph_fugue512(&ctx.fugue, (const void*) hash1[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash1[12]);
   sph_fugue512_init(&ctx.fugue);
-   sph_fugue512(&ctx.fugue, (const void*) &hash2[11], 64);
-   sph_fugue512_close(&ctx.fugue, &hash2[12]);
+   sph_fugue512(&ctx.fugue, (const void*) hash2[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash2[12]);
   sph_fugue512_init(&ctx.fugue);
-   sph_fugue512(&ctx.fugue, (const void*) &hash3[11], 64);
-   sph_fugue512_close(&ctx.fugue, &hash3[12]);
+   sph_fugue512(&ctx.fugue, (const void*) hash3[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash3[12]);

-   intrlv_4x32_512( vhash, &hash0[12], &hash1[12], &hash2[12], &hash3[12] );
+   intrlv_4x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12] );

   shabal512_4way_init( &ctx.shabal );
   shabal512_4way( &ctx.shabal, vhash, 64 );
   shabal512_4way_close( &ctx.shabal, vhash );
-   dintrlv_4x32_512( &hash0[13], &hash1[13], &hash2[13], &hash3[13], vhash );
+   dintrlv_4x32_512( hash0[13], hash1[13], hash2[13], hash3[13], vhash );

 	sph_whirlpool_init(&ctx.whirlpool);
-	sph_whirlpool (&ctx.whirlpool, (const void*) &hash0[13], 64);
-	sph_whirlpool_close(&ctx.whirlpool, &hash0[14]);
+	sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64);
+	sph_whirlpool_close(&ctx.whirlpool, hash0[14]);
   sph_whirlpool_init(&ctx.whirlpool);
-   sph_whirlpool (&ctx.whirlpool, (const void*) &hash1[13], 64);
-   sph_whirlpool_close(&ctx.whirlpool, &hash1[14]);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash1[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash1[14]);
   sph_whirlpool_init(&ctx.whirlpool);
-   sph_whirlpool (&ctx.whirlpool, (const void*) &hash2[13], 64);
-   sph_whirlpool_close(&ctx.whirlpool, &hash2[14]);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash2[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash2[14]);
   sph_whirlpool_init(&ctx.whirlpool);
-   sph_whirlpool (&ctx.whirlpool, (const void*) &hash3[13], 64);
-   sph_whirlpool_close(&ctx.whirlpool, &hash3[14]);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash3[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash3[14]);

-   intrlv_4x64_512( vhash, &hash0[14], &hash1[14], &hash2[14], &hash3[14] );
+   intrlv_4x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14] );

   sha512_4way_init( &ctx.sha512 );
   sha512_4way( &ctx.sha512, vhash, 64 );
   sha512_4way_close( &ctx.sha512, vhash );
-   dintrlv_4x64_512( &hash0[15], &hash1[15], &hash2[15], &hash3[15], vhash );
+   dintrlv_4x64_512( hash0[15], hash1[15], hash2[15], hash3[15], vhash );


-   ComputeSingleSWIFFTX((unsigned char*)&hash0[12], (unsigned char*)&hash0[16]);
-   ComputeSingleSWIFFTX((unsigned char*)&hash1[12], (unsigned char*)&hash1[16]);
-   ComputeSingleSWIFFTX((unsigned char*)&hash2[12], (unsigned char*)&hash2[16]);
-   ComputeSingleSWIFFTX((unsigned char*)&hash3[12], (unsigned char*)&hash3[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash1[12], (unsigned char*)hash1[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash2[12], (unsigned char*)hash2[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash3[12], (unsigned char*)hash3[16]);

-   intrlv_4x32_512( &vhashA, &hash0[16], &hash1[16], &hash2[16], &hash3[16] );
+   intrlv_4x32_512( vhashX[0], hash0[16], hash1[16], hash2[16], hash3[16] );

   memset( vhash, 0, 64*4 );
   
   haval256_5_4way_init( &ctx.haval );
-   haval256_5_4way( &ctx.haval, vhashA, 64 );
+   haval256_5_4way( &ctx.haval, vhashX[0], 64 );
   haval256_5_4way_close( &ctx.haval, vhash );
-   dintrlv_4x32_512( &hash0[17], &hash1[17], &hash2[17], &hash3[17], vhash );
+   dintrlv_4x32_512( hash0[17], hash1[17], hash2[17], hash3[17], vhash );

 	sph_tiger_init(&ctx.tiger);
-	sph_tiger (&ctx.tiger, (const void*) &hash0[17], 64);
-	sph_tiger_close(&ctx.tiger, (void*) &hash0[18]);
+	sph_tiger (&ctx.tiger, (const void*) hash0[17], 64);
+	sph_tiger_close(&ctx.tiger, (void*) hash0[18]);
   sph_tiger_init(&ctx.tiger);
-   sph_tiger (&ctx.tiger, (const void*) &hash1[17], 64);
-   sph_tiger_close(&ctx.tiger, (void*) &hash1[18]);
+   sph_tiger (&ctx.tiger, (const void*) hash1[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash1[18]);
   sph_tiger_init(&ctx.tiger);
-   sph_tiger (&ctx.tiger, (const void*) &hash2[17], 64);
-   sph_tiger_close(&ctx.tiger, (void*) &hash2[18]);
+   sph_tiger (&ctx.tiger, (const void*) hash2[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash2[18]);
   sph_tiger_init(&ctx.tiger);
-   sph_tiger (&ctx.tiger, (const void*) &hash3[17], 64);
-   sph_tiger_close(&ctx.tiger, (void*) &hash3[18]);
+   sph_tiger (&ctx.tiger, (const void*) hash3[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash3[18]);

-	LYRA2RE( (void*)&hash0[19], 32, (const void*)&hash0[18], 32,
-            (const void*)&hash0[18], 32, 1, 4, 4 );
-   LYRA2RE( (void*)&hash1[19], 32, (const void*)&hash1[18], 32,
-            (const void*)&hash1[18], 32, 1, 4, 4 );
-   LYRA2RE( (void*)&hash2[19], 32, (const void*)&hash2[18], 32,
-            (const void*)&hash2[18], 32, 1, 4, 4 );
-   LYRA2RE( (void*)&hash3[19], 32, (const void*)&hash3[18], 32,
-            (const void*)&hash3[18], 32, 1, 4, 4 );
+	LYRA2RE( (void*)hash0[19], 32, (const void*)hash0[18], 32,
+            (const void*)hash0[18], 32, 1, 4, 4 );
+   LYRA2RE( (void*)hash1[19], 32, (const void*)hash1[18], 32,
+            (const void*)hash1[18], 32, 1, 4, 4 );
+   LYRA2RE( (void*)hash2[19], 32, (const void*)hash2[18], 32,
+            (const void*)hash2[18], 32, 1, 4, 4 );
+   LYRA2RE( (void*)hash3[19], 32, (const void*)hash3[18], 32,
+            (const void*)hash3[18], 32, 1, 4, 4 );

 	sph_gost512_init(&ctx.gost);
-	sph_gost512 (&ctx.gost, (const void*) &hash0[19], 64);
-	sph_gost512_close(&ctx.gost, (void*) &hash0[20]);
+	sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
+	sph_gost512_close(&ctx.gost, (void*) hash0[20]);
   sph_gost512_init(&ctx.gost);
-   sph_gost512 (&ctx.gost, (const void*) &hash1[19], 64);
-   sph_gost512_close(&ctx.gost, (void*) &hash1[20]);
+   sph_gost512 (&ctx.gost, (const void*) hash1[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash1[20]);
   sph_gost512_init(&ctx.gost);
-   sph_gost512 (&ctx.gost, (const void*) &hash2[19], 64);
-   sph_gost512_close(&ctx.gost, (void*) &hash2[20]);
+   sph_gost512 (&ctx.gost, (const void*) hash2[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash2[20]);
   sph_gost512_init(&ctx.gost);
-   sph_gost512 (&ctx.gost, (const void*) &hash3[19], 64);
-   sph_gost512_close(&ctx.gost, (void*) &hash3[20]);
+   sph_gost512 (&ctx.gost, (const void*) hash3[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash3[20]);

-   intrlv_4x32_512( vhashA, &hash0[20], &hash1[20], &hash2[20], &hash3[20] );
+   intrlv_4x32_512( vhashX[0], hash0[20], hash1[20], hash2[20], hash3[20] );
   memset( vhash, 0, 64*4 );

   sha256_4way_init( &ctx.sha256 );
-   sha256_4way( &ctx.sha256, vhashA, 64 );
+   sha256_4way( &ctx.sha256, vhashX[0], 64 );
   sha256_4way_close( &ctx.sha256, vhash );
-   dintrlv_4x32_512( &hash0[21], &hash1[21], &hash2[21], &hash3[21], vhash );
+   dintrlv_4x32_512( hash0[21], hash1[21], hash2[21], hash3[21], vhash );

   sph_panama_init(&ctx.panama);
-   sph_panama (&ctx.panama, (const void*) &hash0[21], 64 );
-   sph_panama_close(&ctx.panama, (void*) &hash0[22]);
+   sph_panama (&ctx.panama, (const void*) hash0[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash0[22]);
   sph_panama_init(&ctx.panama);
-   sph_panama (&ctx.panama, (const void*) &hash1[21], 64 );
-   sph_panama_close(&ctx.panama, (void*) &hash1[22]);
+   sph_panama (&ctx.panama, (const void*) hash1[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash1[22]);
   sph_panama_init(&ctx.panama);
-   sph_panama (&ctx.panama, (const void*) &hash2[21], 64 );
-   sph_panama_close(&ctx.panama, (void*) &hash2[22]);
+   sph_panama (&ctx.panama, (const void*) hash2[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash2[22]);
   sph_panama_init(&ctx.panama);
-   sph_panama (&ctx.panama, (const void*) &hash3[21], 64 );
-   sph_panama_close(&ctx.panama, (void*) &hash3[22]);
+   sph_panama (&ctx.panama, (const void*) hash3[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash3[22]);

-   laneHash(512, (const BitSequence*)&hash0[22], 512, (BitSequence*)&hash0[23]);
-   laneHash(512, (const BitSequence*)&hash1[22], 512, (BitSequence*)&hash1[23]);
-   laneHash(512, (const BitSequence*)&hash2[22], 512, (BitSequence*)&hash2[23]);
-   laneHash(512, (const BitSequence*)&hash3[22], 512, (BitSequence*)&hash3[23]);
+   laneHash(512, (const BitSequence*)hash0[22], 512, (BitSequence*)hash0[23]);
+   laneHash(512, (const BitSequence*)hash1[22], 512, (BitSequence*)hash1[23]);
+   laneHash(512, (const BitSequence*)hash2[22], 512, (BitSequence*)hash2[23]);
+   laneHash(512, (const BitSequence*)hash3[22], 512, (BitSequence*)hash3[23]);

   x25x_shuffle( hash0 );
   x25x_shuffle( hash1 );
   x25x_shuffle( hash2 );
   x25x_shuffle( hash3 );

-   intrlv_4x32_512( &vhashA[ 0], &hash0[ 0], &hash1[ 0], &hash2[ 0], &hash3[ 0] );
-   intrlv_4x32_512( &vhashA[ 1], &hash0[ 1], &hash1[ 1], &hash2[ 1], &hash3[ 1] );
-   intrlv_4x32_512( &vhashA[ 2], &hash0[ 2], &hash1[ 2], &hash2[ 2], &hash3[ 2] );
-   intrlv_4x32_512( &vhashA[ 3], &hash0[ 3], &hash1[ 3], &hash2[ 3], &hash3[ 3] );
-   intrlv_4x32_512( &vhashA[ 4], &hash0[ 4], &hash1[ 4], &hash2[ 4], &hash3[ 4] );
-   intrlv_4x32_512( &vhashA[ 5], &hash0[ 5], &hash1[ 5], &hash2[ 5], &hash3[ 5] );
-   intrlv_4x32_512( &vhashA[ 6], &hash0[ 6], &hash1[ 6], &hash2[ 6], &hash3[ 6] );
-   intrlv_4x32_512( &vhashA[ 7], &hash0[ 7], &hash1[ 7], &hash2[ 7], &hash3[ 7] );
-   intrlv_4x32_512( &vhashA[ 8], &hash0[ 8], &hash1[ 8], &hash2[ 8], &hash3[ 8] );
-   intrlv_4x32_512( &vhashA[ 9], &hash0[ 9], &hash1[ 9], &hash2[ 9], &hash3[ 9] );
-   intrlv_4x32_512( &vhashA[10], &hash0[10], &hash1[10], &hash2[10], &hash3[10] );
-   intrlv_4x32_512( &vhashA[11], &hash0[11], &hash1[11], &hash2[11], &hash3[11] );
-   intrlv_4x32_512( &vhashA[12], &hash0[12], &hash1[12], &hash2[12], &hash3[12] );
-   intrlv_4x32_512( &vhashA[13], &hash0[13], &hash1[13], &hash2[13], &hash3[13] );
-   intrlv_4x32_512( &vhashA[14], &hash0[14], &hash1[14], &hash2[14], &hash3[14] );
-   intrlv_4x32_512( &vhashA[15], &hash0[15], &hash1[15], &hash2[15], &hash3[15] );
-   intrlv_4x32_512( &vhashA[16], &hash0[16], &hash1[16], &hash2[16], &hash3[16] );
-   intrlv_4x32_512( &vhashA[17], &hash0[17], &hash1[17], &hash2[17], &hash3[17] );
-   intrlv_4x32_512( &vhashA[18], &hash0[18], &hash1[18], &hash2[18], &hash3[18] );
-   intrlv_4x32_512( &vhashA[19], &hash0[19], &hash1[19], &hash2[19], &hash3[19] );
-   intrlv_4x32_512( &vhashA[20], &hash0[20], &hash1[20], &hash2[20], &hash3[20] );
-   intrlv_4x32_512( &vhashA[21], &hash0[21], &hash1[21], &hash2[21], &hash3[21] );
-   intrlv_4x32_512( &vhashA[22], &hash0[22], &hash1[22], &hash2[22], &hash3[22] );
-   intrlv_4x32_512( &vhashA[23], &hash0[23], &hash1[23], &hash2[23], &hash3[23] );
+   intrlv_4x32_512( vhashX[ 0], hash0[ 0], hash1[ 0], hash2[ 0], hash3[ 0] );
+   intrlv_4x32_512( vhashX[ 1], hash0[ 1], hash1[ 1], hash2[ 1], hash3[ 1] );
+   intrlv_4x32_512( vhashX[ 2], hash0[ 2], hash1[ 2], hash2[ 2], hash3[ 2] );
+   intrlv_4x32_512( vhashX[ 3], hash0[ 3], hash1[ 3], hash2[ 3], hash3[ 3] );
+   intrlv_4x32_512( vhashX[ 4], hash0[ 4], hash1[ 4], hash2[ 4], hash3[ 4] );
+   intrlv_4x32_512( vhashX[ 5], hash0[ 5], hash1[ 5], hash2[ 5], hash3[ 5] );
+   intrlv_4x32_512( vhashX[ 6], hash0[ 6], hash1[ 6], hash2[ 6], hash3[ 6] );
+   intrlv_4x32_512( vhashX[ 7], hash0[ 7], hash1[ 7], hash2[ 7], hash3[ 7] );
+   intrlv_4x32_512( vhashX[ 8], hash0[ 8], hash1[ 8], hash2[ 8], hash3[ 8] );
+   intrlv_4x32_512( vhashX[ 9], hash0[ 9], hash1[ 9], hash2[ 9], hash3[ 9] );
+   intrlv_4x32_512( vhashX[10], hash0[10], hash1[10], hash2[10], hash3[10] );
+   intrlv_4x32_512( vhashX[11], hash0[11], hash1[11], hash2[11], hash3[11] );
+   intrlv_4x32_512( vhashX[12], hash0[12], hash1[12], hash2[12], hash3[12] );
+   intrlv_4x32_512( vhashX[13], hash0[13], hash1[13], hash2[13], hash3[13] );
+   intrlv_4x32_512( vhashX[14], hash0[14], hash1[14], hash2[14], hash3[14] );
+   intrlv_4x32_512( vhashX[15], hash0[15], hash1[15], hash2[15], hash3[15] );
+   intrlv_4x32_512( vhashX[16], hash0[16], hash1[16], hash2[16], hash3[16] );
+   intrlv_4x32_512( vhashX[17], hash0[17], hash1[17], hash2[17], hash3[17] );
+   intrlv_4x32_512( vhashX[18], hash0[18], hash1[18], hash2[18], hash3[18] );
+   intrlv_4x32_512( vhashX[19], hash0[19], hash1[19], hash2[19], hash3[19] );
+   intrlv_4x32_512( vhashX[20], hash0[20], hash1[20], hash2[20], hash3[20] );
+   intrlv_4x32_512( vhashX[21], hash0[21], hash1[21], hash2[21], hash3[21] );
+   intrlv_4x32_512( vhashX[22], hash0[22], hash1[22], hash2[22], hash3[22] );
+   intrlv_4x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23] );

   blake2s_4way_init( &ctx.blake2s, 32 );
-   blake2s_4way_full_blocks( &ctx.blake2s, vhash, vhashA, 64*24 );
-
-   dintrlv_4x32( &hash0[24], &hash1[24], &hash2[24], &hash3[24], vhash, 256 );
+   blake2s_4way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
+/*
+   dintrlv_4x32( hash0[24], hash1[24], hash2[24], hash3[24], vhash, 256 );
     
-	memcpy(output,    &hash0[24], 32);
-   memcpy(output+32, &hash1[24], 32);
-   memcpy(output+64, &hash2[24], 32);
-   memcpy(output+96, &hash3[24], 32);
+	memcpy(output,    hash0[24], 32);
+   memcpy(output+32, hash1[24], 32);
+   memcpy(output+64, hash2[24], 32);
+   memcpy(output+96, hash3[24], 32);
+*/
 }

 int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
@@ -365,11 +371,14 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
 {
   uint32_t hash[4*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 4;
   const int thr_id = mythr->id;
   const uint32_t Htarg = ptarget[7];

@@ -385,6 +394,16 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
      x25x_4way_hash( hash, vdata );

+      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
+      {
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+              }
+      }
+/*
      for ( int i = 0; i < 4; i++ )
      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
@@ -392,10 +411,11 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
         pdata[19] = n+i;
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
+*/
      n += 4;
-   } while ( likely( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ) );
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );

-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.3.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.6.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.10.3'
-PACKAGE_STRING='cpuminer-opt 3.10.3'
+PACKAGE_VERSION='3.10.6'
+PACKAGE_STRING='cpuminer-opt 3.10.6'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.10.3 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.10.6 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.10.3:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.10.6:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.10.3
+cpuminer-opt configure 3.10.6
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.10.3, which was
+It was created by cpuminer-opt $as_me 3.10.6, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.10.3'
+ VERSION='3.10.6'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.10.3, which was
+This file was extended by cpuminer-opt $as_me 3.10.6, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.10.3
+cpuminer-opt config.status 3.10.6
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.10.3])
+AC_INIT([cpuminer-opt], [3.10.6])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -872,6 +872,7 @@ static uint64_t accept_sum  = 0;
 static uint64_t reject_sum  = 0;
 static double   norm_diff_sum = 0.;
 static uint32_t last_block_height = 0;
+static bool     new_job = false;
 static double   last_targetdiff = 0.;
 static double   ref_rate_hi = 0.;
 static double   ref_rate_lo = 1e100;
@@ -887,6 +888,7 @@ struct share_stats_t
   double share_diff;
   double stratum_diff;
   double target_diff;
+   char   job_id[32];
 };

 #define s_stats_size 8
@@ -1093,8 +1095,9 @@ static int share_result( int result, struct work *null_work,
                       rejected_share_count, solved_block_count );

   if ( have_stratum && !opt_quiet )
-      applog2( LOG_INFO, "Share diff %.3g (%5f%%), block %d",
-               my_stats.share_diff, share_ratio, stratum.block_height );
+      applog2( LOG_INFO, "Share diff %.3g (%5f%%), block %d, job %s",
+               my_stats.share_diff, share_ratio, stratum.block_height,
+               my_stats.job_id );

   if ( reason )
   {
@@ -1762,6 +1765,7 @@ void work_set_target_ratio( struct work* work, uint32_t* hash )
   share_stats[ s_put_ptr ].net_diff = net_diff;
   share_stats[ s_put_ptr ].stratum_diff = stratum_diff;
   share_stats[ s_put_ptr ].target_diff = work->targetdiff;
+   snprintf( share_stats[ s_put_ptr ].job_id, sizeof( share_stats[ s_put_ptr ].job_id ), "%s", work->job_id ? work->job_id : "" );  // bounded copy: job_id is pool-supplied, may exceed the 32-byte field, and may be NULL

   s_put_ptr = stats_ptr_incr( s_put_ptr );

@@ -2586,28 +2590,38 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
     || ( last_block_height != sctx->block_height ) )
   {
       double hr = 0.;
-
+       new_job = false;
       pthread_mutex_lock( &stats_lock );

       for ( int i = 0; i < opt_n_threads; i++ )
          hr += thr_hashrates[i];
       global_hashrate = hr;
       pthread_mutex_unlock( &stats_lock );
-  
-       if ( stratum_diff != sctx->job.diff )
-          applog( LOG_BLUE, "New stratum difficulty" );
-       if ( last_block_height != sctx->block_height )
-          applog( LOG_BLUE, "New block" );
+
+       if ( !opt_quiet )
+       {
+          if ( stratum_diff != sctx->job.diff )
+             applog( LOG_BLUE, "New stratum diff %g, block %d, job %s",
+                   sctx->job.diff, sctx->block_height, g_work->job_id );
+          else if ( last_block_height != sctx->block_height )
+             applog( LOG_BLUE, "New block %d, job %s", sctx->block_height,
+                                                        g_work->job_id );
+          else
+             applog( LOG_BLUE,"New job %s.", g_work->job_id );
+       }

       // Update data and calculate new estimates.
       stratum_diff = sctx->job.diff;
       last_block_height = stratum.block_height;
       last_targetdiff = g_work->targetdiff;

-       applog2( LOG_INFO, "%s %s block %d", short_url,
-                algo_names[opt_algo], stratum.block_height );
-       applog2( LOG_INFO, "Diff: net %g, stratum %g, target %g",
-                net_diff, stratum_diff, last_targetdiff );
+       if ( !opt_quiet )
+       {
+          applog2( LOG_INFO, "%s %s block %d", short_url,
+                             algo_names[opt_algo], stratum.block_height );
+          applog2( LOG_INFO, "Diff: net %g, stratum %g, target %g",
+                             net_diff, stratum_diff, last_targetdiff );
+       }

       if ( hr > 0. )
       {
@@ -2619,10 +2633,13 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
          sprintf_et( share_ttf, last_targetdiff * diff_to_hash / hr );
          scale_hash_for_display ( &hr, hr_units );

-          applog2( LOG_INFO, "TTF @ %.2f %sh/s: block %s, share %s",
-                              hr, hr_units, block_ttf, share_ttf );
+          if ( !opt_quiet )
+          {   
+             applog2( LOG_INFO, "TTF @ %.2f %sh/s: block %s, share %s",
+                                hr, hr_units, block_ttf, share_ttf );
+          }
       }
-   }     
+   }  // new diff/block   
 }

 void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
@@ -2700,25 +2717,23 @@ static void *stratum_thread(void *userdata )
      if ( stratum.job.job_id
          && ( !g_work_time || strcmp( stratum.job.job_id, g_work.job_id ) ) )
      {
+         new_job = true;
         pthread_mutex_lock(&g_work_lock);
         algo_gate.stratum_gen_work( &stratum, &g_work );
         time(&g_work_time);
         pthread_mutex_unlock(&g_work_lock);
         restart_threads();

-/*
         if ( stratum.job.clean || jsonrpc_2 )
         {
-            static uint32_t last_block_height;
-            if ( last_block_height != stratum.block_height )
-            {
-               last_block_height = stratum.block_height;
+            if ( !opt_quiet && last_block_height && new_job
+               &&  ( last_block_height == stratum.block_height ) )
+            {  
+               new_job = false;
+               applog( LOG_BLUE,"New job %s", g_work.job_id );
            }
-
         }
-         else
-*/
-         if (opt_debug && !opt_quiet)
+         else if (opt_debug && !opt_quiet)
         {
            applog( LOG_BLUE, "%s asks job %d for block %d", short_url,
                strtoul( stratum.job.job_id, NULL, 16 ), stratum.block_height );
@@ -2960,9 +2975,7 @@ void parse_arg(int key, char *arg )
 			show_usage_and_exit(1);
 		opt_retries = v;
 		break;
-//	case 'R':
-//      applog(LOG_WARNING,"\n-R is no longer valid, use --retry-pause instead.");
-      case 1025:
+   case 1025:
      v = atoi(arg);
 		if (v < 1 || v > 9999) /* sanity check */
 			show_usage_and_exit(1);
@@ -3018,11 +3031,14 @@ void parse_arg(int key, char *arg )
 			*hp++ = '@';
 		} else
 			hp = ap;
-		if (ap != arg) {
-			if (strncasecmp(arg, "http://", 7) &&
-			    strncasecmp(arg, "https://", 8) &&
-			    strncasecmp(arg, "stratum+tcp://", 14)) {
-				fprintf(stderr, "unknown protocol -- '%s'\n", arg);
+		if ( ap != arg )
+      {
+			if ( strncasecmp( arg, "http://", 7 )
+           && strncasecmp( arg, "https://", 8 )
+           && strncasecmp( arg, "stratum+tcp://", 14 )
+           && strncasecmp( arg, "stratum+tcps://", 15 ) )
+         {
+            fprintf(stderr, "unknown protocol -- '%s'\n", arg);
 				show_usage_and_exit(1);
 			}
 			free(rpc_url);
@@ -3427,7 +3443,7 @@ bool check_cpu_capability ()
     else if ( sw_has_sse42  )    printf( " SSE4.2" );
     else if ( sw_has_sse2   )    printf( " SSE2  " );
     if      ( sw_has_vaes   )    printf( " VAES"   );
-     else if ( sw_has_aes    )    printf( " AES "   );
+     else if ( sw_has_aes    )    printf( "  AES"   );
     if      ( sw_has_sha    )    printf( " SHA"    );

     printf("\nAlgo features:");
@@ -3439,7 +3455,7 @@ bool check_cpu_capability ()
        else if ( algo_has_sse42  )    printf( " SSE4.2" );
        else if ( algo_has_sse2   )    printf( " SSE2  " );
        if      ( algo_has_vaes   )    printf( " VAES"   );
-        else if ( algo_has_aes    )    printf( " AES "   );
+        else if ( algo_has_aes    )    printf( "  AES"   );
        if      ( algo_has_sha    )    printf( " SHA"    );
     }
     printf("\n");
@@ -3619,7 +3635,9 @@ int main(int argc, char *argv[])
 	pthread_mutex_init( &stratum.sock_lock, NULL );
 	pthread_mutex_init( &stratum.work_lock, NULL );

-	flags = !opt_benchmark && strncmp( rpc_url, "https:", 6 )
+	flags = !opt_benchmark   // strip curl's SSL init only when the URL is neither https nor stratum+tcps
+           && ( strncmp( rpc_url, "https:", 6 )
+               && strncasecmp(rpc_url, "stratum+tcps://", 15 ) )
 	        ? ( CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL )
 	        : CURL_GLOBAL_ALL;
 	if ( curl_global_init( flags ) )
--- a/miner.h
+++ b/miner.h
@@ -874,9 +874,9 @@ Options:\n\
                          x16rt-veil    Veil (VEIL)\n\
                          x16s\n\
                          x17\n\
-                          x21s          Pigeoncoin (PGN)\n\
+                          x21s\n\
                          x22i\n\
-                          x25x          Sinovative (SIN)\n\
+                          x25x\n\
                          xevan         Bitsend (BSD)\n\
                          yescrypt      Globalboost-Y (BSTY)\n\
                          yescryptr8    BitZeny (ZNY)\n\
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -897,7 +897,7 @@ static inline void intrlv_16x32_512( void *dst, const void *s00,
   *( (uint32_t*)(d06) +(i) ) = s[ 6]; \
   *( (uint32_t*)(d07) +(i) ) = s[ 7]; \
   *( (uint32_t*)(d08) +(i) ) = s[ 8]; \
-   *( (uint32_t*)(d09) +(i) ) = s[ 0]; \
+   *( (uint32_t*)(d09) +(i) ) = s[ 9]; \
   *( (uint32_t*)(d10) +(i) ) = s[10]; \
   *( (uint32_t*)(d11) +(i) ) = s[11]; \
   *( (uint32_t*)(d12) +(i) ) = s[12]; \
@@ -2055,7 +2055,7 @@ static inline void intrlv_2x256( void *dst, const void *src0,
   if ( bit_len <= 512 ) return;
   d[4] = s0[2];
   if ( bit_len <= 640 ) return;
-   d[5] = s1[2];
+                      d[5] = s1[2];
   d[6] = s0[3];      d[7] = s1[3];
 }

@@ -2075,9 +2075,6 @@ static inline void dintrlv_2x256( void *dst0, void *dst1,
   d0[3] = s[6];      d1[3] = s[7];
 }

-
-
-
 #endif // AVX

 ///////////////////////////
@@ -2165,7 +2162,9 @@ static inline void rintrlv_4x32_4x64( void *dst,
   d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 5] );
   d[ 6] = _mm_unpacklo_epi32( s[ 6], s[ 7] );
   d[ 7] = _mm_unpackhi_epi32( s[ 6], s[ 7] );
+
   if ( bit_len <= 256 ) return;
+
   d[ 8] = _mm_unpacklo_epi32( s[ 8], s[ 9] );
   d[ 9] = _mm_unpackhi_epi32( s[ 8], s[ 9] );
   d[10] = _mm_unpacklo_epi32( s[10], s[11] );
@@ -2174,16 +2173,21 @@ static inline void rintrlv_4x32_4x64( void *dst,
   d[13] = _mm_unpackhi_epi32( s[12], s[13] );
   d[14] = _mm_unpacklo_epi32( s[14], s[15] );
   d[15] = _mm_unpackhi_epi32( s[14], s[15] );
+
   if ( bit_len <= 512 ) return;
+
   d[16] = _mm_unpacklo_epi32( s[16], s[17] );
   d[17] = _mm_unpackhi_epi32( s[16], s[17] );
   d[18] = _mm_unpacklo_epi32( s[18], s[19] );
   d[19] = _mm_unpackhi_epi32( s[18], s[19] );
+
   if ( bit_len <= 640 ) return;
+
   d[20] = _mm_unpacklo_epi32( s[20], s[21] );
   d[21] = _mm_unpackhi_epi32( s[20], s[21] );
   d[22] = _mm_unpacklo_epi32( s[22], s[23] );
   d[23] = _mm_unpackhi_epi32( s[22], s[23] );
+
   d[24] = _mm_unpacklo_epi32( s[24], s[25] );
   d[25] = _mm_unpackhi_epi32( s[24], s[25] );
   d[26] = _mm_unpacklo_epi32( s[26], s[27] );
@@ -2194,6 +2198,93 @@ static inline void rintrlv_4x32_4x64( void *dst,
   d[31] = _mm_unpackhi_epi32( s[30], s[31] );
 }

+// 8x32 -> 8x64: re-interleave src (32-bit elements) into dst (64-bit elements); bit_len selects how much is converted (256, 512, or all 64 vectors).
+
+static inline void rintrlv_8x32_8x64( void *dst,
+                                      const void *src, const int bit_len )
+{
+   __m128i *d = (__m128i*)dst;
+   const __m128i *s = (const __m128i*)src;
+
+   d[ 0] = _mm_unpacklo_epi32( s[ 0], s[ 2] );   // pairs 32-bit elements of s[0] and s[2] into 64-bit elements
+   d[ 1] = _mm_unpackhi_epi32( s[ 0], s[ 2] );   // s[0] is re-read after d[0] was stored, so dst must not alias src
+   d[ 2] = _mm_unpacklo_epi32( s[ 1], s[ 3] );
+   d[ 3] = _mm_unpackhi_epi32( s[ 1], s[ 3] );
+   d[ 4] = _mm_unpacklo_epi32( s[ 4], s[ 6] );
+   d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 6] );
+   d[ 6] = _mm_unpacklo_epi32( s[ 5], s[ 7] );
+   d[ 7] = _mm_unpackhi_epi32( s[ 5], s[ 7] );
+
+   d[ 8] = _mm_unpacklo_epi32( s[ 8], s[10] );
+   d[ 9] = _mm_unpackhi_epi32( s[ 8], s[10] );
+   d[10] = _mm_unpacklo_epi32( s[ 9], s[11] );
+   d[11] = _mm_unpackhi_epi32( s[ 9], s[11] );
+   d[12] = _mm_unpacklo_epi32( s[12], s[14] );
+   d[13] = _mm_unpackhi_epi32( s[12], s[14] );
+   d[14] = _mm_unpacklo_epi32( s[13], s[15] );
+   d[15] = _mm_unpackhi_epi32( s[13], s[15] );
+
+   if ( bit_len <= 256 ) return;   // d[0..15] written: 16 x 128 bits = 8 lanes x 256 bits
+
+   d[16] = _mm_unpacklo_epi32( s[16], s[18] );
+   d[17] = _mm_unpackhi_epi32( s[16], s[18] );
+   d[18] = _mm_unpacklo_epi32( s[17], s[19] );
+   d[19] = _mm_unpackhi_epi32( s[17], s[19] );
+   d[20] = _mm_unpacklo_epi32( s[20], s[22] );
+   d[21] = _mm_unpackhi_epi32( s[20], s[22] );
+   d[22] = _mm_unpacklo_epi32( s[21], s[23] );
+   d[23] = _mm_unpackhi_epi32( s[21], s[23] );
+
+   d[24] = _mm_unpacklo_epi32( s[24], s[26] );
+   d[25] = _mm_unpackhi_epi32( s[24], s[26] );
+   d[26] = _mm_unpacklo_epi32( s[25], s[27] );
+   d[27] = _mm_unpackhi_epi32( s[25], s[27] );
+   d[28] = _mm_unpacklo_epi32( s[28], s[30] );
+   d[29] = _mm_unpackhi_epi32( s[28], s[30] );
+   d[30] = _mm_unpacklo_epi32( s[29], s[31] );
+   d[31] = _mm_unpackhi_epi32( s[29], s[31] );
+
+   if ( bit_len <= 512 ) return;   // d[0..31] written: 8 lanes x 512 bits
+
+   d[32] = _mm_unpacklo_epi32( s[32], s[34] );   // any bit_len above 512 converts everything through d[63]; there is no intermediate cutoff
+   d[33] = _mm_unpackhi_epi32( s[32], s[34] );
+   d[34] = _mm_unpacklo_epi32( s[33], s[35] );
+   d[35] = _mm_unpackhi_epi32( s[33], s[35] );
+   d[36] = _mm_unpacklo_epi32( s[36], s[38] );
+   d[37] = _mm_unpackhi_epi32( s[36], s[38] );
+   d[38] = _mm_unpacklo_epi32( s[37], s[39] );
+   d[39] = _mm_unpackhi_epi32( s[37], s[39] );
+
+   d[40] = _mm_unpacklo_epi32( s[40], s[42] );
+   d[41] = _mm_unpackhi_epi32( s[40], s[42] );
+   d[42] = _mm_unpacklo_epi32( s[41], s[43] );
+   d[43] = _mm_unpackhi_epi32( s[41], s[43] );
+   d[44] = _mm_unpacklo_epi32( s[44], s[46] );
+   d[45] = _mm_unpackhi_epi32( s[44], s[46] );
+   d[46] = _mm_unpacklo_epi32( s[45], s[47] );
+   d[47] = _mm_unpackhi_epi32( s[45], s[47] );
+
+   d[48] = _mm_unpacklo_epi32( s[48], s[50] );
+   d[49] = _mm_unpackhi_epi32( s[48], s[50] );
+   d[50] = _mm_unpacklo_epi32( s[49], s[51] );
+   d[51] = _mm_unpackhi_epi32( s[49], s[51] );
+   d[52] = _mm_unpacklo_epi32( s[52], s[54] );
+   d[53] = _mm_unpackhi_epi32( s[52], s[54] );
+   d[54] = _mm_unpacklo_epi32( s[53], s[55] );
+   d[55] = _mm_unpackhi_epi32( s[53], s[55] );
+
+   d[56] = _mm_unpacklo_epi32( s[56], s[58] );
+   d[57] = _mm_unpackhi_epi32( s[56], s[58] );
+   d[58] = _mm_unpacklo_epi32( s[57], s[59] );
+   d[59] = _mm_unpackhi_epi32( s[57], s[59] );
+   d[60] = _mm_unpacklo_epi32( s[60], s[62] );
+   d[61] = _mm_unpackhi_epi32( s[60], s[62] );
+   d[62] = _mm_unpacklo_epi32( s[61], s[63] );
+   d[63] = _mm_unpackhi_epi32( s[61], s[63] );
+}
+
+
+
 /*
 #define RLEAVE_4x32_4x64(i) do \
 { \
@@ -2225,7 +2316,6 @@ static inline void rintrlv_4x32_4x64( void *dst,

 // 2x128 -> 4x64

-
 static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
                                       const void *src1, const int bit_len )
 {
@@ -2268,7 +2358,6 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
   d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
 }

-
 /*
 #define RLEAVE_2x128_4x64( i ) do \
 { \
@@ -2339,7 +2428,6 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
   d1[15] = _mm_unpackhi_epi64( s[29], s[31] );
 }

-
 /*
 #define RLEAVE_4x64_2x128( i ) do \
 { \
@@ -2364,6 +2452,354 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
 }
 */

+// 4x128 -> 8x64
+
+static inline void rintrlv_4x128_8x64( void *dst, const void *src0,
+                                       const void *src1, const int bit_len )
+{  // Builds dst from the 64-bit halves of consecutive vector pairs of src0 and src1.
+   __m128i *d = (__m128i*)dst;                // 16, 32 or 64 vectors written, by bit_len
+   const __m128i *s0 = (const __m128i*)src0;  // feeds d[4k] and d[4k+1]
+   const __m128i *s1 = (const __m128i*)src1;  // feeds d[4k+2] and d[4k+3]
+
+   d[ 0] = _mm_unpacklo_epi64( s0[ 0], s0[ 1] );  // d[0..3]: low halves of s0[0..3], s1[0..3]
+   d[ 1] = _mm_unpacklo_epi64( s0[ 2], s0[ 3] );
+   d[ 2] = _mm_unpacklo_epi64( s1[ 0], s1[ 1] );
+   d[ 3] = _mm_unpacklo_epi64( s1[ 2], s1[ 3] );
+   d[ 4] = _mm_unpackhi_epi64( s0[ 0], s0[ 1] );  // d[4..7]: the matching high halves
+   d[ 5] = _mm_unpackhi_epi64( s0[ 2], s0[ 3] );
+   d[ 6] = _mm_unpackhi_epi64( s1[ 0], s1[ 1] );
+   d[ 7] = _mm_unpackhi_epi64( s1[ 2], s1[ 3] );
+
+   d[ 8] = _mm_unpacklo_epi64( s0[ 4], s0[ 5] );  // same pattern, next four vectors of each source
+   d[ 9] = _mm_unpacklo_epi64( s0[ 6], s0[ 7] );
+   d[10] = _mm_unpacklo_epi64( s1[ 4], s1[ 5] );
+   d[11] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] );
+   d[12] = _mm_unpackhi_epi64( s0[ 4], s0[ 5] );
+   d[13] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] );
+   d[14] = _mm_unpackhi_epi64( s1[ 4], s1[ 5] );
+   d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] );
+
+   if ( bit_len <= 256 ) return;  // d[0..15] done, using s0[0..7] and s1[0..7]
+
+   d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] );
+   d[17] = _mm_unpacklo_epi64( s0[10], s0[11] );
+   d[18] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] );
+   d[19] = _mm_unpacklo_epi64( s1[10], s1[11] );
+   d[20] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] );
+   d[21] = _mm_unpackhi_epi64( s0[10], s0[11] );
+   d[22] = _mm_unpackhi_epi64( s1[ 8], s1[ 9] );
+   d[23] = _mm_unpackhi_epi64( s1[10], s1[11] );
+
+   d[24] = _mm_unpacklo_epi64( s0[12], s0[13] );
+   d[25] = _mm_unpacklo_epi64( s0[14], s0[15] );
+   d[26] = _mm_unpacklo_epi64( s1[12], s1[13] );
+   d[27] = _mm_unpacklo_epi64( s1[14], s1[15] );
+   d[28] = _mm_unpackhi_epi64( s0[12], s0[13] );
+   d[29] = _mm_unpackhi_epi64( s0[14], s0[15] );
+   d[30] = _mm_unpackhi_epi64( s1[12], s1[13] );
+   d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
+
+   if ( bit_len <= 512 ) return;  // d[0..31] done, using s0[0..15] and s1[0..15]
+
+   d[32] = _mm_unpacklo_epi64( s0[16], s0[17] );  // remaining half: d[32..63] from s0/s1[16..31]
+   d[33] = _mm_unpacklo_epi64( s0[18], s0[19] );
+   d[34] = _mm_unpacklo_epi64( s1[16], s1[17] );
+   d[35] = _mm_unpacklo_epi64( s1[18], s1[19] );
+   d[36] = _mm_unpackhi_epi64( s0[16], s0[17] );
+   d[37] = _mm_unpackhi_epi64( s0[18], s0[19] );
+   d[38] = _mm_unpackhi_epi64( s1[16], s1[17] );
+   d[39] = _mm_unpackhi_epi64( s1[18], s1[19] );
+
+   d[40] = _mm_unpacklo_epi64( s0[20], s0[21] );
+   d[41] = _mm_unpacklo_epi64( s0[22], s0[23] );
+   d[42] = _mm_unpacklo_epi64( s1[20], s1[21] );
+   d[43] = _mm_unpacklo_epi64( s1[22], s1[23] );
+   d[44] = _mm_unpackhi_epi64( s0[20], s0[21] );
+   d[45] = _mm_unpackhi_epi64( s0[22], s0[23] );
+   d[46] = _mm_unpackhi_epi64( s1[20], s1[21] );
+   d[47] = _mm_unpackhi_epi64( s1[22], s1[23] );
+
+   d[48] = _mm_unpacklo_epi64( s0[24], s0[25] );
+   d[49] = _mm_unpacklo_epi64( s0[26], s0[27] );
+   d[50] = _mm_unpacklo_epi64( s1[24], s1[25] );
+   d[51] = _mm_unpacklo_epi64( s1[26], s1[27] );
+   d[52] = _mm_unpackhi_epi64( s0[24], s0[25] );
+   d[53] = _mm_unpackhi_epi64( s0[26], s0[27] );
+   d[54] = _mm_unpackhi_epi64( s1[24], s1[25] );
+   d[55] = _mm_unpackhi_epi64( s1[26], s1[27] );
+
+   d[56] = _mm_unpacklo_epi64( s0[28], s0[29] );
+   d[57] = _mm_unpacklo_epi64( s0[30], s0[31] );
+   d[58] = _mm_unpacklo_epi64( s1[28], s1[29] );
+   d[59] = _mm_unpacklo_epi64( s1[30], s1[31] );
+   d[60] = _mm_unpackhi_epi64( s0[28], s0[29] );
+   d[61] = _mm_unpackhi_epi64( s0[30], s0[31] );
+   d[62] = _mm_unpackhi_epi64( s1[28], s1[29] );
+   d[63] = _mm_unpackhi_epi64( s1[30], s1[31] );
+}
+
+// 8x64 -> 4x128
+
+static inline void rintrlv_8x64_4x128( void *dst0, void *dst1,
+                                       const void *src, const int bit_len )
+{  // Splits src into dst0/dst1 by pairing s[i] with s[i+4] and regrouping their 64-bit halves.
+   __m128i *d0 = (__m128i*)dst0;            // built from s[8k+0], s[8k+1], s[8k+4], s[8k+5]
+   __m128i *d1 = (__m128i*)dst1;            // built from s[8k+2], s[8k+3], s[8k+6], s[8k+7]
+   const __m128i* s = (const __m128i*)src;  // 16, 32 or 64 vectors read, by bit_len
+
+   d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );  // { low half of s[0], low half of s[4] }
+   d0[ 1] = _mm_unpackhi_epi64( s[ 0], s[ 4] );  // { high half of s[0], high half of s[4] }
+   d1[ 0] = _mm_unpacklo_epi64( s[ 2], s[ 6] );
+   d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
+   d0[ 2] = _mm_unpacklo_epi64( s[ 1], s[ 5] );
+   d0[ 3] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
+   d1[ 2] = _mm_unpacklo_epi64( s[ 3], s[ 7] );
+   d1[ 3] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
+
+   d0[ 4] = _mm_unpacklo_epi64( s[ 8], s[12] );  // same pattern on the next eight source vectors
+   d0[ 5] = _mm_unpackhi_epi64( s[ 8], s[12] );
+   d1[ 4] = _mm_unpacklo_epi64( s[10], s[14] );
+   d1[ 5] = _mm_unpackhi_epi64( s[10], s[14] );
+   d0[ 6] = _mm_unpacklo_epi64( s[ 9], s[13] );
+   d0[ 7] = _mm_unpackhi_epi64( s[ 9], s[13] );
+   d1[ 6] = _mm_unpacklo_epi64( s[11], s[15] );
+   d1[ 7] = _mm_unpackhi_epi64( s[11], s[15] );
+
+   if ( bit_len <= 256 ) return;  // d0[0..7] and d1[0..7] done, using s[0..15]
+
+   d0[ 8] = _mm_unpacklo_epi64( s[16], s[20] );
+   d0[ 9] = _mm_unpackhi_epi64( s[16], s[20] );
+   d1[ 8] = _mm_unpacklo_epi64( s[18], s[22] );
+   d1[ 9] = _mm_unpackhi_epi64( s[18], s[22] );
+   d0[10] = _mm_unpacklo_epi64( s[17], s[21] );
+   d0[11] = _mm_unpackhi_epi64( s[17], s[21] );
+   d1[10] = _mm_unpacklo_epi64( s[19], s[23] );
+   d1[11] = _mm_unpackhi_epi64( s[19], s[23] );
+
+   d0[12] = _mm_unpacklo_epi64( s[24], s[28] );
+   d0[13] = _mm_unpackhi_epi64( s[24], s[28] );
+   d1[12] = _mm_unpacklo_epi64( s[26], s[30] );
+   d1[13] = _mm_unpackhi_epi64( s[26], s[30] );
+   d0[14] = _mm_unpacklo_epi64( s[25], s[29] );
+   d0[15] = _mm_unpackhi_epi64( s[25], s[29] );
+   d1[14] = _mm_unpacklo_epi64( s[27], s[31] );
+   d1[15] = _mm_unpackhi_epi64( s[27], s[31] );
+
+   if ( bit_len <= 512 ) return;  // d0[0..15] and d1[0..15] done, using s[0..31]
+
+   d0[16] = _mm_unpacklo_epi64( s[32], s[36] );  // remaining half: d0/d1[16..31] from s[32..63]
+   d0[17] = _mm_unpackhi_epi64( s[32], s[36] );
+   d1[16] = _mm_unpacklo_epi64( s[34], s[38] );
+   d1[17] = _mm_unpackhi_epi64( s[34], s[38] );
+   d0[18] = _mm_unpacklo_epi64( s[33], s[37] );
+   d0[19] = _mm_unpackhi_epi64( s[33], s[37] );
+   d1[18] = _mm_unpacklo_epi64( s[35], s[39] );
+   d1[19] = _mm_unpackhi_epi64( s[35], s[39] );
+
+   d0[20] = _mm_unpacklo_epi64( s[40], s[44] );
+   d0[21] = _mm_unpackhi_epi64( s[40], s[44] );
+   d1[20] = _mm_unpacklo_epi64( s[42], s[46] );
+   d1[21] = _mm_unpackhi_epi64( s[42], s[46] );
+   d0[22] = _mm_unpacklo_epi64( s[41], s[45] );
+   d0[23] = _mm_unpackhi_epi64( s[41], s[45] );
+   d1[22] = _mm_unpacklo_epi64( s[43], s[47] );
+   d1[23] = _mm_unpackhi_epi64( s[43], s[47] );
+
+   d0[24] = _mm_unpacklo_epi64( s[48], s[52] );
+   d0[25] = _mm_unpackhi_epi64( s[48], s[52] );
+   d1[24] = _mm_unpacklo_epi64( s[50], s[54] );
+   d1[25] = _mm_unpackhi_epi64( s[50], s[54] );
+   d0[26] = _mm_unpacklo_epi64( s[49], s[53] );
+   d0[27] = _mm_unpackhi_epi64( s[49], s[53] );
+   d1[26] = _mm_unpacklo_epi64( s[51], s[55] );
+   d1[27] = _mm_unpackhi_epi64( s[51], s[55] );
+
+   d0[28] = _mm_unpacklo_epi64( s[56], s[60] );
+   d0[29] = _mm_unpackhi_epi64( s[56], s[60] );
+   d1[28] = _mm_unpacklo_epi64( s[58], s[62] );
+   d1[29] = _mm_unpackhi_epi64( s[58], s[62] );
+   d0[30] = _mm_unpacklo_epi64( s[57], s[61] );
+   d0[31] = _mm_unpackhi_epi64( s[57], s[61] );
+   d1[30] = _mm_unpacklo_epi64( s[59], s[63] );
+   d1[31] = _mm_unpackhi_epi64( s[59], s[63] );
+}
+
+// 8x64 -> 2x256
+
+static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2,
+                          void *dst3,  const void *src, const int bit_len )
+{  // NOTE(review): not the inverse of rintrlv_2x256_8x64 (its src0[2] ends up in dst1[0]) - confirm layout.
+   __m128i *d0 = (__m128i*)dst0;  // low halves of pairs (s[8k],s[8k+4]) and (s[8k+2],s[8k+6])
+   __m128i *d1 = (__m128i*)dst1;  // high halves of those same pairs
+   __m128i *d2 = (__m128i*)dst2;  // low halves of pairs (s[8k+1],s[8k+5]) and (s[8k+3],s[8k+7])
+   __m128i *d3 = (__m128i*)dst3;  // high halves of those same pairs
+   const __m128i* s = (const __m128i*)src;  // 16, 32 or 64 vectors read, by bit_len
+
+   d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );  // { low half of s[0], low half of s[4] }
+   d1[ 0] = _mm_unpackhi_epi64( s[ 0], s[ 4] );  // { high half of s[0], high half of s[4] }
+   d2[ 0] = _mm_unpacklo_epi64( s[ 1], s[ 5] );   
+   d3[ 0] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
+   d0[ 1] = _mm_unpacklo_epi64( s[ 2], s[ 6] ); 
+   d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
+   d2[ 1] = _mm_unpacklo_epi64( s[ 3], s[ 7] ); 
+   d3[ 1] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
+   
+   d0[ 2] = _mm_unpacklo_epi64( s[ 8], s[12] ); 
+   d1[ 2] = _mm_unpackhi_epi64( s[ 8], s[12] );
+   d2[ 2] = _mm_unpacklo_epi64( s[ 9], s[13] ); 
+   d3[ 2] = _mm_unpackhi_epi64( s[ 9], s[13] );
+   d0[ 3] = _mm_unpacklo_epi64( s[10], s[14] );
+   d1[ 3] = _mm_unpackhi_epi64( s[10], s[14] );
+   d2[ 3] = _mm_unpacklo_epi64( s[11], s[15] );
+   d3[ 3] = _mm_unpackhi_epi64( s[11], s[15] );
+
+   if ( bit_len <= 256 ) return;  // d0..d3[0..3] done, using s[0..15]
+
+   d0[ 4] = _mm_unpacklo_epi64( s[16], s[20] );
+   d1[ 4] = _mm_unpackhi_epi64( s[16], s[20] );
+   d2[ 4] = _mm_unpacklo_epi64( s[17], s[21] );
+   d3[ 4] = _mm_unpackhi_epi64( s[17], s[21] );
+   d0[ 5] = _mm_unpacklo_epi64( s[18], s[22] );
+   d1[ 5] = _mm_unpackhi_epi64( s[18], s[22] );
+   d2[ 5] = _mm_unpacklo_epi64( s[19], s[23] );
+   d3[ 5] = _mm_unpackhi_epi64( s[19], s[23] );
+   
+   d0[ 6] = _mm_unpacklo_epi64( s[24], s[28] );
+   d1[ 6] = _mm_unpackhi_epi64( s[24], s[28] );
+   d2[ 6] = _mm_unpacklo_epi64( s[25], s[29] );
+   d3[ 6] = _mm_unpackhi_epi64( s[25], s[29] );
+   d0[ 7] = _mm_unpacklo_epi64( s[26], s[30] );
+   d1[ 7] = _mm_unpackhi_epi64( s[26], s[30] );
+   d2[ 7] = _mm_unpacklo_epi64( s[27], s[31] );
+   d3[ 7] = _mm_unpackhi_epi64( s[27], s[31] );
+
+   if ( bit_len <= 512 ) return;  // d0..d3[0..7] done, using s[0..31]
+
+   d0[ 8] = _mm_unpacklo_epi64( s[32], s[36] );  // remaining half: d0..d3[8..15] from s[32..63]
+   d1[ 8] = _mm_unpackhi_epi64( s[32], s[36] );
+   d2[ 8] = _mm_unpacklo_epi64( s[33], s[37] );
+   d3[ 8] = _mm_unpackhi_epi64( s[33], s[37] );
+   d0[ 9] = _mm_unpacklo_epi64( s[34], s[38] );
+   d1[ 9] = _mm_unpackhi_epi64( s[34], s[38] );
+   d2[ 9] = _mm_unpacklo_epi64( s[35], s[39] );
+   d3[ 9] = _mm_unpackhi_epi64( s[35], s[39] );
+
+   d0[10] = _mm_unpacklo_epi64( s[40], s[44] );
+   d1[10] = _mm_unpackhi_epi64( s[40], s[44] );
+   d2[10] = _mm_unpacklo_epi64( s[41], s[45] );
+   d3[10] = _mm_unpackhi_epi64( s[41], s[45] );
+   d0[11] = _mm_unpacklo_epi64( s[42], s[46] );
+   d1[11] = _mm_unpackhi_epi64( s[42], s[46] );
+   d2[11] = _mm_unpacklo_epi64( s[43], s[47] );
+   d3[11] = _mm_unpackhi_epi64( s[43], s[47] );
+
+   d0[12] = _mm_unpacklo_epi64( s[48], s[52] );
+   d1[12] = _mm_unpackhi_epi64( s[48], s[52] );
+   d2[12] = _mm_unpacklo_epi64( s[49], s[53] );
+   d3[12] = _mm_unpackhi_epi64( s[49], s[53] );
+   d0[13] = _mm_unpacklo_epi64( s[50], s[54] );
+   d1[13] = _mm_unpackhi_epi64( s[50], s[54] );
+   d2[13] = _mm_unpacklo_epi64( s[51], s[55] );
+   d3[13] = _mm_unpackhi_epi64( s[51], s[55] );
+
+   d0[14] = _mm_unpacklo_epi64( s[56], s[60] );
+   d1[14] = _mm_unpackhi_epi64( s[56], s[60] );
+   d2[14] = _mm_unpacklo_epi64( s[57], s[61] );
+   d3[14] = _mm_unpackhi_epi64( s[57], s[61] );
+   d0[15] = _mm_unpacklo_epi64( s[58], s[62] );
+   d1[15] = _mm_unpackhi_epi64( s[58], s[62] );
+   d2[15] = _mm_unpacklo_epi64( s[59], s[63] );
+   d3[15] = _mm_unpackhi_epi64( s[59], s[63] );
+}
+
+// 2x256 -> 8x64
+
+static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
+      const void *src1, const void *src2, const void *src3, const int bit_len )
+{  // Merges four sources into dst; vector d[i] is always built from source number i % 4.
+   __m128i *d = (__m128i*)dst;     // 16, 32 or 64 vectors written, by bit_len
+   __m128i *s0 = (__m128i*)src0;   // NOTE(review): these casts drop const; the sources are only read
+   __m128i *s1 = (__m128i*)src1;
+   __m128i *s2 = (__m128i*)src2;
+   __m128i *s3 = (__m128i*)src3;
+
+   d[ 0] = _mm_unpacklo_epi64( s0[0], s0[2] );  // d[0..3]: low halves of vectors 0 and 2 of each source
+   d[ 1] = _mm_unpacklo_epi64( s1[0], s1[2] );
+   d[ 2] = _mm_unpacklo_epi64( s2[0], s2[2] );
+   d[ 3] = _mm_unpacklo_epi64( s3[0], s3[2] );
+   d[ 4] = _mm_unpackhi_epi64( s0[0], s0[2] );  // d[4..7]: high halves of the same pairs
+   d[ 5] = _mm_unpackhi_epi64( s1[0], s1[2] );
+   d[ 6] = _mm_unpackhi_epi64( s2[0], s2[2] );
+   d[ 7] = _mm_unpackhi_epi64( s3[0], s3[2] );
+
+   d[ 8] = _mm_unpacklo_epi64( s0[1], s0[3] );  // d[8..15]: same again for vectors 1 and 3
+   d[ 9] = _mm_unpacklo_epi64( s1[1], s1[3] );
+   d[10] = _mm_unpacklo_epi64( s2[1], s2[3] );
+   d[11] = _mm_unpacklo_epi64( s3[1], s3[3] );
+   d[12] = _mm_unpackhi_epi64( s0[1], s0[3] );
+   d[13] = _mm_unpackhi_epi64( s1[1], s1[3] );
+   d[14] = _mm_unpackhi_epi64( s2[1], s2[3] );
+   d[15] = _mm_unpackhi_epi64( s3[1], s3[3] );
+
+   if ( bit_len <= 256 ) return;  // d[0..15] done, using vectors 0..3 of each source
+
+   d[16] = _mm_unpacklo_epi64( s0[4], s0[6] );
+   d[17] = _mm_unpacklo_epi64( s1[4], s1[6] );
+   d[18] = _mm_unpacklo_epi64( s2[4], s2[6] );
+   d[19] = _mm_unpacklo_epi64( s3[4], s3[6] );
+   d[20] = _mm_unpackhi_epi64( s0[4], s0[6] );
+   d[21] = _mm_unpackhi_epi64( s1[4], s1[6] );
+   d[22] = _mm_unpackhi_epi64( s2[4], s2[6] );
+   d[23] = _mm_unpackhi_epi64( s3[4], s3[6] );
+
+   d[24] = _mm_unpacklo_epi64( s0[5], s0[7] );
+   d[25] = _mm_unpacklo_epi64( s1[5], s1[7] );
+   d[26] = _mm_unpacklo_epi64( s2[5], s2[7] );
+   d[27] = _mm_unpacklo_epi64( s3[5], s3[7] );
+   d[28] = _mm_unpackhi_epi64( s0[5], s0[7] );
+   d[29] = _mm_unpackhi_epi64( s1[5], s1[7] );
+   d[30] = _mm_unpackhi_epi64( s2[5], s2[7] );
+   d[31] = _mm_unpackhi_epi64( s3[5], s3[7] );
+
+   if ( bit_len <= 512 ) return;  // d[0..31] done, using vectors 0..7 of each source
+
+   d[32] = _mm_unpacklo_epi64( s0[8], s0[10] );  // remaining half: d[32..63] from vectors 8..15
+   d[33] = _mm_unpacklo_epi64( s1[8], s1[10] );
+   d[34] = _mm_unpacklo_epi64( s2[8], s2[10] );
+   d[35] = _mm_unpacklo_epi64( s3[8], s3[10] );
+   d[36] = _mm_unpackhi_epi64( s0[8], s0[10] );
+   d[37] = _mm_unpackhi_epi64( s1[8], s1[10] );
+   d[38] = _mm_unpackhi_epi64( s2[8], s2[10] );
+   d[39] = _mm_unpackhi_epi64( s3[8], s3[10] );
+
+   d[40] = _mm_unpacklo_epi64( s0[9], s0[11] );
+   d[41] = _mm_unpacklo_epi64( s1[9], s1[11] );
+   d[42] = _mm_unpacklo_epi64( s2[9], s2[11] );
+   d[43] = _mm_unpacklo_epi64( s3[9], s3[11] );
+   d[44] = _mm_unpackhi_epi64( s0[9], s0[11] );
+   d[45] = _mm_unpackhi_epi64( s1[9], s1[11] );
+   d[46] = _mm_unpackhi_epi64( s2[9], s2[11] );
+   d[47] = _mm_unpackhi_epi64( s3[9], s3[11] );
+
+   d[48] = _mm_unpacklo_epi64( s0[12], s0[14] );
+   d[49] = _mm_unpacklo_epi64( s1[12], s1[14] );
+   d[50] = _mm_unpacklo_epi64( s2[12], s2[14] );
+   d[51] = _mm_unpacklo_epi64( s3[12], s3[14] );
+   d[52] = _mm_unpackhi_epi64( s0[12], s0[14] );
+   d[53] = _mm_unpackhi_epi64( s1[12], s1[14] );
+   d[54] = _mm_unpackhi_epi64( s2[12], s2[14] );
+   d[55] = _mm_unpackhi_epi64( s3[12], s3[14] );
+
+   d[56] = _mm_unpacklo_epi64( s0[13], s0[15] );
+   d[57] = _mm_unpacklo_epi64( s1[13], s1[15] );
+   d[58] = _mm_unpacklo_epi64( s2[13], s2[15] );
+   d[59] = _mm_unpacklo_epi64( s3[13], s3[15] );
+   d[60] = _mm_unpackhi_epi64( s0[13], s0[15] );
+   d[61] = _mm_unpackhi_epi64( s1[13], s1[15] );
+   d[62] = _mm_unpackhi_epi64( s2[13], s2[15] );
+   d[63] = _mm_unpackhi_epi64( s3[13], s3[15] );
+}
+
 //
 // Some functions customized for mining.

--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -113,6 +113,9 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
   m512_const1_64( ( ( ( (uint64_t)(i1) << 32 ) ) \
                     | ( (uint64_t)(i0) & 0xffffffff ) ) )

+// { m128_1, m128_1, m128_0, m128_0 }, i.e. presumably v1 in the upper 256 bits and v0 in the lower.
+#define m512_const_2x128( v1, v0 ) \
+   m512_mask_blend_epi64( 0x0f, m512_const1_128( v1 ), m512_const1_128( v0 ) )

 static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
                                      const uint64_t i1, const uint64_t i0 )
@@ -270,7 +273,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
               m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
                              0x28292a2b2c2d2e2f, 0x2021222324252627, \
                              0x18191a1b1c1d1e1f, 0x1011121314151617, \
-                              0x08090a0b0c0d0e0f, 0x0001020304050607 ))
+                              0x08090a0b0c0d0e0f, 0x0001020304050607 ) )

 #define mm512_bswap_32( v ) \
   _mm512_shuffle_epi8( v, \
--- a/util.c
+++ b/util.c
@@ -1069,7 +1069,7 @@ double target_to_diff(uint32_t* target)
 #define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK)
 #endif

-static bool send_line(curl_socket_t sock, char *s)
+static bool send_line( struct stratum_ctx *sctx, char *s )
 {
 	size_t sent = 0;
 	int len;
@@ -1077,24 +1077,35 @@ static bool send_line(curl_socket_t sock, char *s)
 	len = (int) strlen(s);
 	s[len++] = '\n';

-	while (len > 0) {
+	while ( len > 0 )
+   {
 		struct timeval timeout = {0, 0};
 		int n;
 		fd_set wd;

-		FD_ZERO(&wd);
-		FD_SET(sock, &wd);
-		if (select((int) (sock + 1), NULL, &wd, NULL, &timeout) < 1)
+		FD_ZERO( &wd );
+		FD_SET( sctx->sock, &wd );
+		if ( select( (int) ( sctx->sock + 1 ), NULL, &wd, NULL, &timeout ) < 1 )
 			return false;
-		n = send(sock, s + sent, len, 0);
-		if (n < 0) {
-			if (!socket_blocks())
-				return false;
-			n = 0;
-		}
+
+#if LIBCURL_VERSION_NUM >= 0x071802
+
+     CURLcode rc = curl_easy_send(sctx->curl, s + sent, len, (size_t *)&n);
+     if ( rc != CURLE_OK )
+     {
+        if ( rc != CURLE_AGAIN )
+#else                      
+     n = send(sock, s + sent, len, 0);
+     if ( n < 0 )
+     {
+     if ( !socket_blocks() )
+#endif
+        return false;
+	     n = 0;
+	  }
 		sent += n;
 		len -= n;
-	}
+   }

 	return true;
 }
@@ -1107,7 +1118,7 @@ bool stratum_send_line(struct stratum_ctx *sctx, char *s)
 		applog(LOG_DEBUG, "> %s", s);

 	pthread_mutex_lock(&sctx->sock_lock);
-	ret = send_line(sctx->sock, s);
+	ret = send_line( sctx, s );
 	pthread_mutex_unlock(&sctx->sock_lock);

 	return ret;
@@ -1167,14 +1178,27 @@ char *stratum_recv_line(struct stratum_ctx *sctx)
 			ssize_t n;

 			memset(s, 0, RBUFSIZE);
-			n = recv(sctx->sock, s, RECVSIZE, 0);
+
+#if LIBCURL_VERSION_NUM >= 0x071802
+
+			CURLcode rc = curl_easy_recv(sctx->curl, s, RECVSIZE, (size_t *)&n);
+			if (rc == CURLE_OK && !n) {
+				ret = false;
+				break;
+			}
+			if (rc != CURLE_OK) {
+				if (rc != CURLE_AGAIN || !socket_full(sctx->sock, 1)) {
+#else
+
+         n = recv(sctx->sock, s, RECVSIZE, 0);
 			if (!n) {
 				ret = false;
 				break;
 			}
 			if (n < 0) {
 				if (!socket_blocks() || !socket_full(sctx->sock, 1)) {
-					ret = false;
+#endif
+               ret = false;
 					break;
 				}
 			} else
@@ -1244,7 +1268,9 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
 	}
 	free(sctx->curl_url);
 	sctx->curl_url = (char*) malloc(strlen(url));
-	sprintf(sctx->curl_url, "http%s", strstr(url, "://"));
+	sprintf( sctx->curl_url, "http%s", strstr( url, "s://" ) 
+                              ? strstr( url, "s://" )
+                              : strstr (url, "://"  ) );

 	if (opt_protocol)
 		curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
@@ -1254,7 +1280,9 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
 	curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str);
 	curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
 	curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
-	if (opt_proxy) {
+	curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
+	curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
+   if (opt_proxy) {
 		curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy);
 		curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type);
 	}
@@ -1954,7 +1982,9 @@ static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params)
 		return false;

 	url = (char*) malloc(32 + strlen(host));
-	sprintf(url, "stratum+tcp://%s:%d", host, port);
+
+	strncpy( url, sctx->url, 15 );
+	sprintf( strstr( url, "://" ) + 3, "%s:%d", host, port );

 	if (!opt_redirect) {
 		applog(LOG_INFO, "Ignoring request to reconnect to %s", url);
Author	SHA1	Message	Date
Jay D Dee	241bc26767	v3.10.6	2019-12-25 01:26:26 -05:00
Jay D Dee	c65b0ff7a6	v3.10.5	2019-12-21 13:19:29 -05:00