v3.10.6

2025-09-17 23:44:27 +00:00 · 2019-12-25 01:26:26 -05:00
parent c65b0ff7a6
commit 241bc26767
35 changed files with 3036 additions and 643 deletions
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -463,6 +463,38 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
   return 0;
 }

+// Update and final in one call, for when inlen is a multiple of 64 bytes.
+int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
+                              const void *input, uint64_t inlen )
+{
+    __m256i *in = (__m256i*)input;
+    __m256i *buf = (__m256i*)S->buf;
+
+    while( inlen > BLAKE2S_BLOCKBYTES )   // every block except the last one
+    {
+       memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
+       S->buflen = BLAKE2S_BLOCKBYTES;
+       inlen -= BLAKE2S_BLOCKBYTES;
+       S->t[0] += BLAKE2S_BLOCKBYTES;
+       S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );   // carry when t[0] wraps
+       blake2s_8way_compress( S, buf );
+       S->buflen = 0;
+       in += ( BLAKE2S_BLOCKBYTES >> 2 );   // same count that memcpy_256 was given
+    }
+
+    // last block: consumed unconditionally. NOTE(review): inlen == 0 still reads one block from input.
+    memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
+    S->buflen = BLAKE2S_BLOCKBYTES;
+    S->t[0] += S->buflen;
+    S->t[1] += ( S->t[0] < S->buflen );
+    if ( S->last_node )  S->f[1] = ~0U;
+    S->f[0] = ~0U;   // finalization flag must be set before the last compress
+    blake2s_8way_compress( S, buf );
+
+    for ( int i = 0; i < 8; ++i )   // copy the eight h vectors to out
+      casti_m256i( out, i ) = S->h[ i ];
+    return 0;
+}

 #endif // __AVX2__

--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -95,8 +95,8 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
 int blake2s_8way_update( blake2s_8way_state *S, const void *in,
                         uint64_t inlen );
 int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
-//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
-//                              const void *input, uint64_t inlen );
+int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
+                              const void *input, uint64_t inlen );

 #endif

--- a/algo/echo/echo-hash-4way.c
+++ b/algo/echo/echo-hash-4way.c
@@ -0,0 +1,559 @@
+#if defined(__AVX512VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#include "simd-utils.h"
+#include "echo-hash-4way.h"
+
+/*
+#include <memory.h>
+#include "miner.h"
+#include "hash_api.h"
+//#include "vperm.h"
+#include <immintrin.h>
+*/
+/*
+#ifndef NO_AES_NI
+#include <wmmintrin.h>
+#else
+#include <tmmintrin.h>
+#endif
+*/
+
+// not used
+/*
+const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
+const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
+const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
+const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
+const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
+const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
+const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
+const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
+const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
+const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
+const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
+const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
+const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
+const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
+const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
+const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
+const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
+const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
+const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
+const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
+const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
+*/
+
+/*
+MYALIGN const unsigned int 	const1[]		= {0x00000001, 0x00000000, 0x00000000, 0x00000000};
+MYALIGN const unsigned int	mul2mask[]		= {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
+MYALIGN const unsigned int	lsbmask[]		= {0x01010101, 0x01010101, 0x01010101, 0x01010101};
+MYALIGN const unsigned int	invshiftrows[]	= {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
+MYALIGN const unsigned int	zero[]			= {0x00000000, 0x00000000, 0x00000000, 0x00000000};
+*/
+
+MYALIGN const unsigned int	mul2ipt[]		= {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
+// NOTE(review): mul2ipt is not referenced in the visible code; MYALIGN is not defined in the visible headers.
+// Do these need to be reversed?  ECHO_MIXBYTES indexes mul2mask with a 0/1 byte via
+// _mm512_shuffle_epi8; confirm the argument order of m512_const4_32 against that use.
+#define mul2mask \
+   m512_const4_32( 0x00001b00, 0, 0, 0 ) 
+// One set bit in the low position of every byte.
+#define lsbmask    m512_const1_32( 0x01010101 ) 
+
+#define ECHO_SUBBYTES( state, i, j ) \
+	state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
+	state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \
+	k1 = _mm512_add_epi32( k1, m512_one_32 )
+
+#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
+{ /* state2[.][j] = mix of state1[0][j], [1][j+1], [2][j+2], [3][j+3]; t1,t2,s2 are scratch */ \
+   const int j1 = ( (j) + 1 ) & 3; \
+   const int j2 = ( (j) + 2 ) & 3; \
+   const int j3 = ( (j) + 3 ) & 3; \
+   s2 = _mm512_add_epi8( state1[ 0 ][ j ], state1[ 0 ][ j ] );  /* x + x per byte */ \
+   t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
+   t1 = _mm512_and_si512( t1, lsbmask );       /* top bit of each byte, as 0 or 1 */ \
+   t2 = _mm512_shuffle_epi8( mul2mask, t1 );   /* look up the correction byte */ \
+   s2 = _mm512_xor_si512( s2, t2 );            /* s2 = doubled input */ \
+   state2[ 0 ][ j ] = s2; \
+   state2[ 1 ][ j ] = state1[ 0 ][ j ]; \
+   state2[ 2 ][ j ] = state1[ 0 ][ j ]; \
+   state2[ 3 ][ j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] ); \
+   s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
+   t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
+   t1 = _mm512_and_si512( t1, lsbmask ); \
+   t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
+   s2 = _mm512_xor_si512( s2, t2 ); \
+   state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
+                            _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
+   state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
+   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
+   state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
+   s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
+   t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
+   t1 = _mm512_and_si512( t1, lsbmask ); \
+   t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
+   s2 = _mm512_xor_si512( s2, t2 ); \
+   state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
+   state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
+                            _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
+   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
+   state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 2 ][ j2 ] ); \
+   s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
+   t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
+   t1 = _mm512_and_si512( t1, lsbmask ); \
+   t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
+   s2 = _mm512_xor_si512( s2, t2 ); \
+   state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
+   state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
+   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
+                            _mm512_xor_si512( s2, state1[ 3 ][ j3 ] ) ); \
+   state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
+} while(0)
+
+#define ECHO_ROUND_UNROLL2 /* two rounds: _state -> _state2 -> _state; needs k1,t1,t2,s2 in scope */ \
+	ECHO_SUBBYTES(_state, 0, 0);\
+	ECHO_SUBBYTES(_state, 1, 0);\
+	ECHO_SUBBYTES(_state, 2, 0);\
+	ECHO_SUBBYTES(_state, 3, 0);\
+	ECHO_SUBBYTES(_state, 0, 1);\
+	ECHO_SUBBYTES(_state, 1, 1);\
+	ECHO_SUBBYTES(_state, 2, 1);\
+	ECHO_SUBBYTES(_state, 3, 1);\
+	ECHO_SUBBYTES(_state, 0, 2);\
+	ECHO_SUBBYTES(_state, 1, 2);\
+	ECHO_SUBBYTES(_state, 2, 2);\
+	ECHO_SUBBYTES(_state, 3, 2);\
+	ECHO_SUBBYTES(_state, 0, 3);\
+	ECHO_SUBBYTES(_state, 1, 3);\
+	ECHO_SUBBYTES(_state, 2, 3);\
+	ECHO_SUBBYTES(_state, 3, 3);\
+	ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
+	ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
+	ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
+	ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
+	ECHO_SUBBYTES(_state2, 0, 0); /* second round works on _state2; k1 keeps counting */ \
+	ECHO_SUBBYTES(_state2, 1, 0);\
+	ECHO_SUBBYTES(_state2, 2, 0);\
+	ECHO_SUBBYTES(_state2, 3, 0);\
+	ECHO_SUBBYTES(_state2, 0, 1);\
+	ECHO_SUBBYTES(_state2, 1, 1);\
+	ECHO_SUBBYTES(_state2, 2, 1);\
+	ECHO_SUBBYTES(_state2, 3, 1);\
+	ECHO_SUBBYTES(_state2, 0, 2);\
+	ECHO_SUBBYTES(_state2, 1, 2);\
+	ECHO_SUBBYTES(_state2, 2, 2);\
+	ECHO_SUBBYTES(_state2, 3, 2);\
+	ECHO_SUBBYTES(_state2, 0, 3);\
+	ECHO_SUBBYTES(_state2, 1, 3);\
+	ECHO_SUBBYTES(_state2, 2, 3);\
+	ECHO_SUBBYTES(_state2, 3, 3);\
+	ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
+	ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
+	ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
+	ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+
+
+
+#define SAVESTATE(dst, src) /* element-wise copy of a 4x4 array; not wrapped in do/while */ \
+	dst[0][0] = src[0][0];\
+	dst[0][1] = src[0][1];\
+	dst[0][2] = src[0][2];\
+	dst[0][3] = src[0][3];\
+	dst[1][0] = src[1][0];\
+	dst[1][1] = src[1][1];\
+	dst[1][2] = src[1][2];\
+	dst[1][3] = src[1][3];\
+	dst[2][0] = src[2][0];\
+	dst[2][1] = src[2][1];\
+	dst[2][2] = src[2][2];\
+	dst[2][3] = src[2][3];\
+	dst[3][0] = src[3][0];\
+	dst[3][1] = src[3][1];\
+	dst[3][2] = src[3][2];\
+	dst[3][3] = src[3][3]
+
+
+void echo_4way_compress( echo_4way_context *ctx, const unsigned char *pmsg,
+               unsigned int uBlockCount )
+{  // Runs uBlockCount compressions over pmsg and writes the result back to ctx->state.
+  unsigned int r, b, i, j;
+  __m512i t1, t2, s2, k1;
+  __m512i _state[4][4], _state2[4][4], _statebackup[4][4]; 
+
+// Copy the chaining columns (j < uHashSize / 256) from the context into locals.
+  for ( i = 0; i < 4; i++ )
+  for ( j = 0; j < ctx->uHashSize / 256; j++ )
+	 _state[ i ][ j ] = ctx->state[ i ][ j ];
+
+  for ( b = 0; b < uBlockCount; b++ )
+  {
+    ctx->k = _mm512_add_epi64( ctx->k, ctx->const1536 );   // advance the counter by one block
+
+    // Fill the remaining columns from the message. TODO: make aligned, remove loadu
+    for( j = ctx->uHashSize / 256; j < 4; j++ )
+    {
+      for ( i = 0; i < 4; i++ )
+	   {
+        _state[ i ][ j ] = _mm512_loadu_si512( 
+                     (__m512i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
+	   }
+	 }
+
+    // save state: input of this block, needed for the feed-forward below
+	 SAVESTATE( _statebackup, _state );
+
+	 k1 = ctx->k;   // ECHO_SUBBYTES increments k1, not ctx->k
+
+	 for ( r = 0; r < ctx->uRounds / 2; r++ )   // each expansion is two rounds
+	 {
+		ECHO_ROUND_UNROLL2;
+	 }
+		
+	 if ( ctx->uHashSize == 256 )   // fold all four columns, old and new, into column 0
+	 {
+	   for ( i = 0; i < 4; i++ )
+	   {
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _state[ i ][ 1 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _state[ i ][ 2 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _state[ i ][ 3 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _statebackup[ i ][ 0 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _statebackup[ i ][ 1 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _statebackup[ i ][ 2 ] ) ;
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _statebackup[ i ][ 3 ] );
+	   }
+	 }
+	 else   // columns 0 and 2 fold into 0, columns 1 and 3 fold into 1
+	 {
+	   for ( i = 0; i < 4; i++ )
+	   {
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _state[ i ][ 2 ] );
+		   _state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
+                                              _state[ i ][ 3 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
+                                              _statebackup[ i ][ 0 ] );
+		   _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ] [0 ],
+                                              _statebackup[ i ][ 2 ] );
+		   _state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
+                                              _statebackup[ i ][ 1 ] );
+		   _state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
+                                              _statebackup[ i ][ 3 ] );
+      }
+	 }
+    pmsg += ctx->uBlockLength;   // NOTE(review): loads above read 4*(4 - uHashSize/256) 64-byte vectors per block; confirm this advance
+  }
+  SAVESTATE(ctx->state, _state);   // writes all 16 cells back, including message columns
+
+}
+
+
+
+int echo_4way_init( echo_4way_context *ctx, int nHashSize )
+{  // Resets ctx for a 256- or 512-bit hash. Returns SUCCESS, or BAD_HASHBITLEN for any other size.
+	int i, j;
+
+   ctx->k = m512_zero; 
+	ctx->processed_bits = 0;
+	ctx->uBufferBytes = 0;
+
+	switch( nHashSize )
+	{
+		case 256:
+			ctx->uHashSize = 256;
+			ctx->uBlockLength = 192;
+			ctx->uRounds = 8;
+			ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
+			ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
+			break;
+
+		case 512:
+			ctx->uHashSize = 512;
+			ctx->uBlockLength = 128;
+			ctx->uRounds = 10;
+			ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
+			ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400 );
+			break;
+
+		default:
+			return BAD_HASHBITLEN;
+	}
+
+	// _mm512_set4_epi32 is used above so the constants share the layout of
+	// remainingbits in the close functions, which is added to and subtracted from k.
+	for( i = 0; i < 4; i++ )   // chaining columns start as the hash size
+		for( j = 0; j < nHashSize / 256; j++ )
+			ctx->state[ i ][ j ] = ctx->hashsize;
+
+	for( i = 0; i < 4; i++ )   // message columns start as zero
+		for( j = nHashSize / 256; j < 4; j++ )
+			ctx->state[ i ][ j ] = m512_zero;
+
+	return SUCCESS;
+}
+
+int echo_4way_update( echo_4way_context *state, const BitSequence *data, DataLength databitlen )
+{  // Absorbs databitlen / 8 bytes: compresses whole blocks, buffers the tail. Always returns 0.
+	unsigned int uByteLength, uBlockCount, uRemainingBytes;
+	// NOTE(review): echo-hash-4way.h declares this with (const void *, unsigned int); confirm the types agree.
+	uByteLength = (unsigned int)(databitlen / 8);
+
+	if ( ( state->uBufferBytes + uByteLength ) >= state->uBlockLength )
+	{
+		if ( state->uBufferBytes != 0 )
+		{
+			// Fill the buffer. NOTE(review): buffer is __m512i[] in the header, so "+ uBufferBytes" steps whole vectors.
+			memcpy( state->buffer + state->uBufferBytes,
+               (void*)data, state->uBlockLength - state->uBufferBytes );
+
+			// Process buffer
+			echo_4way_compress( state, state->buffer, 1 );
+			state->processed_bits += state->uBlockLength * 8;
+
+			data += state->uBlockLength - state->uBufferBytes;
+			uByteLength -= state->uBlockLength - state->uBufferBytes;
+		}
+
+		// buffer now does not contain any unprocessed bytes
+
+		uBlockCount = uByteLength / state->uBlockLength;
+		uRemainingBytes = uByteLength % state->uBlockLength;
+
+		if ( uBlockCount > 0 )
+		{
+			echo_4way_compress( state, data, uBlockCount );   // whole blocks straight from the input
+
+			state->processed_bits += uBlockCount * state->uBlockLength * 8;
+			data += uBlockCount * state->uBlockLength;
+		}
+
+		if ( uRemainingBytes > 0 )
+		{
+			memcpy( state->buffer, (void*)data, uRemainingBytes );
+		}
+
+		state->uBufferBytes = uRemainingBytes;   // also clears the count when nothing is left over
+	}
+	else
+	{
+		memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );   // not enough for a block yet
+		state->uBufferBytes += uByteLength;
+	}
+
+	return 0;
+}
+
+int echo_4way_close( echo_4way_context *state, BitSequence *hashval )   // pads, compresses, stores the hash; returns 0
+{
+	__m512i remainingbits;
+
+	// Add remaining bytes in the buffer
+	state->processed_bits += state->uBufferBytes * 8;
+
+	remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 );
+
+	// Pad with 0x80. NOTE(review): buffer is __m512i[] in the header; byte-style indexing here needs confirming.
+	state->buffer[ state->uBufferBytes++ ] = 0x80;
+	
+	// Enough buffer space for padding in this block?
+	if ( ( state->uBlockLength - state->uBufferBytes ) >= 18)
+	{
+		// Pad with zeros
+		memset( state->buffer + state->uBufferBytes, 0,
+                         state->uBlockLength - ( state->uBufferBytes + 18 ) );
+
+		// Hash size
+		*( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) )
+           = state->uHashSize;
+
+		// Processed bits
+		*( ( DataLength*)( state->buffer + state->uBlockLength - 16 ) )
+           = state->processed_bits;
+		*( ( DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0;
+
+		// Last block contains message bits?
+		if ( state->uBufferBytes == 1 )
+		{
+			state->k = _mm512_xor_si512( state->k, state->k );   // k = 0
+			state->k = _mm512_sub_epi64( state->k, state->const1536 );   // compress adds const1536 back
+		}
+		else
+		{
+			state->k = _mm512_add_epi64( state->k, remainingbits );
+			state->k = _mm512_sub_epi64( state->k, state->const1536 );
+		}
+
+		// Compress
+		echo_4way_compress( state, state->buffer, 1 );
+	}
+	else
+	{
+		// Fill with zero and compress
+		memset( state->buffer + state->uBufferBytes, 0,
+                        state->uBlockLength - state->uBufferBytes );
+		state->k = _mm512_add_epi64( state->k, remainingbits );
+		state->k = _mm512_sub_epi64( state->k, state->const1536 );
+		echo_4way_compress( state, state->buffer, 1 );
+
+		// Last block
+		memset( state->buffer, 0, state->uBlockLength - 18 );
+
+		// Hash size
+		*( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) )
+            = state->uHashSize;
+
+		// Processed bits
+		*( (DataLength*)( state->buffer + state->uBlockLength - 16 ) )
+            = state->processed_bits;
+		*( (DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0;
+
+		// Compress the last block
+		state->k = _mm512_xor_si512(state->k, state->k);
+		state->k = _mm512_sub_epi64(state->k, state->const1536);
+		echo_4way_compress(state, state->buffer, 1);
+	}
+
+	// Store the hash value
+	_mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0][ 0 ]);
+	_mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1][ 0 ]);
+
+	if ( state->uHashSize == 512 )
+	{
+		_mm512_storeu_si512((__m512i*)hashval + 2, state->state[ 2 ][ 0 ]);
+		_mm512_storeu_si512((__m512i*)hashval + 3, state->state[ 3 ][ 0 ]);
+	}
+
+	return 0;
+}
+
+int echo_4way_update_close( echo_4way_context *state, BitSequence *hashval,
+                              const BitSequence *data, DataLength databitlen )
+{  // Absorbs the data, then pads, compresses and stores the hash in one call. Always returns 0.
+  unsigned int uByteLength, uBlockCount, uRemainingBytes;
+
+  uByteLength = (unsigned int)(databitlen / 8);
+
+  if ( (state->uBufferBytes + uByteLength) >= state->uBlockLength )
+  {
+     if ( state->uBufferBytes != 0 )
+     {
+        // Fill the buffer. NOTE(review): buffer is __m512i[] in the header, so "+ uBufferBytes" steps whole vectors.
+        memcpy( state->buffer + state->uBufferBytes,
+                   (void*)data, state->uBlockLength - state->uBufferBytes );
+
+        // Process buffer
+        echo_4way_compress( state, state->buffer, 1 );
+        state->processed_bits += state->uBlockLength * 8;
+
+        data += state->uBlockLength - state->uBufferBytes;
+        uByteLength -= state->uBlockLength - state->uBufferBytes;
+     }
+
+     // buffer now does not contain any unprocessed bytes
+
+     uBlockCount = uByteLength / state->uBlockLength;
+     uRemainingBytes = uByteLength % state->uBlockLength;
+
+     if ( uBlockCount > 0 )
+     {
+        echo_4way_compress( state, data, uBlockCount );
+        state->processed_bits += uBlockCount * state->uBlockLength * 8;
+        data += uBlockCount * state->uBlockLength;
+     }
+
+     if ( uRemainingBytes > 0 )
+        memcpy( state->buffer, (void*)data, uRemainingBytes );
+     state->uBufferBytes = uRemainingBytes;   // unconditional, as in echo_4way_update
+  }
+  else
+  {
+     memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
+     state->uBufferBytes += uByteLength;
+  } 
+
+  __m512i remainingbits;
+
+  // Add remaining bytes in the buffer
+  state->processed_bits += state->uBufferBytes * 8;
+
+  remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 );
+
+  // Pad with 0x80
+  state->buffer[ state->uBufferBytes++ ] = 0x80;
+  // Enough buffer space for padding in this block?
+  if ( (state->uBlockLength - state->uBufferBytes) >= 18 )
+   {
+     // Pad with zeros
+     memset( state->buffer + state->uBufferBytes, 0,
+                        state->uBlockLength - (state->uBufferBytes + 18) );
+
+     // Hash size
+     *( (unsigned short*)(state->buffer + state->uBlockLength - 18) )
+                   = state->uHashSize;
+
+     // Processed bits
+     *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
+                   state->processed_bits;
+     *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
+
+     // Last block contains message bits?
+     if( state->uBufferBytes == 1 )
+     {
+        state->k = _mm512_xor_si512( state->k, state->k );   // k = 0
+        state->k = _mm512_sub_epi64( state->k, state->const1536 );   // compress adds const1536 back
+     }
+     else
+     {
+        state->k = _mm512_add_epi64( state->k, remainingbits );
+        state->k = _mm512_sub_epi64( state->k, state->const1536 );
+     }
+
+     // Compress
+     echo_4way_compress( state, state->buffer, 1 );
+  }
+  else
+  {
+     // Fill with zero and compress
+     memset( state->buffer + state->uBufferBytes, 0,
+                state->uBlockLength - state->uBufferBytes );
+     state->k = _mm512_add_epi64( state->k, remainingbits );
+     state->k = _mm512_sub_epi64( state->k, state->const1536 );
+     echo_4way_compress( state, state->buffer, 1 );
+
+     // Last block
+     memset( state->buffer, 0, state->uBlockLength - 18 );
+
+     // Hash size
+     *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
+                 state->uHashSize;
+
+     // Processed bits
+     *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
+                  state->processed_bits;
+     *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
+     // Compress the last block
+     state->k = _mm512_xor_si512( state->k, state->k );
+     state->k = _mm512_sub_epi64( state->k, state->const1536 );
+     echo_4way_compress( state, state->buffer, 1 );
+  }
+
+  // Store the hash value
+  _mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0 ] );
+  _mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0 ] );
+
+  if ( state->uHashSize == 512 )
+  {
+     _mm512_storeu_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
+     _mm512_storeu_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
+
+  }
+  return 0;
+}
+
+#endif
--- a/algo/echo/echo-hash-4way.h
+++ b/algo/echo/echo-hash-4way.h
@@ -0,0 +1,36 @@
+#if !defined(ECHO_HASH_4WAY_H__)
+#define ECHO_HASH_4WAY_H__ 1
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#include "simd-utils.h"
+
+typedef struct
+{
+   __m512i    state[4][4];
+   __m512i    buffer[ 4 * 192 / 16 ];  // 4x128 interleaved 192 bytes. NOTE(review): this is 48 vectors of 64 bytes; 4 * 192 bytes would be 12 -- confirm.
+   __m512i    k;            // block counter; advanced by const1536 in echo_4way_compress
+   __m512i    hashsize;     // initial value of the chaining columns
+   __m512i    const1536;    // per-block counter increment chosen in echo_4way_init
+
+   unsigned int   uRounds;
+   unsigned int   uHashSize;       // 256 or 512
+   unsigned int   uBlockLength;    // 192 or 128, set with uHashSize
+   unsigned int   uBufferBytes;    // bytes currently held in buffer
+   unsigned int   processed_bits;
+
+} echo_4way_context __attribute__ ((aligned (64)));
+
+int echo_4way_init( echo_4way_context *state, int hashbitlen );
+
+// Streaming interface: update absorbs data, close pads and writes the hash.
+int echo_4way_update( echo_4way_context *state, const void *data,
+    unsigned int databitlen);
+
+int echo_4way_close( echo_4way_context *state, void *hashval );
+
+int echo_4way_update_close( echo_4way_context *state, void *hashval,
+                              const void *data, int databitlen );
+
+#endif 
+#endif
--- a/algo/groestl/aes_ni/groestl-version.h
+++ b/algo/groestl/aes_ni/groestl-version.h
@@ -9,6 +9,7 @@

 //#ifndef NO_AES_NI

+// Not to be confused with AVX512VAES
 #define VAES
 // #define VAVX
 // #define VVPERM
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -1,15 +1,206 @@
 #include "lyra2-gate.h"
 #include <memory.h>
 #include <mm_malloc.h>
-
-#if defined (ALLIUM_4WAY)	
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/groestl/aes_ni/hash-groestl256.h"

+#if defined (ALLIUM_8WAY)  
+
+typedef struct {   // one context per hashing stage of allium_8way_hash
+   blake256_8way_context     blake;
+   keccak256_8way_context    keccak;
+   cube_4way_context          cube;   // 4-way only: run twice per 8 lanes
+   skein256_8way_context     skein;
+   hashState_groestl256      groestl;   // single lane: run once per lane
+} allium_8way_ctx_holder;
+// Per-thread template that allium_8way_hash copies before each use.
+static __thread allium_8way_ctx_holder allium_8way_ctx;
+
+bool init_allium_8way_ctx()
+{  // Initializes every stage except blake, which scanhash_allium_8way sets up. Always returns true.
+   keccak256_8way_init( &allium_8way_ctx.keccak );
+   cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
+   skein256_8way_init( &allium_8way_ctx.skein );
+   init_groestl256( &allium_8way_ctx.groestl, 32 );
+   return true;
+}
+
+void allium_8way_hash( void *state, const void *input )
+{  // Eight-lane chain: blake, keccak, lyra2, cube, lyra2, skein, groestl. Writes 8 x 32 bytes to state.
+   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
+   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
+   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (64)));
+   uint32_t hash2[8] __attribute__ ((aligned (64)));
+   uint32_t hash3[8] __attribute__ ((aligned (64)));
+   uint32_t hash4[8] __attribute__ ((aligned (64)));
+   uint32_t hash5[8] __attribute__ ((aligned (64)));
+   uint32_t hash6[8] __attribute__ ((aligned (64)));
+   uint32_t hash7[8] __attribute__ ((aligned (64)));
+   allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
+
+   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );   // start from the thread's template
+   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );   // blake of the first 64 bytes was done by the caller
+   blake256_8way_close( &ctx.blake, vhash );
+
+   rintrlv_8x32_8x64( vhashA, vhash, 256 );
+   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
+   keccak256_8way_close( &ctx.keccak, vhash );
+
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 vhash, 256 );
+
+   // First lyra2 pass, two lanes at a time.
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+  
+/* 
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+   LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
+   LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
+   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
+   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
+*/
+
+
+   // Cubehash is 4-way: lanes 0-3 go through vhashA, lanes 4-7 through vhashB.
+   intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
+
+   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );   // reset before the second group
+   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+
+   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
+
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+
+
+/*
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+   LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
+   LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
+   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
+   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
+*/
+
+
+
+   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                hash7, 256 );
+
+   skein256_8way_update( &ctx.skein, vhash, 32 );
+   skein256_8way_close( &ctx.skein, vhash );
+
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 vhash, 256 );
+
+   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );   // lane 0; each lane lands 32 bytes further on
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );   // restore the groestl context for the next lane
+   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
+   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
+           sizeof(hashState_groestl256) );   // NOTE(review): ctx is a local and is not used again; this restore looks redundant
+}
+
+int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scans nonces eight at a time from work->data[19]; submits lanes that pass fulltest. Returns 0.
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 8;
+   const uint32_t Htarg = ptarget[7];   // NOTE(review): read before the benchmark override below; confirm intended
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
+   blake256_8way_init( &allium_8way_ctx.blake );   // first 64 bytes are hashed once, outside the nonce loop
+   blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
+
+   do {
+     *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
+                                                 n+3, n+2, n+1, n ) );
+
+     allium_8way_hash( hash, vdata );
+     pdata[19] = n;
+
+     for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )   // cheap check on word 7 first
+     {
+        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n + lane;   // nonce of the lane that passed
+           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
+         }
+     }
+     n += 8;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+
+#elif defined (ALLIUM_4WAY)  
+
+
 typedef struct {
   blake256_4way_context     blake;
   keccak256_4way_context    keccak;
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -129,7 +129,11 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )

 bool register_lyra2z_algo( algo_gate_t* gate )
 {
-#if defined(LYRA2Z_8WAY)
+#if defined(LYRA2Z_16WAY)
+  gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2z_16way;
+  gate->hash       = (void*)&lyra2z_16way_hash;
+#elif defined(LYRA2Z_8WAY)
  gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_8way;
  gate->hash       = (void*)&lyra2z_8way_hash;
@@ -142,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2z;
  gate->hash       = (void*)&lyra2z_hash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
  opt_target_factor = 256.0;
  return true;
 };
@@ -170,7 +174,11 @@ bool register_lyra2h_algo( algo_gate_t* gate )

 bool register_allium_algo( algo_gate_t* gate )
 {
-#if defined (ALLIUM_4WAY)
+#if defined (ALLIUM_8WAY)
+  gate->miner_thread_init = (void*)&init_allium_8way_ctx;
+  gate->scanhash  = (void*)&scanhash_allium_8way;
+  gate->hash      = (void*)&allium_8way_hash;
+#elif defined (ALLIUM_4WAY)
  gate->miner_thread_init = (void*)&init_allium_4way_ctx;
  gate->scanhash  = (void*)&scanhash_allium_4way;
  gate->hash      = (void*)&allium_4way_hash;
@@ -179,7 +187,7 @@ bool register_allium_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_allium;
  gate->hash      = (void*)&allium_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
  opt_target_factor = 256.0;
  return true;
 };
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -85,17 +85,25 @@ bool init_lyra2rev2_ctx();

 /////////////////////////

-#if defined(__SSE2__)
-  #define LYRA2Z_4WAY
-#endif
-#if defined(__AVX2__)
-  #define LYRA2Z_8WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define LYRA2Z_16WAY 1
+#elif defined(__AVX2__)
+  #define LYRA2Z_8WAY 1
+#elif defined(__SSE2__)
+  #define LYRA2Z_4WAY 1
 #endif


 #define LYRA2Z_MATRIX_SIZE  BLOCK_LEN_INT64 * 8 * 8 * 8

-#if defined(LYRA2Z_8WAY)
+#if defined(LYRA2Z_16WAY)
+
+void lyra2z_16way_hash( void *state, const void *input );
+int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+bool lyra2z_16way_thread_init();
+
+#elif defined(LYRA2Z_8WAY)

 void lyra2z_8way_hash( void *state, const void *input );
 int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
@@ -144,13 +152,22 @@ bool lyra2h_thread_init();

 //////////////////////////////////

-#if defined(__AVX2__) && defined(__AES__)
-  #define ALLIUM_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define ALLIUM_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define ALLIUM_4WAY 1
 #endif

 bool register_allium_algo( algo_gate_t* gate );

-#if defined(ALLIUM_4WAY)
+#if defined(ALLIUM_8WAY)
+
+void allium_8way_hash( void *state, const void *input );
+int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+bool init_allium_8way_ctx();
+
+#elif defined(ALLIUM_4WAY)

 void allium_4way_hash( void *state, const void *input );
 int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
--- a/algo/lyra2/lyra2-hash-2way.c
+++ b/algo/lyra2/lyra2-hash-2way.c
@@ -26,14 +26,17 @@
 #include "lyra2.h"
 #include "sponge.h"

-//  LYRA2RE 8 cols 8 rows used by lyea2re, allium, phi2, x22i, x25x.
+//  LYRA2RE 8 cols 8 rows used by lyra2re, allium, phi2, x22i, x25x, 
+//  dynamic matrix allocation.
 //
-//  LYRA2REV2 4 cols 4 rows used by lyra2rev2.
+//  LYRA2REV2 4 cols 4 rows used by lyra2rev2 and x21s, static matrix
+//  allocation.
 //
 //  LYRA2REV3 4 cols 4 rows with an extra twist in calculating
-//  rowa in the wandering phase. Used by lyra2rev3.
+//  rowa in the wandering phase. Used by lyra2rev3. Static matrix
+//  allocation.
 // 
-//  LYRA2Z various cols & rows and supports 80 input. Used by lyra2z,
+//  LYRA2Z various cols & rows and supports 80 byte input. Used by lyra2z,
 //  lyra2z330, lyra2h, 


@@ -60,7 +63,7 @@
 */

 // For lyra2rev3.
-// convert a simple offset to an index into interleaved data.
+// convert a simple offset to an index into 2x4 u64 interleaved data.
 // good for state and 4 row matrix. 
 // index = ( int( off / 4 ) * 2 ) + ( off mod 4 )

@@ -202,12 +205,8 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,

 // hard coded for 32 byte input as well as matrix size.
 // Other required versions include 80 byte input and different block
-// sizez
+// sizes.

-//int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
-//      const void *pwd, const uint64_t pwdlen, const void *salt,
-//      const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
-//      const uint64_t nCols )
 {
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[32];
@@ -335,159 +334,111 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
   return 0;
 }

-#endif // AVX512
-
-#if 0
-
 //////////////////////////////////////////////////
-int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
-            const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
-            const uint64_t timeCost, const uint64_t nRows,
-            const uint64_t nCols )
+
+int LYRA2Z_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,   // Lyra2Z over two lanes at once; always returns 0
+               const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,   // no salt args: pwd/pwdlen are reused wherever the one-lane code used salt/saltlen
+               const uint64_t nRows, const uint64_t nCols )
 {
    //========================== Basic variables ============================//
-    uint64_t _ALIGN(256) state[16];
-    int64_t row = 2; //index of row to be processed
-    int64_t prev = 1; //index of prev (last row ever computed/modified)
-    int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
-    int64_t tau; //Time Loop iterator
-    int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
-    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
-    int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-//    int64_t i; //auxiliary iteration counter
+    uint64_t _ALIGN(256) state[32];   // twice the 16 words of the one-lane sponge state
+    int64_t row = 2;   // index of the row to be processed
+    int64_t prev = 1;   // index of the last row computed/modified
+    int64_t rowa0 = 0;   // row* index; derived from state[0] while wandering (presumably lane 0)
+    int64_t rowa1 = 0;   // row* index derived from state[4] while wandering (presumably lane 1)
+    int64_t tau; 
+    int64_t step = 1;   // visitation step
+    int64_t window = 2;   // re-visitation window used during setup
+    int64_t gap = 1; 
    //=======================================================================/

-    //======= Initializing the Memory Matrix and pointers to it =============//
-    //Tries to allocate enough space for the whole memory matrix
-
    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-//    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-//    memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
-
-    //==== Getting the password + salt + basil padded with 10*1 ============//
-    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
-    //but this ensures that the password copied locally will be overwritten as soon as possible

    //First, we clean enough blocks for the password, salt, basil and padding
-    uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
+    uint64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 *   // first pwdlen stands in for saltlen; nothing is zeroed here any more
                       sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
-    byte *ptrByte = (byte*) wholeMatrix;
-    memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );

-    //Prepends the password
-    memcpy(ptrByte, pwd, pwdlen);
-    ptrByte += pwdlen;
+   uint64_t *ptr = wholeMatrix;
+   uint64_t *pw = (uint64_t*)pwd;

-    //Concatenates the salt
-    memcpy(ptrByte, salt, saltlen);
-    ptrByte += saltlen;
-    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
-    memcpy(ptrByte, &kLen, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
-    memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
-    memcpy(ptrByte, &saltlen, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
-    memcpy(ptrByte, &timeCost, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
-    memcpy(ptrByte, &nRows, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
-    memcpy(ptrByte, &nCols, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
+   memcpy( ptr, pw, 2*pwdlen ); // password, 2*pwdlen bytes (both lanes in one copy)
+   ptr += pwdlen>>2;   // advance by 2*pwdlen bytes, counted in 8-byte words
+   memcpy( ptr, pw, 2*pwdlen ); // same bytes again, in the position the salt had in the one-lane code
+   ptr += pwdlen>>2;

-    //Now comes the padding
-    *ptrByte = 0x80; //first byte of padding: right after the password
-    ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
-    ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
-    *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+   // now build the rest interleaving on the fly.
+   ptr[0] = ptr[ 4] = kLen;   // every value goes to ptr[k] and ptr[k+4], i.e. once per lane
+   ptr[1] = ptr[ 5] = pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
+   ptr[3] = ptr[ 7] = timeCost;
+   ptr[8] = ptr[12] = nRows;
+   ptr[9] = ptr[13] = nCols;
+   ptr[10] = ptr[14] = 0x80;   // first padding byte (low byte of the word)
+   ptr[11] = ptr[15] = 0x0100000000000000;   // 0x01 in the top byte; presumably the final padding byte on a little-endian target -- TODO confirm

-    //=================== Initializing the Sponge State ====================//
-    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
-//        uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
-//        if (state == NULL) {
-//                return -1;
-//        }
-//    initState( state );
+   uint64_t *ptrWord = wholeMatrix;

-    //============================== Setup Phase =============================//
-    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
-    uint64_t *ptrWord = wholeMatrix;
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput,   // absorb the input blocks just built at the start of the matrix
+                               BLOCK_LEN_BLAKE2_SAFE_INT64 );

-    absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
-                           BLOCK_LEN_BLAKE2_SAFE_INT64 );
-/*
-    for ( i = 0; i < nBlocksInput; i++ )
-    {
-      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
-      ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
-    }
-*/
-    //Initializes M[0] and M[1]
-        reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
-        reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );

-        do {
-                //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
-                reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
+   reducedDuplexRow1_2way( state, &wholeMatrix[0],
+                           &wholeMatrix[ 2 * ROW_LEN_INT64 ],  nCols );   // rows are 2*ROW_LEN_INT64 words apart in this layout

-                //updates the value of row* (deterministically picked during Setup))
-                rowa = (rowa + step) & (window - 1);
-                //update prev: it now points to the last row ever computed
-                prev = row;
-                //updates row: goes to the next row to be computed
-                row++;
+   do
+   {
+     //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

-                //Checks if all rows in the window where visited.
-                if (rowa == 0) {
-                        step = window + gap; //changes the step: approximately doubles its value
-                        window *= 2; //doubles the size of the re-visitation window
-                        gap = -gap; //inverts the modifier to the step
-                }
+     reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
+                                        &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],   // setup uses the single index rowa0 for both lanes
+                                        &wholeMatrix[ 2* row*ROW_LEN_INT64],
+                                        nCols );

-        } while (row < nRows);
+     rowa0 = (rowa0 + step) & (window - 1);   // next row*, chosen deterministically during setup
+     prev = row;
+     row++;

-    //======================== Wandering Phase =============================//
-    row = 0; //Resets the visitation to the first row of the memory matrix
-    for ( tau = 1; tau <= timeCost; tau++ )
-    {
-        //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+     if ( rowa0 == 0 )   // every row in the current window has been visited
+     {
+        step = window + gap;   // roughly doubles the step
+        window *= 2;   // doubles the re-visitation window
+        gap = -gap;   // flips the step modifier
+     }
+   } while ( row < nRows );
+
+   row = 0;   // wandering phase restarts at the first row
+   for ( tau = 1; tau <= timeCost; tau++ )
+   {
        step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
-        do {
-        //Selects a pseudorandom index row*
-        //----------------------------------------------------------------------
-        //rowa = ((unsigned int)state[0]) & (nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
-        rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
-        //-----------------------------------------------------------------
+      do
+      {
+        rowa0 = state[ 0 ] % nRows;   // pseudorandom row* taken from state word 0
+        rowa1 = state[ 4 ] % nRows;   // second row* taken from state word 4 (presumably lane 1's first word)

-        //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
-                reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
+        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],   // both row* candidates are passed, one per index
+                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
+                                      nCols );

-        //update prev: it now points to the last row ever computed
        prev = row;
-
-        //updates row: goes to the next row to be computed
-        //---------------------------------------------------------------
-        //row = (row + step) & (nRows-1);       //(USE THIS IF nRows IS A POWER OF 2)
-        row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
-        //--------------------------------------------------------------------
+        row = (row + step) % nRows;   // generic (non power-of-two) form of the row update

      } while (row != 0);
-    }
+   }

-    //========================= Wrap-up Phase ===============================//
-    //Absorbs the last block of the memory matrix
-    absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],   // wrap-up: absorb from the last row* picked for each index
+                            &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );

-    //Squeezes the key
-    squeeze( state, K, kLen );
+   //Squeezes the key
+   squeeze_2way( state, K, (unsigned int) kLen );

-    return 0;
+   return 0;
 }

-#endif
-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+////////////////////////////////////////////////////

 // Lyra2RE doesn't like the new wholeMatrix implementation
 int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
@@ -495,7 +446,7 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
                  const uint64_t nRows, const uint64_t nCols )
 {
   //====================== Basic variables ============================//
-   uint64_t _ALIGN(256) state[16];
+   uint64_t _ALIGN(256) state[32];
   int64_t row = 2; //index of row to be processed
   int64_t prev = 1; //index of prev (last row ever computed/modified)
   int64_t rowa0 = 0;
@@ -517,25 +468,15 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;

   i = (int64_t)ROW_LEN_BYTES * nRows;
-   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
+   uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
   if (wholeMatrix == NULL)
      return -1;

-#if defined(__AVX2__)
-   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
-#elif defined(__SSE2__)
-   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );   
-#else
-   memset( wholeMatrix, 0, i );
-#endif
+   memset_zero_512( (__m512i*)wholeMatrix, i>>5 );

   uint64_t *ptrWord = wholeMatrix;
   uint64_t *pw = (uint64_t*)pwd;

-   //=== Getting the password + salt + basil padded with 10*1 ==========//
-   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
-   //but this ensures that the password copied locally will be overwritten as soon as possible
-
   //First, we clean enough blocks for the password, salt, basil and padding
   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
@@ -558,66 +499,8 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
   ptr[10] = ptr[14] = 0x80;
   ptr[11] = ptr[15] = 0x0100000000000000;

-   
-/*   
-   byte *ptrByte = (byte*) wholeMatrix;
-
-   //Prepends the password
-   memcpy(ptrByte, pwd, pwdlen);
-   ptrByte += pwdlen;
-
-   //Concatenates the salt
-   memcpy(ptrByte, salt, saltlen);
-   ptrByte += saltlen;
-
-//   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
-//                       - (saltlen + pwdlen) );
-
-   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
-   memcpy(ptrByte, &kLen, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-   v64 = pwdlen;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-   v64 = saltlen;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-   v64 = timeCost;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-   v64 = nRows;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-   v64 = nCols;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-
-   //Now comes the padding
-   *ptrByte = 0x80; //first byte of padding: right after the password
-   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
-   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
-   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
-
-   //================= Initializing the Sponge State ====================//
-   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
-
-//   initState( state );
-
-   //========================= Setup Phase =============================//
-   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
-
-   ptrWord = wholeMatrix;
-
-*/
-
   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
-/*
-   for (i = 0; i < nBlocksInput; i++)
-   {
-       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
-       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
-   }
-*/
+
   //Initializes M[0] and M[1]
   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -62,6 +62,8 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+                  uint64_t timeCost, uint64_t nRows, uint64_t nCols );

 int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
@@ -69,6 +71,9 @@ int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
 int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );

+int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
+          uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
 #endif

 #endif /* LYRA2_H_ */
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -1,13 +1,240 @@
 #include "lyra2-gate.h"
-
-#ifdef LYRA2Z_4WAY
-
 #include <memory.h>
 #include <mm_malloc.h>
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"
 #include "algo/blake/blake-hash-4way.h"

+#if defined(LYRA2Z_16WAY)
+
+__thread uint64_t* lyra2z_16way_matrix;
+
+bool lyra2z_16way_thread_init()   // allocates this thread's Lyra2Z work matrix; true when the allocation succeeded
+{
+ return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );   // twice LYRA2Z_MATRIX_SIZE, 64-byte aligned (presumably one share per LYRA2Z_2WAY lane); NULL converts to false
+}
+
+static __thread blake256_16way_context l2z_16way_blake_mid;
+
+void lyra2z_16way_midstate( const void* input )   // caches the Blake-256 state after the leading 64 bytes of input
+{
+       blake256_16way_init( &l2z_16way_blake_mid );   // result is kept in the thread-local l2z_16way_blake_mid
+       blake256_16way_update( &l2z_16way_blake_mid, input, 64 );   // the nonce-dependent tail is hashed later in lyra2z_16way_hash
+}
+
+void lyra2z_16way_hash( void *state, const void *input )   // Blake-256 16-way then Lyra2Z two lanes at a time; writes 16 x 32 bytes to state
+{
+    uint32_t vhash[8*16] __attribute__ ((aligned (128)));   // interleaved scratch, reused for the 16-way Blake output and each 2-way Lyra2Z pass
+    uint32_t hash0[8] __attribute__ ((aligned (64)));   // hash0..hash15: one 32-byte buffer per lane
+    uint32_t hash1[8] __attribute__ ((aligned (64)));
+    uint32_t hash2[8] __attribute__ ((aligned (64)));
+    uint32_t hash3[8] __attribute__ ((aligned (64)));
+    uint32_t hash4[8] __attribute__ ((aligned (64)));
+    uint32_t hash5[8] __attribute__ ((aligned (64)));
+    uint32_t hash6[8] __attribute__ ((aligned (64)));
+    uint32_t hash7[8] __attribute__ ((aligned (64)));
+    uint32_t hash8[8] __attribute__ ((aligned (64)));
+    uint32_t hash9[8] __attribute__ ((aligned (64)));
+    uint32_t hash10[8] __attribute__ ((aligned (64)));
+    uint32_t hash11[8] __attribute__ ((aligned (64)));
+    uint32_t hash12[8] __attribute__ ((aligned (64)));
+    uint32_t hash13[8] __attribute__ ((aligned (64)));
+    uint32_t hash14[8] __attribute__ ((aligned (64)));
+    uint32_t hash15[8] __attribute__ ((aligned (64)));
+    blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
+
+    memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );   // resume from the midstate cached by lyra2z_16way_midstate
+    blake256_16way_update( &ctx_blake, input + (64*16), 16 );   // skip 64*16 bytes, hash a further 16; arithmetic on void* relies on a GNU C extension
+    blake256_16way_close( &ctx_blake, vhash );
+
+    dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+              hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
+               vhash, 256 );   // split the 16-way result into per-lane buffers
+
+    intrlv_2x256( vhash, hash0, hash1, 256 );   // pair up two lanes for LYRA2Z_2WAY; same three-step pattern repeats for every pair
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );   // vhash is both input (pwd) and output (K); kLen 32, pwdlen 32, timeCost 8, 8 rows, 8 cols
+    dintrlv_2x256( hash0, hash1, vhash, 256 );   // results go back into the same per-lane buffers
+    intrlv_2x256( vhash, hash2, hash3, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash2, hash3, vhash, 256 );
+    intrlv_2x256( vhash, hash4, hash5, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash4, hash5, vhash, 256 );
+    intrlv_2x256( vhash, hash6, hash7, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash6, hash7, vhash, 256 );
+    intrlv_2x256( vhash, hash8, hash9, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash8, hash9, vhash, 256 );
+    intrlv_2x256( vhash, hash10, hash11, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash10, hash11, vhash, 256 );
+    intrlv_2x256( vhash, hash12, hash13, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash12, hash13, vhash, 256 );
+    intrlv_2x256( vhash, hash14, hash15, 256 );
+    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
+    dintrlv_2x256( hash14, hash15, vhash, 256 );
+   
+    memcpy( state,     hash0, 32 );   // output layout: lane i at byte offset 32*i
+    memcpy( state+ 32, hash1, 32 );
+    memcpy( state+ 64, hash2, 32 );
+    memcpy( state+ 96, hash3, 32 );
+    memcpy( state+128, hash4, 32 );
+    memcpy( state+160, hash5, 32 );
+    memcpy( state+192, hash6, 32 );
+    memcpy( state+224, hash7, 32 );
+    memcpy( state+256, hash8, 32 );
+    memcpy( state+288, hash9, 32 );
+    memcpy( state+320, hash10, 32 );
+    memcpy( state+352, hash11, 32 );
+    memcpy( state+384, hash12, 32 );
+    memcpy( state+416, hash13, 32 );
+    memcpy( state+448, hash14, 32 );
+    memcpy( state+480, hash15, 32 );
+}
+
+int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,   // scans nonces 16 at a time starting from work->data[19]; always returns 0
+                          uint64_t *hashes_done, struct thr_info *mythr )   // *hashes_done receives the count computed at the end of the scan
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));   // 16 results of 8 words each, lane i at hash + 8*i
+   uint32_t vdata[20*16] __attribute__ ((aligned (64)));   // 20 input words interleaved across 16 lanes
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];   // NOTE(review): captured before the benchmark override below, so the quick check keeps the original value -- confirm intended
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;   // benchmark mode overwrites the top target word in the shared work struct
+
+   mm512_bswap32_intrlv80_16x32( vdata, pdata );
+   lyra2z_16way_midstate( vdata );   // hash the nonce-independent first 64 bytes once per call
+
+   do {
+      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,   // _mm512_set_epi32 takes the highest element first, so lane i gets nonce n+i
+                                                  n+11, n+10, n+ 9, n+ 8,
+                                                  n+ 7, n+ 6, n+ 5, n+ 4,
+                                                  n+ 3, n+ 2, n+ 1, n ) );
+      lyra2z_16way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 16; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )   // cheap top-word check first, then the full comparison
+           && !opt_benchmark )   // nothing is submitted in benchmark mode
+      {
+          pdata[19] = n+i;   // store the nonce of the matching lane before submitting
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n += 16;
+   } while ( (n < max_nonce-16) && !work_restart[thr_id].restart);   // NOTE(review): max_nonce-16 wraps in unsigned arithmetic when max_nonce < 16
+
+   *hashes_done = n - first_nonce + 1;   // NOTE(review): the +1 differs from scanhash_allium_8way, which reports n - first_nonce -- confirm
+   return 0;
+}
+
+#elif defined(LYRA2Z_8WAY)
+
+__thread uint64_t* lyra2z_8way_matrix;
+
+bool lyra2z_8way_thread_init()   // allocates this thread's Lyra2Z work matrix; true when the allocation succeeded
+{
+ return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );   // LYRA2Z_MATRIX_SIZE bytes, 64-byte aligned; NULL converts to false
+}
+
+static __thread blake256_8way_context l2z_8way_blake_mid;
+
+void lyra2z_8way_midstate( const void* input )   // caches the Blake-256 state after the leading 64 bytes of input
+{
+       blake256_8way_init( &l2z_8way_blake_mid );   // result is kept in the thread-local l2z_8way_blake_mid
+       blake256_8way( &l2z_8way_blake_mid, input, 64 );   // NOTE(review): the other 8-way/16-way code in this commit calls blake256_*_update -- confirm this name is still declared
+}
+
+void lyra2z_8way_hash( void *state, const void *input )   // Blake-256 8-way then LYRA2Z one lane at a time; writes 8 x 32 bytes to state
+{
+     uint32_t hash0[8] __attribute__ ((aligned (64)));   // hash0..hash7: one 32-byte buffer per lane
+     uint32_t hash1[8] __attribute__ ((aligned (64)));
+     uint32_t hash2[8] __attribute__ ((aligned (64)));
+     uint32_t hash3[8] __attribute__ ((aligned (64)));
+     uint32_t hash4[8] __attribute__ ((aligned (64)));
+     uint32_t hash5[8] __attribute__ ((aligned (64)));
+     uint32_t hash6[8] __attribute__ ((aligned (64)));
+     uint32_t hash7[8] __attribute__ ((aligned (64)));
+     uint32_t vhash[8*8] __attribute__ ((aligned (64)));   // interleaved 8-way Blake output
+     blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
+
+     memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );   // resume from the midstate cached by lyra2z_8way_midstate
+     blake256_8way( &ctx_blake, input + (64*8), 16 );   // skip 64*8 bytes, hash a further 16; arithmetic on void* relies on a GNU C extension
+     blake256_8way_close( &ctx_blake, vhash );
+
+     dintrlv_8x32( hash0, hash1, hash2, hash3,
+                   hash4, hash5, hash6, hash7, vhash, 256 );   // split the 8-way result into per-lane buffers
+
+     LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );   // the lane buffer is output, password and salt at once; all lanes share one matrix
+     LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
+
+
+     memcpy( state,     hash0, 32 );   // output layout: lane i at byte offset 32*i
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,   // scans nonces 8 at a time starting from work->data[19]; always returns 0
+                          uint64_t *hashes_done, struct thr_info *mythr )   // *hashes_done receives the count computed at the end of the scan
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (64)));   // 8 results of 8 words each, lane i at hash + 8*i
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));   // 20 input words interleaved across 8 lanes
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];   // NOTE(review): captured before the benchmark override below, so the quick check keeps the original value -- confirm intended
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;   // benchmark mode overwrites the top target word in the shared work struct
+
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
+   lyra2z_8way_midstate( vdata );   // hash the nonce-independent first 64 bytes once per call
+
+   do {
+      *noncev = mm256_bswap_32(
+                 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );   // _mm256_set_epi32 takes the highest element first, so lane i gets nonce n+i
+      lyra2z_8way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 8; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )   // cheap top-word check first, then the full comparison
+           && !opt_benchmark )   // nothing is submitted in benchmark mode
+      {
+          pdata[19] = n+i;   // store the nonce of the matching lane before submitting
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n += 8;
+   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);   // NOTE(review): max_nonce-8 wraps in unsigned arithmetic when max_nonce < 8
+
+   *hashes_done = n - first_nonce + 1;   // NOTE(review): the +1 differs from scanhash_allium_8way, which reports n - first_nonce -- confirm
+   return 0;
+}
+
+
+#elif defined(LYRA2Z_4WAY)
+
+
 __thread uint64_t* lyra2z_4way_matrix;

 bool lyra2z_4way_thread_init()
@@ -85,100 +312,3 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,

 #endif

-#if defined(LYRA2Z_8WAY)
-
-__thread uint64_t* lyra2z_8way_matrix;
-
-bool lyra2z_8way_thread_init()
-{
- return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
-}
-
-static __thread blake256_8way_context l2z_8way_blake_mid;
-
-void lyra2z_8way_midstate( const void* input )
-{
-       blake256_8way_init( &l2z_8way_blake_mid );
-       blake256_8way( &l2z_8way_blake_mid, input, 64 );
-}
-
-void lyra2z_8way_hash( void *state, const void *input )
-{
-     uint32_t hash0[8] __attribute__ ((aligned (64)));
-     uint32_t hash1[8] __attribute__ ((aligned (64)));
-     uint32_t hash2[8] __attribute__ ((aligned (64)));
-     uint32_t hash3[8] __attribute__ ((aligned (64)));
-     uint32_t hash4[8] __attribute__ ((aligned (64)));
-     uint32_t hash5[8] __attribute__ ((aligned (64)));
-     uint32_t hash6[8] __attribute__ ((aligned (64)));
-     uint32_t hash7[8] __attribute__ ((aligned (64)));
-     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
-     blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
-
-     memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
-     blake256_8way( &ctx_blake, input + (64*8), 16 );
-     blake256_8way_close( &ctx_blake, vhash );
-
-     dintrlv_8x32( hash0, hash1, hash2, hash3,
-                   hash4, hash5, hash6, hash7, vhash, 256 );
-
-     LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
-
-     memcpy( state,     hash0, 32 );
-     memcpy( state+ 32, hash1, 32 );
-     memcpy( state+ 64, hash2, 32 );
-     memcpy( state+ 96, hash3, 32 );
-     memcpy( state+128, hash4, 32 );
-     memcpy( state+160, hash5, 32 );
-     memcpy( state+192, hash6, 32 );
-     memcpy( state+224, hash7, 32 );
-}
-
-int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t hash[8*8] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   uint32_t n = first_nonce;
-   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-   if ( opt_benchmark )
-      ptarget[7] = 0x0000ff;
-
-   mm256_bswap32_intrlv80_8x32( vdata, pdata );
-   lyra2z_8way_midstate( vdata );
-
-   do {
-      *noncev = mm256_bswap_32(
-                 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
-      lyra2z_8way_hash( hash, vdata );
-      pdata[19] = n;
-
-      for ( int i = 0; i < 8; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
-           && !opt_benchmark )
-      {
-          pdata[19] = n+i;         
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
-      }
-      n += 8;
-   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
-
-   *hashes_done = n - first_nonce + 1;
-   return 0;
-}
-
-
-#endif
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -10,7 +10,140 @@
 #define LBRY_MIDSTATE    64
 #define LBRY_TAIL (LBRY_INPUT_SIZE) - (LBRY_MIDSTATE)

-#if defined(LBRY_8WAY)
+#if defined(LBRY_16WAY)
+
+static __thread sha256_16way_context sha256_16w_mid;
+
+void lbry_16way_hash( void* output, const void* input )   // LBRY hash chain on 16 interleaved lanes
+{
+   uint32_t _ALIGN(128) vhashA[16<<4];   // 16 words x 16 lanes
+   uint32_t _ALIGN(64) vhashB[16<<4];
+   uint32_t _ALIGN(64) vhashC[16<<4];
+   uint32_t _ALIGN(64) h0[32];           // h0..h15: one deinterleaved lane each
+   uint32_t _ALIGN(64) h1[32];
+   uint32_t _ALIGN(64) h2[32];
+   uint32_t _ALIGN(64) h3[32];
+   uint32_t _ALIGN(64) h4[32];
+   uint32_t _ALIGN(64) h5[32];
+   uint32_t _ALIGN(64) h6[32];
+   uint32_t _ALIGN(64) h7[32];
+   uint32_t _ALIGN(64) h8[32];
+   uint32_t _ALIGN(64) h9[32];
+   uint32_t _ALIGN(64) h10[32];
+   uint32_t _ALIGN(64) h11[32];
+   uint32_t _ALIGN(64) h12[32];
+   uint32_t _ALIGN(64) h13[32];
+   uint32_t _ALIGN(64) h14[32];
+   uint32_t _ALIGN(64) h15[32];
+   sha256_16way_context     ctx_sha256 __attribute__ ((aligned (64)));
+   sha512_8way_context     ctx_sha512;
+   ripemd160_16way_context  ctx_ripemd;
+   // sha256 #1: resume from the midstate saved in sha256_16w_mid and add the tail.
+   memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
+   sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
+   sha256_16way_close( &ctx_sha256, vhashA );
+   // sha256 #2: hash the 32 byte digest of the first pass.
+   sha256_16way_init( &ctx_sha256 );
+   sha256_16way_update( &ctx_sha256, vhashA, 32 );
+   sha256_16way_close( &ctx_sha256, vhashA );
+
+   // reinterleave to do sha512 8-way 64 bit twice.
+   dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
+                  h8, h9, h10, h11, h12, h13, h14, h15, vhashA, 256 );
+   intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
+   intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );
+   // sha512 of lanes 0-7.
+   sha512_8way_init( &ctx_sha512 );
+   sha512_8way_update( &ctx_sha512, vhashA, 32 );
+   sha512_8way_close( &ctx_sha512, vhashA );
+   // sha512 of lanes 8-15.
+   sha512_8way_init( &ctx_sha512 );
+   sha512_8way_update( &ctx_sha512, vhashB, 32 );
+   sha512_8way_close( &ctx_sha512, vhashB );
+
+   // back to 16-way 32 bit
+   dintrlv_8x64( h0, h1, h2, h3,h4, h5, h6, h7, vhashA, 512 );
+   dintrlv_8x64( h8, h9, h10, h11, h12, h13, h14, h15, vhashB, 512 );
+   intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
+                         h8, h9, h10, h11, h12, h13, h14, h15, 512 );
+   // ripemd160 of the first 32 bytes of each 64 byte sha512 digest.
+   ripemd160_16way_init( &ctx_ripemd );
+   ripemd160_16way( &ctx_ripemd, vhashA, 32 );
+   ripemd160_16way_close( &ctx_ripemd, vhashB );
+   // ripemd160 of the second 32 bytes (8 words x 16 lanes further on).
+   ripemd160_16way_init( &ctx_ripemd );
+   ripemd160_16way( &ctx_ripemd, vhashA+(8<<4), 32 );
+   ripemd160_16way_close( &ctx_ripemd, vhashC );
+   // sha256 over both 20 byte ripemd160 digests back to back.
+   sha256_16way_init( &ctx_sha256 );
+   sha256_16way_update( &ctx_sha256, vhashB, 20 );
+   sha256_16way_update( &ctx_sha256, vhashC, 20 );
+   sha256_16way_close( &ctx_sha256, vhashA );
+   // final sha256 of the 32 byte digest, written to output.
+   sha256_16way_init( &ctx_sha256 );
+   sha256_16way_update( &ctx_sha256, vhashA, 32 );
+   sha256_16way_close( &ctx_sha256, output );
+}
+
+int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));   // 8 words x 16 lanes, interleaved
+   uint32_t vdata[32*16] __attribute__ ((aligned (64)));  // 32 words x 16 lanes, interleaved
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));  // one lane's hash, deinterleaved
+   uint32_t *hash7 = &(hash[7<<4]);                       // word 7 of every lane
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[27];                                // nonce lives in data word 27
+   const uint32_t first_nonce = pdata[27];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t edata[32] __attribute__ ((aligned (64)));
+   __m512i  *noncev = (__m512i*)vdata + 27;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   // 8 vectors of 128 bits cover the first 32 words of pdata.
+   // we need bigendian data...
+   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
+   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
+   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
+   intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
+        edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
+   // Hash the first LBRY_MIDSTATE bytes once; lbry_16way_hash resumes from this state.
+   sha256_16way_init( &sha256_16w_mid );
+   sha256_16way( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
+   // Each pass writes nonces n .. n+15 into the 16 lanes and hashes them together.
+   do
+   {
+      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
+                                                  n+11, n+10, n+ 9, n+ 8,
+                                                  n+ 7, n+ 6, n+ 5, n+ 4,
+                                                  n+ 3, n+ 2, n+ 1, n ) );
+      lbry_16way_hash( hash, vdata );
+      // Cheap filter on hash word 7 before the full target test.
+      for ( int i = 0; i < 16; i++ )
+      if ( unlikely( hash7[ i ] <= Htarg ) )
+      {
+         // deinterleave hash for lane
+         extr_lane_16x32( lane_hash, hash, i, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+            pdata[27] = n + i;   // nonce of the lane that passed
+            submit_lane_solution( work, lane_hash, mythr, i );
+         }
+      }
+      n += 16;
+   } while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
+   *hashes_done = n - first_nonce + 1;
+   return 0;   // always 0; solutions are submitted from inside the loop
+}
+
+
+
+#elif defined(LBRY_8WAY)

 static __thread sha256_8way_context sha256_8w_mid;

@@ -91,11 +224,6 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
   __m256i  *noncev = (__m256i*)vdata + 27;   // aligned
   int thr_id = mythr->id;  // thr_id arg is deprecated

-   uint64_t htmax[] = {          0,        0xF,       0xFF,
-                             0xFFF,     0xFFFF, 0x10000000 };
-   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
-                        0xFFFFF000, 0xFFFF0000,          0 };
-
   // we need bigendian data...
   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
@@ -106,33 +234,30 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
   intrlv_8x32( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 1024 );
+                       edata, edata, edata, edata, 1024 );
+
   sha256_8way_init( &sha256_8w_mid );
   sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );

-   for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] )
+   do
   {
-      uint32_t mask = masks[m];
-      do
-      {
-        *noncev = mm256_bswap_32( _mm256_set_epi32(
-                                          n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
-         lbry_8way_hash( hash, vdata );
+      *noncev = mm256_bswap_32( _mm256_set_epi32(
+                                       n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
+      lbry_8way_hash( hash, vdata );

-         for ( int i = 0; i < 8; i++ )  if ( !( hash7[ i ] & mask ) )
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( hash7[ i ] <= Htarg ) )
+      {
+         // deinterleave hash for lane
+         extr_lane_8x32( lane_hash, hash, i, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
-            // deinterleave hash for lane
-            extr_lane_8x32( lane_hash, hash, i, 256 );
-            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-            {
-              pdata[27] = n + i;
-              submit_lane_solution( work, lane_hash, mythr, i );
-            }
+            pdata[27] = n + i;
+            submit_lane_solution( work, lane_hash, mythr, i );
         }
-         n += 8;
-      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
-      break;
-   }
+      }
+      n += 8;
+   } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -98,16 +98,23 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }

 bool register_lbry_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT | SHA_OPT;
-#if defined (LBRY_8WAY)
+  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
+#if defined (LBRY_16WAY)
+  gate->scanhash              = (void*)&scanhash_lbry_16way;
+  gate->hash                  = (void*)&lbry_16way_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
+#elif defined (LBRY_8WAY)
  gate->scanhash              = (void*)&scanhash_lbry_8way;
  gate->hash                  = (void*)&lbry_8way_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
 #elif defined (LBRY_4WAY)
  gate->scanhash              = (void*)&scanhash_lbry_4way;
  gate->hash                  = (void*)&lbry_4way_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
 #else 
  gate->scanhash              = (void*)&scanhash_lbry;
  gate->hash                  = (void*)&lbry_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
 #endif
  gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
  gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
--- a/algo/ripemd/lbry-gate.h
+++ b/algo/ripemd/lbry-gate.h
@@ -4,11 +4,20 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

+
+// 16 way is disabled below: it needs a 16 way sha256 first.
+//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+//  #define LBRY_16WAY
+#if defined(__AVX2__)
+  #define LBRY_8WAY
+#endif
+/*
 #if !defined(__SHA__)
 #if defined(__AVX2__)
  #define LBRY_8WAY
 #endif
 #endif
+*/

 #define LBRY_NTIME_INDEX 25
 #define LBRY_NBITS_INDEX 26
@@ -18,7 +27,12 @@

 bool register_lbry_algo( algo_gate_t* gate );

-#if defined(LBRY_8WAY)
+#if defined(LBRY_16WAY)
+
+void lbry_16way_hash( void *state, const void *input );   // hash 16 interleaved lanes
+int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );   // scan nonces 16 at a time
+#elif defined(LBRY_8WAY)

 void lbry_8way_hash( void *state, const void *input );
 int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
--- a/algo/ripemd/lbry.c
+++ b/algo/ripemd/lbry.c
@@ -80,9 +80,6 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
 	// we need bigendian data...
        swab32_array( endiandata, pdata, 32 );

-#ifdef DEBUG_ALGO
-	printf("[%d] Htarg=%X\n", thr_id, Htarg);
-#endif
 	for (int m=0; m < sizeof(masks); m++) {
 		if (Htarg <= htmax[m]) {
 			uint32_t mask = masks[m];
@@ -90,23 +87,11 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
 				pdata[27] = ++n;
 				be32enc(&endiandata[27], n);
 				lbry_hash(hash64, &endiandata);
-#ifndef DEBUG_ALGO
 				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
-					*hashes_done = n - first_nonce + 1;
-					return true;
+               pdata[27] = n;
+               submit_solution( work, hash64, mythr );
 				}
-#else
-				if (!(n % 0x1000) && !thr_id) printf(".");
-				if (!(hash64[7] & mask)) {
-					printf("[%d]",thr_id);
-					if (fulltest(hash64, ptarget)) {
-						*hashes_done = n - first_nonce + 1;
-						return true;
-					}
-				}
-#endif
-			} while (n < max_nonce && !work_restart[thr_id].restart);
-			// see blake.c if else to understand the loop on htmax => mask
+			} while ( (n < max_nonce -8) && !work_restart[thr_id].restart);
 			break;
 		}
 	}
--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -623,3 +623,303 @@ void ripemd160_8way_close( ripemd160_8way_context  *sc, void *dst )

 #endif // __AVX2__

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+//  RIPEMD-160 16 way
+
+
+#define F16W_1(x, y, z) \
+   _mm512_xor_si512( _mm512_xor_si512( x, y ), z )   // x ^ y ^ z
+// F16W_2: ( ( y ^ z ) & x ) ^ z
+#define F16W_2(x, y, z) \
+   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( y, z ), x ), z )
+// F16W_3: ( x | not y ) ^ z, assuming mm512_not is a bitwise complement
+#define F16W_3(x, y, z) \
+   _mm512_xor_si512( _mm512_or_si512( x, mm512_not( y ) ), z )
+// F16W_4: ( ( x ^ y ) & z ) ^ y
+#define F16W_4(x, y, z) \
+   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( x, y ), z ), y )
+// F16W_5: x ^ ( y | not z ), assuming mm512_not is a bitwise complement
+#define F16W_5(x, y, z) \
+   _mm512_xor_si512( x, _mm512_or_si512( y, mm512_not( z ) ) )
+// RR_16W: a = rol( a + f(b,c,d) + r + k, s ) + e;  c = rol( c, 10 );  rol is mm512_rol_32
+#define RR_16W(a, b, c, d, e, f, s, r, k) \
+do{ \
+   a = _mm512_add_epi32( mm512_rol_32( _mm512_add_epi32( _mm512_add_epi32( \
+                _mm512_add_epi32( a, f( b ,c, d ) ), r ), \
+                                 m512_const1_64( k ) ), s ), e ); \
+   c = mm512_rol_32( c, 10 );\
+} while (0)
+// ROUND1_16W / ROUND2_16W paste suffix 1 or 2 onto the register names and pick constant K1<k> or K2<k>.
+#define ROUND1_16W(a, b, c, d, e, f, s, r, k)  \
+        RR_16W(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
+
+#define ROUND2_16W(a, b, c, d, e, f, s, r, k)  \
+        RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
+
+static void ripemd160_16way_round( ripemd160_16way_context *sc )   // compress sc->buf into sc->val
+{
+   const __m512i *in = (__m512i*)sc->buf;   // 16 message words, one vector per word
+   __m512i *h  = (__m512i*)sc->val;         // 5 word chaining value, updated in place
+   register __m512i A1, B1, C1, D1, E1;
+   register __m512i A2, B2, C2, D2, E2;
+   __m512i tmp;
+   // Both lines start from the current chaining value.
+   A1 = A2 = h[0];
+   B1 = B2 = h[1];
+   C1 = C2 = h[2];
+   D1 = D2 = h[3];
+   E1 = E2 = h[4];
+   // Line 1: five groups of 16 steps using F16W_1 .. F16W_5 and constants K11 .. K15.
+   ROUND1_16W( A, B, C, D, E, F16W_1, 11, in[ 0], 1 );
+   ROUND1_16W( E, A, B, C, D, F16W_1, 14, in[ 1], 1 );
+   ROUND1_16W( D, E, A, B, C, F16W_1, 15, in[ 2], 1 );
+   ROUND1_16W( C, D, E, A, B, F16W_1, 12, in[ 3], 1 );
+   ROUND1_16W( B, C, D, E, A, F16W_1,  5, in[ 4], 1 );
+   ROUND1_16W( A, B, C, D, E, F16W_1,  8, in[ 5], 1 );
+   ROUND1_16W( E, A, B, C, D, F16W_1,  7, in[ 6], 1 );
+   ROUND1_16W( D, E, A, B, C, F16W_1,  9, in[ 7], 1 );
+   ROUND1_16W( C, D, E, A, B, F16W_1, 11, in[ 8], 1 );
+   ROUND1_16W( B, C, D, E, A, F16W_1, 13, in[ 9], 1 );
+   ROUND1_16W( A, B, C, D, E, F16W_1, 14, in[10], 1 );
+   ROUND1_16W( E, A, B, C, D, F16W_1, 15, in[11], 1 );
+   ROUND1_16W( D, E, A, B, C, F16W_1,  6, in[12], 1 );
+   ROUND1_16W( C, D, E, A, B, F16W_1,  7, in[13], 1 );
+   ROUND1_16W( B, C, D, E, A, F16W_1,  9, in[14], 1 );
+   ROUND1_16W( A, B, C, D, E, F16W_1,  8, in[15], 1 );
+
+   ROUND1_16W( E, A, B, C, D, F16W_2,  7, in[ 7], 2 );
+   ROUND1_16W( D, E, A, B, C, F16W_2,  6, in[ 4], 2 );
+   ROUND1_16W( C, D, E, A, B, F16W_2,  8, in[13], 2 );
+   ROUND1_16W( B, C, D, E, A, F16W_2, 13, in[ 1], 2 );
+   ROUND1_16W( A, B, C, D, E, F16W_2, 11, in[10], 2 );
+   ROUND1_16W( E, A, B, C, D, F16W_2,  9, in[ 6], 2 );
+   ROUND1_16W( D, E, A, B, C, F16W_2,  7, in[15], 2 );
+   ROUND1_16W( C, D, E, A, B, F16W_2, 15, in[ 3], 2 );
+   ROUND1_16W( B, C, D, E, A, F16W_2,  7, in[12], 2 );
+   ROUND1_16W( A, B, C, D, E, F16W_2, 12, in[ 0], 2 );
+   ROUND1_16W( E, A, B, C, D, F16W_2, 15, in[ 9], 2 );
+   ROUND1_16W( D, E, A, B, C, F16W_2,  9, in[ 5], 2 );
+   ROUND1_16W( C, D, E, A, B, F16W_2, 11, in[ 2], 2 );
+   ROUND1_16W( B, C, D, E, A, F16W_2,  7, in[14], 2 );
+   ROUND1_16W( A, B, C, D, E, F16W_2, 13, in[11], 2 );
+   ROUND1_16W( E, A, B, C, D, F16W_2, 12, in[ 8], 2 );
+
+   ROUND1_16W( D, E, A, B, C, F16W_3, 11, in[ 3], 3 );
+   ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[10], 3 );
+   ROUND1_16W( B, C, D, E, A, F16W_3,  6, in[14], 3 );
+   ROUND1_16W( A, B, C, D, E, F16W_3,  7, in[ 4], 3 );
+   ROUND1_16W( E, A, B, C, D, F16W_3, 14, in[ 9], 3 );
+   ROUND1_16W( D, E, A, B, C, F16W_3,  9, in[15], 3 );
+   ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[ 8], 3 );
+   ROUND1_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
+   ROUND1_16W( A, B, C, D, E, F16W_3, 14, in[ 2], 3 );
+   ROUND1_16W( E, A, B, C, D, F16W_3,  8, in[ 7], 3 );
+   ROUND1_16W( D, E, A, B, C, F16W_3, 13, in[ 0], 3 );
+   ROUND1_16W( C, D, E, A, B, F16W_3,  6, in[ 6], 3 );
+   ROUND1_16W( B, C, D, E, A, F16W_3,  5, in[13], 3 );
+   ROUND1_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
+   ROUND1_16W( E, A, B, C, D, F16W_3,  7, in[ 5], 3 );
+   ROUND1_16W( D, E, A, B, C, F16W_3,  5, in[12], 3 );
+
+   ROUND1_16W( C, D, E, A, B, F16W_4, 11, in[ 1], 4 );
+   ROUND1_16W( B, C, D, E, A, F16W_4, 12, in[ 9], 4 );
+   ROUND1_16W( A, B, C, D, E, F16W_4, 14, in[11], 4 );
+   ROUND1_16W( E, A, B, C, D, F16W_4, 15, in[10], 4 );
+   ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 0], 4 );
+   ROUND1_16W( C, D, E, A, B, F16W_4, 15, in[ 8], 4 );
+   ROUND1_16W( B, C, D, E, A, F16W_4,  9, in[12], 4 );
+   ROUND1_16W( A, B, C, D, E, F16W_4,  8, in[ 4], 4 );
+   ROUND1_16W( E, A, B, C, D, F16W_4,  9, in[13], 4 );
+   ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 3], 4 );
+   ROUND1_16W( C, D, E, A, B, F16W_4,  5, in[ 7], 4 );
+   ROUND1_16W( B, C, D, E, A, F16W_4,  6, in[15], 4 );
+   ROUND1_16W( A, B, C, D, E, F16W_4,  8, in[14], 4 );
+   ROUND1_16W( E, A, B, C, D, F16W_4,  6, in[ 5], 4 );
+   ROUND1_16W( D, E, A, B, C, F16W_4,  5, in[ 6], 4 );
+   ROUND1_16W( C, D, E, A, B, F16W_4, 12, in[ 2], 4 );
+
+   ROUND1_16W( B, C, D, E, A, F16W_5,  9, in[ 4], 5 );
+   ROUND1_16W( A, B, C, D, E, F16W_5, 15, in[ 0], 5 );
+   ROUND1_16W( E, A, B, C, D, F16W_5,  5, in[ 5], 5 );
+   ROUND1_16W( D, E, A, B, C, F16W_5, 11, in[ 9], 5 );
+   ROUND1_16W( C, D, E, A, B, F16W_5,  6, in[ 7], 5 );
+   ROUND1_16W( B, C, D, E, A, F16W_5,  8, in[12], 5 );
+   ROUND1_16W( A, B, C, D, E, F16W_5, 13, in[ 2], 5 );
+   ROUND1_16W( E, A, B, C, D, F16W_5, 12, in[10], 5 );
+   ROUND1_16W( D, E, A, B, C, F16W_5,  5, in[14], 5 );
+   ROUND1_16W( C, D, E, A, B, F16W_5, 12, in[ 1], 5 );
+   ROUND1_16W( B, C, D, E, A, F16W_5, 13, in[ 3], 5 );
+   ROUND1_16W( A, B, C, D, E, F16W_5, 14, in[ 8], 5 );
+   ROUND1_16W( E, A, B, C, D, F16W_5, 11, in[11], 5 );
+   ROUND1_16W( D, E, A, B, C, F16W_5,  8, in[ 6], 5 );
+   ROUND1_16W( C, D, E, A, B, F16W_5,  5, in[15], 5 );
+   ROUND1_16W( B, C, D, E, A, F16W_5,  6, in[13], 5 );
+   // Line 2: five groups of 16 steps using F16W_5 .. F16W_1 and constants K21 .. K25.
+   ROUND2_16W( A, B, C, D, E, F16W_5,  8, in[ 5], 1 );
+   ROUND2_16W( E, A, B, C, D, F16W_5,  9, in[14], 1 );
+   ROUND2_16W( D, E, A, B, C, F16W_5,  9, in[ 7], 1 );
+   ROUND2_16W( C, D, E, A, B, F16W_5, 11, in[ 0], 1 );
+   ROUND2_16W( B, C, D, E, A, F16W_5, 13, in[ 9], 1 );
+   ROUND2_16W( A, B, C, D, E, F16W_5, 15, in[ 2], 1 );
+   ROUND2_16W( E, A, B, C, D, F16W_5, 15, in[11], 1 );
+   ROUND2_16W( D, E, A, B, C, F16W_5,  5, in[ 4], 1 );
+   ROUND2_16W( C, D, E, A, B, F16W_5,  7, in[13], 1 );
+   ROUND2_16W( B, C, D, E, A, F16W_5,  7, in[ 6], 1 );
+   ROUND2_16W( A, B, C, D, E, F16W_5,  8, in[15], 1 );
+   ROUND2_16W( E, A, B, C, D, F16W_5, 11, in[ 8], 1 );
+   ROUND2_16W( D, E, A, B, C, F16W_5, 14, in[ 1], 1 );
+   ROUND2_16W( C, D, E, A, B, F16W_5, 14, in[10], 1 );
+   ROUND2_16W( B, C, D, E, A, F16W_5, 12, in[ 3], 1 );
+   ROUND2_16W( A, B, C, D, E, F16W_5,  6, in[12], 1 );
+
+   ROUND2_16W( E, A, B, C, D, F16W_4,  9, in[ 6], 2 );
+   ROUND2_16W( D, E, A, B, C, F16W_4, 13, in[11], 2 );
+   ROUND2_16W( C, D, E, A, B, F16W_4, 15, in[ 3], 2 );
+   ROUND2_16W( B, C, D, E, A, F16W_4,  7, in[ 7], 2 );
+   ROUND2_16W( A, B, C, D, E, F16W_4, 12, in[ 0], 2 );
+   ROUND2_16W( E, A, B, C, D, F16W_4,  8, in[13], 2 );
+   ROUND2_16W( D, E, A, B, C, F16W_4,  9, in[ 5], 2 );
+   ROUND2_16W( C, D, E, A, B, F16W_4, 11, in[10], 2 );
+   ROUND2_16W( B, C, D, E, A, F16W_4,  7, in[14], 2 );
+   ROUND2_16W( A, B, C, D, E, F16W_4,  7, in[15], 2 );
+   ROUND2_16W( E, A, B, C, D, F16W_4, 12, in[ 8], 2 );
+   ROUND2_16W( D, E, A, B, C, F16W_4,  7, in[12], 2 );
+   ROUND2_16W( C, D, E, A, B, F16W_4,  6, in[ 4], 2 );
+   ROUND2_16W( B, C, D, E, A, F16W_4, 15, in[ 9], 2 );
+   ROUND2_16W( A, B, C, D, E, F16W_4, 13, in[ 1], 2 );
+   ROUND2_16W( E, A, B, C, D, F16W_4, 11, in[ 2], 2 );
+
+   ROUND2_16W( D, E, A, B, C, F16W_3,  9, in[15], 3 );
+   ROUND2_16W( C, D, E, A, B, F16W_3,  7, in[ 5], 3 );
+   ROUND2_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
+   ROUND2_16W( A, B, C, D, E, F16W_3, 11, in[ 3], 3 );
+   ROUND2_16W( E, A, B, C, D, F16W_3,  8, in[ 7], 3 );
+   ROUND2_16W( D, E, A, B, C, F16W_3,  6, in[14], 3 );
+   ROUND2_16W( C, D, E, A, B, F16W_3,  6, in[ 6], 3 );
+   ROUND2_16W( B, C, D, E, A, F16W_3, 14, in[ 9], 3 );
+   ROUND2_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
+   ROUND2_16W( E, A, B, C, D, F16W_3, 13, in[ 8], 3 );
+   ROUND2_16W( D, E, A, B, C, F16W_3,  5, in[12], 3 );
+   ROUND2_16W( C, D, E, A, B, F16W_3, 14, in[ 2], 3 );
+   ROUND2_16W( B, C, D, E, A, F16W_3, 13, in[10], 3 );
+   ROUND2_16W( A, B, C, D, E, F16W_3, 13, in[ 0], 3 );
+   ROUND2_16W( E, A, B, C, D, F16W_3,  7, in[ 4], 3 );
+   ROUND2_16W( D, E, A, B, C, F16W_3,  5, in[13], 3 );
+
+   ROUND2_16W( C, D, E, A, B, F16W_2, 15, in[ 8], 4 );
+   ROUND2_16W( B, C, D, E, A, F16W_2,  5, in[ 6], 4 );
+   ROUND2_16W( A, B, C, D, E, F16W_2,  8, in[ 4], 4 );
+   ROUND2_16W( E, A, B, C, D, F16W_2, 11, in[ 1], 4 );
+   ROUND2_16W( D, E, A, B, C, F16W_2, 14, in[ 3], 4 );
+   ROUND2_16W( C, D, E, A, B, F16W_2, 14, in[11], 4 );
+   ROUND2_16W( B, C, D, E, A, F16W_2,  6, in[15], 4 );
+   ROUND2_16W( A, B, C, D, E, F16W_2, 14, in[ 0], 4 );
+   ROUND2_16W( E, A, B, C, D, F16W_2,  6, in[ 5], 4 );
+   ROUND2_16W( D, E, A, B, C, F16W_2,  9, in[12], 4 );
+   ROUND2_16W( C, D, E, A, B, F16W_2, 12, in[ 2], 4 );
+   ROUND2_16W( B, C, D, E, A, F16W_2,  9, in[13], 4 );
+   ROUND2_16W( A, B, C, D, E, F16W_2, 12, in[ 9], 4 );
+   ROUND2_16W( E, A, B, C, D, F16W_2,  5, in[ 7], 4 );
+   ROUND2_16W( D, E, A, B, C, F16W_2, 15, in[10], 4 );
+   ROUND2_16W( C, D, E, A, B, F16W_2,  8, in[14], 4 );
+
+   ROUND2_16W( B, C, D, E, A, F16W_1,  8, in[12], 5 );
+   ROUND2_16W( A, B, C, D, E, F16W_1,  5, in[15], 5 );
+   ROUND2_16W( E, A, B, C, D, F16W_1, 12, in[10], 5 );
+   ROUND2_16W( D, E, A, B, C, F16W_1,  9, in[ 4], 5 );
+   ROUND2_16W( C, D, E, A, B, F16W_1, 12, in[ 1], 5 );
+   ROUND2_16W( B, C, D, E, A, F16W_1,  5, in[ 5], 5 );
+   ROUND2_16W( A, B, C, D, E, F16W_1, 14, in[ 8], 5 );
+   ROUND2_16W( E, A, B, C, D, F16W_1,  6, in[ 7], 5 );
+   ROUND2_16W( D, E, A, B, C, F16W_1,  8, in[ 6], 5 );
+   ROUND2_16W( C, D, E, A, B, F16W_1, 13, in[ 2], 5 );
+   ROUND2_16W( B, C, D, E, A, F16W_1,  6, in[13], 5 );
+   ROUND2_16W( A, B, C, D, E, F16W_1,  5, in[14], 5 );
+   ROUND2_16W( E, A, B, C, D, F16W_1, 15, in[ 0], 5 );
+   ROUND2_16W( D, E, A, B, C, F16W_1, 13, in[ 3], 5 );
+   ROUND2_16W( C, D, E, A, B, F16W_1, 11, in[ 9], 5 );
+   ROUND2_16W( B, C, D, E, A, F16W_1, 11, in[11], 5 );
+   // Fold both lines into the chaining value; each sum lands one word position earlier.
+   tmp =  _mm512_add_epi32( _mm512_add_epi32( h[1], C1 ), D2 );   // becomes h[0], held until h[0] is consumed
+   h[1] = _mm512_add_epi32( _mm512_add_epi32( h[2], D1 ), E2 );
+   h[2] = _mm512_add_epi32( _mm512_add_epi32( h[3], E1 ), A2 );
+   h[3] = _mm512_add_epi32( _mm512_add_epi32( h[4], A1 ), B2 );
+   h[4] = _mm512_add_epi32( _mm512_add_epi32( h[0], B1 ), C2 );
+   h[0] = tmp;
+}
+
+void ripemd160_16way_init( ripemd160_16way_context *sc )   // reset chaining value and byte counters
+{
+   sc->val[0] = m512_const1_64( 0x6745230167452301 );   // each literal holds one 32 bit word twice
+   sc->val[1] = m512_const1_64( 0xEFCDAB89EFCDAB89 );
+   sc->val[2] = m512_const1_64( 0x98BADCFE98BADCFE );
+   sc->val[3] = m512_const1_64( 0x1032547610325476 );
+   sc->val[4] = m512_const1_64( 0xC3D2E1F0C3D2E1F0 );
+   sc->count_high = sc->count_low = 0;
+}
+
+void ripemd160_16way( ripemd160_16way_context *sc, const void *data,
+                      size_t len )   // absorb len bytes; one vector of data per 4 bytes of len
+{
+   __m512i *vdata = (__m512i*)data;
+   size_t ptr;
+   const int block_size = 64;
+
+   ptr = (unsigned)sc->count_low & (block_size - 1U);   // bytes already buffered in the open block
+   while ( len > 0 )
+   {
+      size_t clen;
+      uint32_t clow, clow2;
+
+      clen = block_size - ptr;   // room left in the block, capped to what remains
+      if ( clen > len )
+         clen = len;
+      memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 );   // >>2: bytes to words; NOTE(review): a clen not divisible by 4 is rounded down
+      vdata = vdata + (clen>>2);
+      ptr += clen;
+      len -= clen;
+      if ( ptr == block_size )   // block full: compress it and start over
+      {
+         ripemd160_16way_round( sc );
+         ptr = 0;
+      }
+      clow = sc->count_low;
+      clow2 = clow + clen;
+      sc->count_low = clow2;
+      if ( clow2 < clow )   // 32 bit wrap carries into count_high
+         sc->count_high++;
+   }
+}
+
+void ripemd160_16way_close( ripemd160_16way_context  *sc, void *dst )   // pad, compress, write 5 vectors to dst
+{
+   unsigned ptr, u;
+   uint32_t low, high;
+   const int block_size = 64;
+   const int pad = block_size - 8;   // the last 8 bytes of the block hold the length
+   // Padding starts with a 0x80 word right after the buffered data.
+   ptr = (unsigned)sc->count_low & ( block_size - 1U);
+   sc->buf[ ptr>>2 ] = m512_const1_32( 0x80 );
+   ptr += 4;
+   // No room left for the length: zero fill, compress, then start an empty block.
+   if ( ptr > pad )
+   {
+       memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
+       ripemd160_16way_round( sc );
+       memset_zero_512( sc->buf, pad>>2 );
+   }
+   else
+       memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
+   // Byte count times 8 gives the bit count, split over two 32 bit words.
+    low = sc->count_low;
+    high = (sc->count_high << 3) | (low >> 29);
+    low = low << 3;
+    sc->buf[  pad>>2      ] = _mm512_set1_epi32( low  );
+    sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
+    ripemd160_16way_round( sc );
+    for (u = 0; u < 5; u ++)
+        casti_m512i( dst, u ) = sc->val[u];
+}
+
+#endif  // AVX512
--- a/algo/ripemd/ripemd-hash-4way.h
+++ b/algo/ripemd/ripemd-hash-4way.h
@@ -32,7 +32,21 @@ void ripemd160_8way_init( ripemd160_8way_context *sc );
 void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len );
 void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+typedef struct
+{
+   __m512i buf[64>>2];                // 16 vectors: one per 32 bit word of a 64 byte block
+   __m512i val[5];                    // 5 word chaining value
+   uint32_t count_high, count_low;    // bytes absorbed so far, as two 32 bit halves
+} __attribute__ ((aligned (128))) ripemd160_16way_context;
+// Usage: init, then ripemd160_16way for each piece of input, then close to get the digest.
+void ripemd160_16way_init( ripemd160_16way_context *sc );
+void ripemd160_16way( ripemd160_16way_context *sc, const void *data,
+                      size_t len );
+void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );
+
+#endif // AVX512
 #endif // __AVX2__
 #endif // __SSE4_2__
 #endif // RIPEMD_HASH_4WAY_H__
--- a/algo/sha/sha-hash-4way.h
+++ b/algo/sha/sha-hash-4way.h
@@ -74,7 +74,8 @@ typedef struct {
 } sha256_8way_context __attribute__ ((aligned (128)));

 void sha256_8way_init( sha256_8way_context *sc );
-void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
+void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
+#define sha256_8way sha256_8way_update
 void sha256_8way_close( sha256_8way_context *sc, void *dst );

 //#define SPH_SIZE_sha512   512
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -100,9 +100,20 @@ c512( sph_shavite_big_context *sc, const void *msg )
   p3 = h[3];   

   // round
+
+//  working proof of concept   
+/*
+   __m512i K = m512_const1_128( m[0] );
+   __m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
+   X = _mm512_aesenc_epi128( X, m512_zero );
+   k00 = _mm512_castsi512_si128( K );
+   x = _mm512_castsi512_si128( X );
+*/
+
   k00 = m[0];
   x = _mm_xor_si128( p1, k00 );
   x = _mm_aesenc_si128( x, zero );
+
   k01 = m[1];
   x = _mm_xor_si128( x, k01 );
   x = _mm_aesenc_si128( x, zero );
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -52,10 +52,10 @@ bool register_x16r_algo( algo_gate_t* gate )

 bool register_x16rv2_algo( algo_gate_t* gate )
 {
-#if defined (X16R_8WAY)
+#if defined (X16RV2_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_8way;
  gate->hash      = (void*)&x16rv2_8way_hash;
-#elif defined (X16R_4WAY)
+#elif defined (X16RV2_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_4way;
  gate->hash      = (void*)&x16rv2_4way_hash;
 #else
@@ -205,10 +205,10 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )

 bool register_x16rt_algo( algo_gate_t* gate )
 {
-#if defined (X16R_8WAY)
+#if defined (X16RT_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16rt_8way_hash;
-#elif defined (X16R_4WAY)
+#elif defined (X16RT_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16rt_4way_hash;
 #else
@@ -222,10 +222,10 @@ bool register_x16rt_algo( algo_gate_t* gate )

 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
-#if defined (X16R_8WAY)
+#if defined (X16RT_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16rt_8way_hash;
-#elif defined (X16R_4WAY)
+#elif defined (X16RT_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16rt_4way_hash;
 #else
@@ -258,16 +258,23 @@ bool register_hex_algo( algo_gate_t* gate )

 bool register_x21s_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X21S_8WAY)
+  gate->scanhash          = (void*)&scanhash_x21s_8way;
+  gate->hash              = (void*)&x21s_8way_hash;
+  gate->miner_thread_init = (void*)&x21s_8way_thread_init;
+  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+#elif defined (X21S_4WAY)
  gate->scanhash          = (void*)&scanhash_x21s_4way;
  gate->hash              = (void*)&x21s_4way_hash;
  gate->miner_thread_init = (void*)&x21s_4way_thread_init;
+  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #else
  gate->scanhash          = (void*)&scanhash_x21s;
  gate->hash              = (void*)&x21s_hash;
  gate->miner_thread_init = (void*)&x21s_thread_init;
+  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #endif
-  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
+//  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
  x16_r_s_getAlgoString   = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -12,6 +12,24 @@
  #define X16R_4WAY 1
 #endif

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X16RV2_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X16RV2_4WAY 1
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X16RT_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X16RT_4WAY 1
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X21S_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X21S_4WAY 1
+#endif
+
 enum x16r_Algo {
        BLAKE = 0,
        BMW,
@@ -46,18 +64,13 @@ bool register_x16rt_algo( algo_gate_t* gate );
 bool register_hex__algo( algo_gate_t* gate );
 bool register_x21s__algo( algo_gate_t* gate );

+// x16r, x16s
 #if defined(X16R_8WAY)

 void x16r_8way_hash( void *state, const void *input );
 int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

-void x16rv2_8way_hash( void *state, const void *input );
-int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-void x16rt_8way_hash( void *state, const void *input );
-int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );

 #elif defined(X16R_4WAY)

@@ -65,31 +78,65 @@ void x16r_4way_hash( void *state, const void *input );
 int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

-void x16rv2_4way_hash( void *state, const void *input );
-int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-
-void x16rt_4way_hash( void *state, const void *input );
-int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-
 #else

 void x16r_hash( void *state, const void *input );
 int scanhash_x16r( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

+#endif
+
+// x16rv2
+#if defined(X16RV2_8WAY)   // AVX512 build (see X16RV2_8WAY above): 8-way entry points
+
+void x16rv2_8way_hash( void *state, const void *input );
+int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(X16RV2_4WAY)   // AVX2 + AES build: 4-way entry points
+
+void x16rv2_4way_hash( void *state, const void *input );
+int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#else   // neither X16RV2_8WAY nor X16RV2_4WAY: un-suffixed entry points
+
 void x16rv2_hash( void *state, const void *input );
 int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );

+#endif
+
+// x16rt, veil
+#if defined(X16RT_8WAY)   // AVX512 build (see X16RT_8WAY above): 8-way entry points
+
+void x16rt_8way_hash( void *state, const void *input );
+int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(X16RT_4WAY)   // AVX2 + AES build: 4-way entry points
+
+void x16rt_4way_hash( void *state, const void *input );
+int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#else   // neither X16RT_8WAY nor X16RT_4WAY: un-suffixed entry points
+
 void x16rt_hash( void *state, const void *input );
 int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );

 #endif

-#if defined(X16R_4WAY)
+// x21s
+#if defined(X21S_8WAY)
+
+void x21s_8way_hash( void *state, const void *input );
+int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+bool x21s_8way_thread_init();
+
+#elif defined(X21S_4WAY)

 void x21s_4way_hash( void *state, const void *input );
 int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -24,7 +24,7 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

-#if defined (X16R_8WAY)
+#if defined (X16RT_8WAY)

 union _x16rt_8way_context_overlay
 {
@@ -407,7 +407,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
   return 0;
 }

-#elif defined (X16R_4WAY)
+#elif defined (X16RT_4WAY)

 union _x16rt_4way_context_overlay
 {
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -31,7 +31,7 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

-#if defined (X16R_8WAY)
+#if defined (X16RV2_8WAY)

 union _x16rv2_8way_context_overlay
 {
@@ -497,10 +497,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
   return 0;
 }

-
-#elif defined (X16R_4WAY)
-
-
+#elif defined (X16RV2_4WAY)

 union _x16rv2_4way_context_overlay
 {
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -1,13 +1,10 @@
 /**
- * x16r algo implementation
+ * x21s algo implementation
 *
 * Implementation by tpruvot@github Jan 2018
 * Optimized by JayDDee@github Jan 2018
 */
 #include "x16r-gate.h"
-
-#if defined (X16R_4WAY)
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -21,6 +18,7 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -38,6 +36,483 @@

 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
+
+#if defined (X21S_8WAY)
+
+static __thread uint64_t* x21s_8way_matrix;
+
+union _x21s_8way_context_overlay   // every hash state x21s_8way_hash needs, sharing one storage area
+{
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;     // re-initialised and run once per lane
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;       // run twice: lanes 0-3, then lanes 4-7
+    cube_4way_context       cube;        // run twice: lanes 0-3, then lanes 4-7
+    sph_shavite512_context  shavite;     // re-initialised and run once per lane
+    simd_4way_context       simd;        // run twice: lanes 0-3, then lanes 4-7
+    hashState_echo          echo;        // re-initialised and run once per lane
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;       // re-initialised and run once per lane
+    shabal512_8way_context  shabal;
+    sph_whirlpool_context   whirlpool;   // re-initialised and run once per lane
+    sha512_8way_context     sha512;
+    haval256_5_8way_context haval;       // x21s tail stages from here on
+    sph_tiger_context       tiger;       // re-initialised and run once per lane
+    sph_gost512_context     gost;        // re-initialised and run once per lane
+    sha256_8way_context     sha256;
+} __attribute__ ((aligned (64)));
+
+typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;   // short name used for the local ctx
+
+void x21s_8way_hash( void* output, const void* input )
+{
+   // x21s over 8 nonces at once.  input: 8x64-interleaved 80-byte headers.
+   // output: the eight sha256 digests exactly as sha256_8way_close leaves them
+   // (callers extract lanes with extr_lane_8x32).  Uses x21s_8way_matrix.
+   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash0[24] __attribute__ ((aligned (64)));
+   uint32_t hash1[24] __attribute__ ((aligned (64)));
+   uint32_t hash2[24] __attribute__ ((aligned (64)));
+   uint32_t hash3[24] __attribute__ ((aligned (64)));
+   uint32_t hash4[24] __attribute__ ((aligned (64)));
+   uint32_t hash5[24] __attribute__ ((aligned (64)));
+   uint32_t hash6[24] __attribute__ ((aligned (64)));
+   uint32_t hash7[24] __attribute__ ((aligned (64)));
+   x21s_8way_context_overlay ctx;
+   void *in0 = (void*) hash0;
+   void *in1 = (void*) hash1;
+   void *in2 = (void*) hash2;
+   void *in3 = (void*) hash3;
+   void *in4 = (void*) hash4;
+   void *in5 = (void*) hash5;
+   void *in6 = (void*) hash6;
+   void *in7 = (void*) hash7;
+   int size = 80;   // bytes hashed per lane: 80 in round 0, 64 afterwards
+   // Split the interleaved headers into one linear 80-byte buffer per lane.
+   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                 input, 640 );
+   // 16 rounds selected by hashOrder; each round rewrites hash0..hash7.
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+      switch ( algo )
+      {
+         case BLAKE:
+            blake512_8way_init( &ctx.blake );
+            if ( i == 0 )   // round 0 can use the caller's interleaved data as is
+               blake512_8way_update( &ctx.blake, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               blake512_8way_update( &ctx.blake, vhash, size );
+            }
+            blake512_8way_close( &ctx.blake, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case BMW:
+            bmw512_8way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_8way_update( &ctx.bmw, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               bmw512_8way_update( &ctx.bmw, vhash, size );
+            }
+            bmw512_8way_close( &ctx.bmw, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case GROESTL:   // lanes are hashed one after another with one context
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                 (const char*)in0, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                 (const char*)in1, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                 (const char*)in2, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                 (const char*)in3, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash4,
+                                                 (const char*)in4, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash5,
+                                                 (const char*)in5, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash6,
+                                                 (const char*)in6, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash7,
+                                                 (const char*)in7, size<<3 );
+         break;
+         case SKEIN:
+            skein512_8way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_8way_update( &ctx.skein, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               skein512_8way_update( &ctx.skein, vhash, size );
+            }
+            skein512_8way_close( &ctx.skein, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case JH:
+            jh512_8way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_8way_update( &ctx.jh, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               jh512_8way_update( &ctx.jh, vhash, size );
+            }
+            jh512_8way_close( &ctx.jh, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case KECCAK:
+            keccak512_8way_init( &ctx.keccak );
+            if ( i == 0 )
+               keccak512_8way_update( &ctx.keccak, input, size );
+            else
+            {
+               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+               keccak512_8way_update( &ctx.keccak, vhash, size );
+            }
+            keccak512_8way_close( &ctx.keccak, vhash );
+            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case LUFFA:   // 4-way: lanes 0-3 first, then lanes 4-7
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            luffa_4way_init( &ctx.luffa, 512 );
+            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case CUBEHASH:   // length must be size: round 0 hashes 80 bytes, not 64
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            cube_4way_init( &ctx.cube, 512, 16, 32 );
+            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case SHAVITE:   // lanes are hashed one after another with one context
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in0, size );
+            sph_shavite512_close( &ctx.shavite, hash0 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in1, size );
+            sph_shavite512_close( &ctx.shavite, hash1 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in2, size );
+            sph_shavite512_close( &ctx.shavite, hash2 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in3, size );
+            sph_shavite512_close( &ctx.shavite, hash3 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in4, size );
+            sph_shavite512_close( &ctx.shavite, hash4 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in5, size );
+            sph_shavite512_close( &ctx.shavite, hash5 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in6, size );
+            sph_shavite512_close( &ctx.shavite, hash6 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in7, size );
+            sph_shavite512_close( &ctx.shavite, hash7 );
+         break;
+         case SIMD:   // 4-way: lanes 0-3 first, then lanes 4-7
+            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
+            simd_4way_init( &ctx.simd, 512 );
+            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+         break;
+         case ECHO:   // lanes are hashed one after another with one context
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
+                                (const BitSequence*)in0, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
+                                (const BitSequence*)in1, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
+                                (const BitSequence*)in2, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
+                                (const BitSequence*)in3, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
+                                (const BitSequence*)in4, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
+                                (const BitSequence*)in5, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
+                                (const BitSequence*)in6, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
+                                (const BitSequence*)in7, size<<3 );
+         break;
+         case HAMSI:
+             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+             hamsi512_8way_init( &ctx.hamsi );
+             hamsi512_8way_update( &ctx.hamsi, vhash, size );
+             hamsi512_8way_close( &ctx.hamsi, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+             break;
+         case FUGUE:   // lanes are hashed one after another with one context
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in0, size );
+             sph_fugue512_close( &ctx.fugue, hash0 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in1, size );
+             sph_fugue512_close( &ctx.fugue, hash1 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in2, size );
+             sph_fugue512_close( &ctx.fugue, hash2 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in3, size );
+             sph_fugue512_close( &ctx.fugue, hash3 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in4, size );
+             sph_fugue512_close( &ctx.fugue, hash4 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in5, size );
+             sph_fugue512_close( &ctx.fugue, hash5 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in6, size );
+             sph_fugue512_close( &ctx.fugue, hash6 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in7, size );
+             sph_fugue512_close( &ctx.fugue, hash7 );
+         break;
+         case SHABAL:   // the only round that interleaves 8x32 instead of 8x64
+             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                          size<<3 );
+             shabal512_8way_init( &ctx.shabal );
+             shabal512_8way_update( &ctx.shabal, vhash, size );
+             shabal512_8way_close( &ctx.shabal, vhash );
+             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+         case WHIRLPOOL:   // lanes are hashed one after another with one context
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in0, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash0 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in1, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash1 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in2, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash2 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in3, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash3 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in4, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash4 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in5, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash5 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in6, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash6 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in7, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash7 );
+         break;
+         case SHA_512:
+             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
+                            size<<3 );
+             sha512_8way_init( &ctx.sha512 );
+             sha512_8way_update( &ctx.sha512, vhash, size );
+             sha512_8way_close( &ctx.sha512, vhash );
+             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                          hash7, vhash );
+         break;
+      }
+      size = 64;   // every round after the first consumes a 64-byte digest
+   }
+   // Fixed x21s tail: haval256 -> tiger -> lyra2rev2 -> gost512 -> sha256.
+   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                    hash7 );
+   haval256_5_8way_init( &ctx.haval );
+   haval256_5_8way_update( &ctx.haval, vhash, 64 );
+   haval256_5_8way_close( &ctx.haval, vhash );
+
+   dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                     hash7, vhash );
+
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash0 );
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash1 );
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash2, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash2 );
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash3, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash3 );
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash4, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash4 );
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash5, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash5 );
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash6, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash6 );
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash7, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash7 );
+   // Lyra2REv2 handles two lanes per call, reusing the thread-local matrix.
+   intrlv_2x256( vhash, hash0, hash1, 256 );
+   LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hash2, hash3, 256 );
+   LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hash4, hash5, 256 );
+   LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hash6, hash7, 256 );
+   LYRA2REV2_2WAY( x21s_8way_matrix, vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash0 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash1 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash2, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash2 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash3, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash3 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash4, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash4 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash5, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash5 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash6, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash6 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash7, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash7 );
+
+   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                    hash7 );
+   sha256_8way_init( &ctx.sha256 );
+   sha256_8way_update( &ctx.sha256, vhash, 64 );
+   sha256_8way_close( &ctx.sha256, output );
+}
+
+int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   // Scans nonces 8 per pass from work->data[19]; submits lanes passing fulltest.
+   uint32_t vhash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t order_seed[2] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t *top_words = &vhash[7<<3];     // word 7 of each interleaved lane
+   __m512i *noncev = (__m512i*)vdata + 9;  // aligned
+   const int thr_id = mythr->id;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const uint32_t Htarg = ptarget[7];      // read before the benchmark override
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 16;
+   uint32_t nonce = first_nonce;
+
+   if ( opt_benchmark ) ptarget[7] = 0x0cff;
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+   // The hash order string is rebuilt only when the header's ntime changes.
+   order_seed[0] = bswap_32( pdata[1] );
+   order_seed[1] = bswap_32( pdata[2] );
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( ntime != s_ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)order_seed, hashOrder );
+      s_ntime = ntime;
+      if ( !thr_id && opt_debug )
+         applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   for (;;)
+   {
+      // nonce..nonce+7 sit in the odd 32-bit elements, one per 64-bit lane.
+      const __m512i lane_nonces =
+               _mm512_set_epi32( nonce+7, 0, nonce+6, 0, nonce+5, 0, nonce+4, 0,
+                                 nonce+3, 0, nonce+2, 0, nonce+1, 0, nonce,   0 );
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32( lane_nonces ), *noncev );
+      x21s_8way_hash( vhash, vdata );
+      pdata[19] = nonce;
+      for ( int lane = 0; lane < 8; lane++ )
+      {
+         if ( likely( top_words[lane] > Htarg ) ) continue;
+         extr_lane_8x32( lane_hash, vhash, lane, 256 );
+         if ( unlikely( !fulltest( lane_hash, ptarget ) || opt_benchmark ) )
+            continue;
+         pdata[19] = nonce + lane;
+         submit_lane_solution( work, lane_hash, mythr, lane );
+      }
+      nonce += 8;
+      if ( nonce >= last_nonce || *restart ) break;
+   }
+
+   *hashes_done = nonce - first_nonce;
+   return 0;
+}
+
+bool x21s_8way_thread_init()
+{
+   // Allocates this thread's matrix for LYRA2REV2_2WAY; result is the pointer as bool.
+   const int64_t row_words = BLOCK_LEN_INT64 * 4;        // nCols
+   const int64_t row_bytes = row_words * 8;
+   const int one_matrix = (int64_t)row_bytes * 4;        // nRows
+   x21s_8way_matrix = _mm_malloc( 2 * one_matrix, 64 );  // 64-byte aligned
+   return x21s_8way_matrix;
+}
+
+#elif defined (X21S_4WAY)
+
 static __thread uint64_t* x21s_4way_matrix;

 union _x21s_4way_context_overlay
--- a/algo/x22/x22i-4way.c
+++ b/algo/x22/x22i-4way.c
@@ -1,7 +1,4 @@
 #include "x22i-gate.h"
-
-#if defined(X22I_4WAY)
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -12,6 +9,7 @@
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/shavite-hash-2way.h"
+#include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -25,6 +23,426 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/swifftx/swifftx.h"

+
+#if defined(X22I_8WAY)
+
+// One context per X22i stage; stages run one at a time, so they share storage.
+typedef union _x22i_8way_ctx_overlay
+{
+    blake512_8way_context   blake;      // 8-way contexts
+    bmw512_8way_context     bmw;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    hamsi512_8way_context   hamsi;
+    shabal512_8way_context  shabal;
+    sha512_8way_context     sha512;
+    haval256_5_8way_context haval;
+    sha256_8way_context     sha256;
+    luffa_4way_context      luffa;      // 4-way contexts
+    cube_4way_context       cube;
+    simd_4way_context       simd;
+    hashState_groestl       groestl;    // single-lane contexts
+    hashState_echo          echo;
+    sph_shavite512_context  shavite;
+    sph_fugue512_context    fugue;
+    sph_whirlpool_context   whirlpool;
+    sph_tiger_context       tiger;
+    sph_gost512_context     gost;
+} x22i_8way_ctx_overlay;
+// X22i chain for 8 lanes: 80 input bytes per lane in, sha256_8way result to output.
+void x22i_8way_hash( void *output, const void *input )
+{
+   uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+   uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
+   uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
+   uint64_t hash0[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash1[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash2[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash3[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash4[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash5[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash6[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash7[8*4] __attribute__ ((aligned (64)));
+
+   // Per-lane scratch: receives the SWIFFTX result, later the Tiger result.
+   unsigned char hashA0[64]    __attribute__((aligned(64))) = {0};
+   unsigned char hashA1[64]    __attribute__((aligned(32))) = {0};
+   unsigned char hashA2[64]    __attribute__((aligned(32))) = {0};
+   unsigned char hashA3[64]    __attribute__((aligned(32))) = {0};
+   unsigned char hashA4[64]    __attribute__((aligned(64))) = {0};
+   unsigned char hashA5[64]    __attribute__((aligned(32))) = {0};
+   unsigned char hashA6[64]    __attribute__((aligned(32))) = {0};
+   unsigned char hashA7[64]    __attribute__((aligned(32))) = {0};
+   x22i_8way_ctx_overlay ctx;
+   // 1-2: Blake-512 on the 80-byte input, then BMW-512, both 8-way.
+   blake512_8way_init( &ctx.blake );
+   blake512_8way_update( &ctx.blake, input, 80 );
+   blake512_8way_close( &ctx.blake, vhash );
+
+   bmw512_8way_init( &ctx.bmw );
+   bmw512_8way_update( &ctx.bmw, vhash, 64 );
+   bmw512_8way_close( &ctx.bmw, vhash );
+   // 3: Groestl is run one lane at a time, so split vhash into hash0..hash7.
+   dintrlv_8x64_512( hash0, hash1, hash2, hash3,
+                     hash4, hash5, hash6, hash7, vhash );
+
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                  (const char*)hash0, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                  (const char*)hash1, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                  (const char*)hash2, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                  (const char*)hash3, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash4,
+                                  (const char*)hash4, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash5,
+                                  (const char*)hash5, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash6,
+                                  (const char*)hash6, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash7,
+                                  (const char*)hash7, 512 );
+   // Re-interleave 8x64 for stages 4-6: Skein, JH, Keccak (all 8-way).
+   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
+                           hash4, hash5, hash6, hash7 );
+
+   skein512_8way_init( &ctx.skein );
+   skein512_8way_update( &ctx.skein, vhash, 64 );
+   skein512_8way_close( &ctx.skein, vhash );
+
+   jh512_8way_init( &ctx.jh );
+   jh512_8way_update( &ctx.jh, vhash, 64 );
+   jh512_8way_close( &ctx.jh, vhash );
+
+   keccak512_8way_init( &ctx.keccak );
+   keccak512_8way_update( &ctx.keccak, vhash, 64 );
+   keccak512_8way_close( &ctx.keccak, vhash );
+   // 7-8: Luffa and CubeHash are 4-way: convert to two 4x128 halves (A, B).
+   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
+
+   luffa_4way_init( &ctx.luffa, 512 );
+   luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
+   luffa_4way_init( &ctx.luffa, 512 );
+   luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
+
+   cube_4way_init( &ctx.cube, 512, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
+   cube_4way_init( &ctx.cube, 512, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
+   // 9: Shavite is run one lane at a time: de-interleave, hash, re-interleave.
+   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
+   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
+
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash0, 64 );
+   sph_shavite512_close( &ctx.shavite, hash0 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash1, 64 );
+   sph_shavite512_close( &ctx.shavite, hash1 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash2, 64 );
+   sph_shavite512_close( &ctx.shavite, hash2 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash3, 64 );
+   sph_shavite512_close( &ctx.shavite, hash3 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash4, 64 );
+   sph_shavite512_close( &ctx.shavite, hash4 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash5, 64 );
+   sph_shavite512_close( &ctx.shavite, hash5 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash6, 64 );
+   sph_shavite512_close( &ctx.shavite, hash6 );
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hash7, 64 );
+   sph_shavite512_close( &ctx.shavite, hash7 );
+
+   intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
+   intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
+   // 10: SIMD, 4-way on each half.
+   simd_4way_init( &ctx.simd, 512 );
+   simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
+   simd_4way_init( &ctx.simd, 512 );
+   simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
+   // 11: Echo, one lane at a time.
+   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
+   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
+
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash0,
+                            (const BitSequence*)hash0, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash1,
+                            (const BitSequence*)hash1, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash2,
+                            (const BitSequence*)hash2, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash3,
+                            (const BitSequence*)hash3, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash4,
+                            (const BitSequence*)hash4, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash5,
+                            (const BitSequence*)hash5, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash6,
+                            (const BitSequence*)hash6, 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash7,
+                            (const BitSequence*)hash7, 512 );
+   // 12: Hamsi, 8-way on 8x64-interleaved data.
+   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
+                           hash4, hash5, hash6, hash7 );
+
+   hamsi512_8way_init( &ctx.hamsi );
+   hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
+   hamsi512_8way_close( &ctx.hamsi, vhash );
+   // 13: Fugue, one lane at a time; result stays at the start of each hashN.
+   dintrlv_8x64_512( hash0, hash1, hash2, hash3,
+                     hash4, hash5, hash6, hash7, vhash );
+
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash0, 64 );
+   sph_fugue512_close( &ctx.fugue, hash0 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash1, 64 );
+   sph_fugue512_close( &ctx.fugue, hash1 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash2, 64 );
+   sph_fugue512_close( &ctx.fugue, hash2 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash3, 64 );
+   sph_fugue512_close( &ctx.fugue, hash3 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash4, 64 );
+   sph_fugue512_close( &ctx.fugue, hash4 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash5, 64 );
+   sph_fugue512_close( &ctx.fugue, hash5 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash6, 64 );
+   sph_fugue512_close( &ctx.fugue, hash6 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash7, 64 );
+   sph_fugue512_close( &ctx.fugue, hash7 );
+   // 14: Shabal, 8-way with 32-bit interleaving; result goes to hashN[8..15].
+   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,
+                           hash4, hash5, hash6, hash7 );
+
+   shabal512_8way_init( &ctx.shabal );
+   shabal512_8way_update( &ctx.shabal, vhash, 64 );
+   shabal512_8way_close( &ctx.shabal, vhash );
+
+   dintrlv_8x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8],
+                     &hash4[8], &hash5[8], &hash6[8], &hash7[8], vhash );
+   // 15: Whirlpool per lane, hashN[8..15] -> hashN[16..23].
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash0[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash0[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash1[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash1[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash2[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash2[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash3[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash3[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash4[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash4[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash5[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash5[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash6[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash6[16] );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, &hash7[8], 64 );
+   sph_whirlpool_close( &ctx.whirlpool, &hash7[16] );
+   // 16: SHA-512, 8-way; result goes to hashN[24..31].
+   intrlv_8x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16],
+                           &hash4[16], &hash5[16], &hash6[16], &hash7[16] );
+
+   sha512_8way_init( &ctx.sha512 );
+   sha512_8way_update( &ctx.sha512, vhash, 64 );
+   sha512_8way_close( &ctx.sha512, vhash );
+
+   dintrlv_8x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24],
+                     &hash4[24], &hash5[24], &hash6[24], &hash7[24], vhash );
+   // 17: SWIFFTX is given each hashN buffer, which now holds stages 13-16.
+   ComputeSingleSWIFFTX((unsigned char*)hash0, (unsigned char*)hashA0);
+   ComputeSingleSWIFFTX((unsigned char*)hash1, (unsigned char*)hashA1);
+   ComputeSingleSWIFFTX((unsigned char*)hash2, (unsigned char*)hashA2);
+   ComputeSingleSWIFFTX((unsigned char*)hash3, (unsigned char*)hashA3);
+   ComputeSingleSWIFFTX((unsigned char*)hash4, (unsigned char*)hashA4);
+   ComputeSingleSWIFFTX((unsigned char*)hash5, (unsigned char*)hashA5);
+   ComputeSingleSWIFFTX((unsigned char*)hash6, (unsigned char*)hashA6);
+   ComputeSingleSWIFFTX((unsigned char*)hash7, (unsigned char*)hashA7);
+   // 18: Haval-256/5, 8-way, on the interleaved SWIFFTX results.
+   intrlv_8x32_512( vhashA, hashA0, hashA1, hashA2, hashA3,
+                            hashA4, hashA5, hashA6, hashA7 );
+
+   memset( vhash, 0, 64*8 );
+
+   haval256_5_8way_init( &ctx.haval );
+   haval256_5_8way_update( &ctx.haval, vhashA, 64 );
+   haval256_5_8way_close( &ctx.haval, vhash );
+
+   dintrlv_8x32_512( hash0, hash1, hash2, hash3,
+                     hash4, hash5, hash6, hash7, vhash );
+   // 19: Tiger per lane into freshly zeroed hashA buffers.
+   memset( hashA0, 0, 64 );
+   memset( hashA1, 0, 64 );
+   memset( hashA2, 0, 64 );
+   memset( hashA3, 0, 64 );
+   memset( hashA4, 0, 64 );
+   memset( hashA5, 0, 64 );
+   memset( hashA6, 0, 64 );
+   memset( hashA7, 0, 64 );
+
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash0, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA0);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash1, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA1);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash2, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA2);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash3, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA3);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash4, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA4);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash5, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA5);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash6, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA6);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash7, 64);
+   sph_tiger_close(&ctx.tiger, (void*) hashA7);
+   // Clear the first 64 bytes of each lane buffer ahead of stage 20.
+   memset( hash0, 0, 64 );
+   memset( hash1, 0, 64 );
+   memset( hash2, 0, 64 );
+   memset( hash3, 0, 64 );
+   memset( hash4, 0, 64 );
+   memset( hash5, 0, 64 );
+   memset( hash6, 0, 64 );
+   memset( hash7, 0, 64 );
+   // 20: Lyra2 (LYRA2RE_2WAY), two lanes interleaved per call.
+   intrlv_2x256( vhash, hashA0, hashA1, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash0, hash1, vhash, 256 );
+   intrlv_2x256( vhash, hashA2, hashA3, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash2, hash3, vhash, 256 );
+   intrlv_2x256( vhash, hashA4, hashA5, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash4, hash5, vhash, 256 );
+   intrlv_2x256( vhash, hashA6, hashA7, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   // 21: GOST-512 per lane, in place.
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash0 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash1 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash2, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash2 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash3, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash3 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash4, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash4 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash5, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash5 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash6, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash6 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash7, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash7 );
+   // 22: SHA-256, 8-way with 32-bit interleaving, written straight to output.
+   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,
+                           hash4, hash5, hash6, hash7 );
+
+   sha256_8way_init( &ctx.sha256 );
+   sha256_8way_update( &ctx.sha256, vhash, 64 );
+   sha256_8way_close( &ctx.sha256, output );
+}
+// 8-way X22i nonce scan: tests 8 nonces per pass; always returns 0.
+int scanhash_x22i_8way( struct work* work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<3]);
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 8;
+   const int thr_id = mythr->id;
+   const uint32_t Htarg = ptarget[7];
+   // NOTE(review): Htarg above is read before this benchmark override.
+   if (opt_benchmark)
+      ((uint32_t*)ptarget)[7] = 0x08ff;
+
+   InitializeSWIFFTX();
+   // Byte-swap and 8x64-interleave the 80-byte header once; nonces vary per pass.
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                 n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+      x22i_8way_hash( hash, vdata );
+      // Cheap pre-filter on hash word 7 before the full target test.
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( hash7[ lane ] <= Htarg ) )
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         {
+            pdata[19] = n + lane;
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;   // record the scan position, as the x21s 8-way loop does
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+
+#elif defined(X22I_4WAY)
+
+
 union _x22i_4way_ctx_overlay
 {
    blake512_4way_context   blake;
--- a/algo/x22/x22i-gate.c
+++ b/algo/x22/x22i-gate.c
@@ -2,27 +2,39 @@

 bool register_x22i_algo( algo_gate_t* gate )
 {
-#if defined (X22I_4WAY)
+#if defined (X22I_8WAY)   // AVX-512 variant, selected in x22i-gate.h
+  gate->scanhash  = (void*)&scanhash_x22i_8way;
+  gate->hash      = (void*)&x22i_8way_hash;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+#elif defined (X22I_4WAY)   // AVX2 + AES variant
   gate->scanhash  = (void*)&scanhash_x22i_4way;
   gate->hash      = (void*)&x22i_4way_hash;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #else
   gate->scanhash  = (void*)&scanhash_x22i;
   gate->hash      = (void*)&x22i_hash;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
   return true;
 };

 bool register_x25x_algo( algo_gate_t* gate )
 {
-#if defined (X22I_4WAY)
+#if defined (X25X_8WAY)
+  gate->scanhash  = (void*)&scanhash_x25x_8way;
+  gate->hash      = (void*)&x25x_8way_hash;
+  // NOTE: X25X_8WAY is never defined at present (disabled in x22i-gate.h).
+#elif defined (X25X_4WAY)
   gate->scanhash  = (void*)&scanhash_x25x_4way;
   gate->hash      = (void*)&x25x_4way_hash;
+  // Flags are not set per variant; see the single assignment after #endif.
 #else
   gate->scanhash  = (void*)&scanhash_x25x;
   gate->hash      = (void*)&x25x_hash;
+  // Same flags as the other variants, assigned below.
 #endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
+
   return true;
 };

--- a/algo/x22/x22i-gate.h
+++ b/algo/x22/x22i-gate.h
@@ -6,30 +6,64 @@
 #include <stdint.h>
 #include <unistd.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X22I_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X22I_8WAY 1   // 8-lane variant, needs all four AVX-512 subsets
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X22I_4WAY 1   // 4-lane variant, needs AVX2 and AES
 #endif

-bool register_x22i__algo( algo_gate_t* gate );
+bool register_x22i_algo( algo_gate_t* gate );

-#if defined(X22I_4WAY)
+#if defined(X22I_8WAY)
+
+void x22i_8way_hash( void *state, const void *input );
+int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(X22I_4WAY)

 void x22i_4way_hash( void *state, const void *input );
 int scanhash_x22i_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

-void x25x_4way_hash( void *state, const void *input );
-int scanhash_x25x_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
+#else

 void x22i_hash( void *state, const void *input );
 int scanhash_x22i( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

+#endif
+
+
+// Big problems with x25x 8 way. It blows up just by increasing the
+// buffer sizes and nothing else. It may have to do with accessing 2 dim arrays.
+// X25X_8WAY therefore stays disabled; only the 4-way variant can be selected.
+//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+//  #define X25X_8WAY 1
+#if defined(__AVX2__) && defined(__AES__)
+  #define X25X_4WAY 1
+#endif
+// Registration entry point for x25x, defined in x22i-gate.c.
+bool register_x25x_algo( algo_gate_t* gate );
+
+#if defined(X25X_8WAY)
+
+void x25x_8way_hash( void *state, const void *input );
+int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(X25X_4WAY)
+
+void x25x_4way_hash( void *state, const void *input );
+int scanhash_x25x_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#else
+
 void x25x_hash( void *state, const void *input );
 int scanhash_x25x( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

+#endif
+
 #endif  // X22I_GATE_H__
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -1,6 +1,6 @@
 #include "x22i-gate.h"

-#if defined(X22I_4WAY)
+#if defined(X25X_4WAY)

 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
@@ -88,276 +88,282 @@ void x25x_4way_hash( void *output, const void *input )
   unsigned char hash2[25][64] __attribute__((aligned(64))) = {0};
   unsigned char hash3[25][64] __attribute__((aligned(64))) = {0};
   uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-   unsigned char vhashA[24][64*4] __attribute__ ((aligned (64)));
+// Doubling the size of vhashX breaks everything. It may have something
+// to do with accessing arrays: vhashX vs vhashX[0] vs &vhashX[0].
+// Changing notation did seem to allow the larger buffer but still resulted
+// in problems further along.
+//   unsigned char vhashX[24][64*8] __attribute__ ((aligned (64)));
+   unsigned char vhashX[24][64*4] __attribute__ ((aligned (64)));
   x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));

   blake512_4way_init( &ctx.blake );
   blake512_4way( &ctx.blake, input, 80 );
   blake512_4way_close( &ctx.blake, vhash );
-   dintrlv_4x64_512( &hash0[0], &hash1[0], &hash2[0], &hash3[0], vhash );
+   dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );

   bmw512_4way_init( &ctx.bmw );
   bmw512_4way( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );
-   dintrlv_4x64_512( &hash0[1], &hash1[1], &hash2[1], &hash3[1], vhash );
+   dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash );

   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)&hash0[2],
-                                  (const char*)&hash0[1], 512 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash0[2],
+                                  (const char*)hash0[1], 512 );
   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)&hash1[2],
-                                  (const char*)&hash1[1], 512 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash1[2],
+                                  (const char*)hash1[1], 512 );
   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)&hash2[2],
-                                  (const char*)&hash2[1], 512 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash2[2],
+                                  (const char*)hash2[1], 512 );
   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)&hash3[2],
-                                  (const char*)&hash3[1], 512 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash3[2],
+                                  (const char*)hash3[1], 512 );
   
-   intrlv_4x64_512( vhash, &hash0[2], &hash1[2], &hash2[2], &hash3[2] );
+   intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] );

   skein512_4way_init( &ctx.skein );
   skein512_4way( &ctx.skein, vhash, 64 );
   skein512_4way_close( &ctx.skein, vhash );
-   dintrlv_4x64_512( &hash0[3], &hash1[3], &hash2[3], &hash3[3], vhash );
+   dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash );

   jh512_4way_init( &ctx.jh );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );
-   dintrlv_4x64_512( &hash0[4], &hash1[4], &hash2[4], &hash3[4], vhash );
+   dintrlv_4x64_512( hash0[4], hash1[4], hash2[4], hash3[4], vhash );

   keccak512_4way_init( &ctx.keccak );
   keccak512_4way( &ctx.keccak, vhash, 64 );
   keccak512_4way_close( &ctx.keccak, vhash );
-   dintrlv_4x64_512( &hash0[5], &hash1[5], &hash2[5], &hash3[5], vhash );
+   dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash );
   
   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash0[6],
-                                (const BitSequence*)&hash0[5], 64 );
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0[6],
+                                (const BitSequence*)hash0[5], 64 );
   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash1[6],
-                                (const BitSequence*)&hash1[5], 64 );
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1[6],
+                                (const BitSequence*)hash1[5], 64 );
   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash2[6],
-                                (const BitSequence*)&hash2[5], 64 );
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2[6],
+                                (const BitSequence*)hash2[5], 64 );
   init_luffa( &ctx.luffa, 512 );
-   update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash3[6],
-                                (const BitSequence*)&hash3[5], 64 );
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3[6],
+                                (const BitSequence*)hash3[5], 64 );

   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) &hash0[7],
-                              (const byte*)&hash0[6], 64 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash0[7],
+                              (const byte*)hash0[6], 64 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) &hash1[7],
-                              (const byte*)&hash1[6], 64 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash1[7],
+                              (const byte*)hash1[6], 64 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) &hash2[7],
-                              (const byte*)&hash2[6], 64 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash2[7],
+                              (const byte*)hash2[6], 64 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) &hash3[7],
-                              (const byte*)&hash3[6], 64 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash3[7],
+                              (const byte*)hash3[6], 64 );

 	sph_shavite512_init(&ctx.shavite);
-	sph_shavite512(&ctx.shavite, (const void*) &hash0[7], 64);
-	sph_shavite512_close(&ctx.shavite, &hash0[8]);
+	sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64);
+	sph_shavite512_close(&ctx.shavite, hash0[8]);
   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) &hash1[7], 64);
-   sph_shavite512_close(&ctx.shavite, &hash1[8]);
+   sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash1[8]);
   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) &hash2[7], 64);
-   sph_shavite512_close(&ctx.shavite, &hash2[8]);
+   sph_shavite512(&ctx.shavite, (const void*) hash2[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash2[8]);
   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) &hash3[7], 64);
-   sph_shavite512_close(&ctx.shavite, &hash3[8]);
+   sph_shavite512(&ctx.shavite, (const void*) hash3[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash3[8]);

   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)&hash0[9],
-                         (const BitSequence*)&hash0[8], 512 );
+   update_final_sd( &ctx.simd, (BitSequence*)hash0[9],
+                         (const BitSequence*)hash0[8], 512 );
   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)&hash1[9],
-                         (const BitSequence*)&hash1[8], 512 );
+   update_final_sd( &ctx.simd, (BitSequence*)hash1[9],
+                         (const BitSequence*)hash1[8], 512 );
   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)&hash2[9],
-                         (const BitSequence*)&hash2[8], 512 );
+   update_final_sd( &ctx.simd, (BitSequence*)hash2[9],
+                         (const BitSequence*)hash2[8], 512 );
   init_sd( &ctx.simd, 512 );
-   update_final_sd( &ctx.simd, (BitSequence*)&hash3[9],
-                         (const BitSequence*)&hash3[8], 512 );
+   update_final_sd( &ctx.simd, (BitSequence*)hash3[9],
+                         (const BitSequence*)hash3[8], 512 );

   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)&hash0[10],
-                            (const BitSequence*)&hash0[9], 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash0[10],
+                            (const BitSequence*)hash0[9], 512 );
   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)&hash1[10],
-                            (const BitSequence*)&hash1[9], 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash1[10],
+                            (const BitSequence*)hash1[9], 512 );
   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)&hash2[10],
-                            (const BitSequence*)&hash2[9], 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash2[10],
+                            (const BitSequence*)hash2[9], 512 );
   init_echo( &ctx.echo, 512 );
-   update_final_echo ( &ctx.echo, (BitSequence*)&hash3[10],
-                            (const BitSequence*)&hash3[9], 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash3[10],
+                            (const BitSequence*)hash3[9], 512 );

-   intrlv_4x64_512( vhash, &hash0[10], &hash1[10], &hash2[10], &hash3[10] );
+   intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] );

   hamsi512_4way_init( &ctx.hamsi );
   hamsi512_4way( &ctx.hamsi, vhash, 64 );
   hamsi512_4way_close( &ctx.hamsi, vhash );
-   dintrlv_4x64_512( &hash0[11], &hash1[11], &hash2[11], &hash3[11], vhash );
+   dintrlv_4x64_512( hash0[11], hash1[11], hash2[11], hash3[11], vhash );

 	sph_fugue512_init(&ctx.fugue);
-	sph_fugue512(&ctx.fugue, (const void*) &hash0[11], 64);
-	sph_fugue512_close(&ctx.fugue, &hash0[12]);
+	sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64);
+	sph_fugue512_close(&ctx.fugue, hash0[12]);
   sph_fugue512_init(&ctx.fugue);
-   sph_fugue512(&ctx.fugue, (const void*) &hash1[11], 64);
-   sph_fugue512_close(&ctx.fugue, &hash1[12]);
+   sph_fugue512(&ctx.fugue, (const void*) hash1[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash1[12]);
   sph_fugue512_init(&ctx.fugue);
-   sph_fugue512(&ctx.fugue, (const void*) &hash2[11], 64);
-   sph_fugue512_close(&ctx.fugue, &hash2[12]);
+   sph_fugue512(&ctx.fugue, (const void*) hash2[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash2[12]);
   sph_fugue512_init(&ctx.fugue);
-   sph_fugue512(&ctx.fugue, (const void*) &hash3[11], 64);
-   sph_fugue512_close(&ctx.fugue, &hash3[12]);
+   sph_fugue512(&ctx.fugue, (const void*) hash3[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash3[12]);

-   intrlv_4x32_512( vhash, &hash0[12], &hash1[12], &hash2[12], &hash3[12] );
+   intrlv_4x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12] );

   shabal512_4way_init( &ctx.shabal );
   shabal512_4way( &ctx.shabal, vhash, 64 );
   shabal512_4way_close( &ctx.shabal, vhash );
-   dintrlv_4x32_512( &hash0[13], &hash1[13], &hash2[13], &hash3[13], vhash );
+   dintrlv_4x32_512( hash0[13], hash1[13], hash2[13], hash3[13], vhash );

 	sph_whirlpool_init(&ctx.whirlpool);
-	sph_whirlpool (&ctx.whirlpool, (const void*) &hash0[13], 64);
-	sph_whirlpool_close(&ctx.whirlpool, &hash0[14]);
+	sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64);
+	sph_whirlpool_close(&ctx.whirlpool, hash0[14]);
   sph_whirlpool_init(&ctx.whirlpool);
-   sph_whirlpool (&ctx.whirlpool, (const void*) &hash1[13], 64);
-   sph_whirlpool_close(&ctx.whirlpool, &hash1[14]);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash1[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash1[14]);
   sph_whirlpool_init(&ctx.whirlpool);
-   sph_whirlpool (&ctx.whirlpool, (const void*) &hash2[13], 64);
-   sph_whirlpool_close(&ctx.whirlpool, &hash2[14]);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash2[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash2[14]);
   sph_whirlpool_init(&ctx.whirlpool);
-   sph_whirlpool (&ctx.whirlpool, (const void*) &hash3[13], 64);
-   sph_whirlpool_close(&ctx.whirlpool, &hash3[14]);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash3[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash3[14]);

-   intrlv_4x64_512( vhash, &hash0[14], &hash1[14], &hash2[14], &hash3[14] );
+   intrlv_4x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14] );

   sha512_4way_init( &ctx.sha512 );
   sha512_4way( &ctx.sha512, vhash, 64 );
   sha512_4way_close( &ctx.sha512, vhash );
-   dintrlv_4x64_512( &hash0[15], &hash1[15], &hash2[15], &hash3[15], vhash );
+   dintrlv_4x64_512( hash0[15], hash1[15], hash2[15], hash3[15], vhash );


-   ComputeSingleSWIFFTX((unsigned char*)&hash0[12], (unsigned char*)&hash0[16]);
-   ComputeSingleSWIFFTX((unsigned char*)&hash1[12], (unsigned char*)&hash1[16]);
-   ComputeSingleSWIFFTX((unsigned char*)&hash2[12], (unsigned char*)&hash2[16]);
-   ComputeSingleSWIFFTX((unsigned char*)&hash3[12], (unsigned char*)&hash3[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash1[12], (unsigned char*)hash1[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash2[12], (unsigned char*)hash2[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash3[12], (unsigned char*)hash3[16]);

-   intrlv_4x32_512( &vhashA, &hash0[16], &hash1[16], &hash2[16], &hash3[16] );
+   intrlv_4x32_512( vhashX[0], hash0[16], hash1[16], hash2[16], hash3[16] );

   memset( vhash, 0, 64*4 );
   
   haval256_5_4way_init( &ctx.haval );
-   haval256_5_4way( &ctx.haval, vhashA, 64 );
+   haval256_5_4way( &ctx.haval, vhashX[0], 64 );
   haval256_5_4way_close( &ctx.haval, vhash );
-   dintrlv_4x32_512( &hash0[17], &hash1[17], &hash2[17], &hash3[17], vhash );
+   dintrlv_4x32_512( hash0[17], hash1[17], hash2[17], hash3[17], vhash );

 	sph_tiger_init(&ctx.tiger);
-	sph_tiger (&ctx.tiger, (const void*) &hash0[17], 64);
-	sph_tiger_close(&ctx.tiger, (void*) &hash0[18]);
+	sph_tiger (&ctx.tiger, (const void*) hash0[17], 64);
+	sph_tiger_close(&ctx.tiger, (void*) hash0[18]);
   sph_tiger_init(&ctx.tiger);
-   sph_tiger (&ctx.tiger, (const void*) &hash1[17], 64);
-   sph_tiger_close(&ctx.tiger, (void*) &hash1[18]);
+   sph_tiger (&ctx.tiger, (const void*) hash1[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash1[18]);
   sph_tiger_init(&ctx.tiger);
-   sph_tiger (&ctx.tiger, (const void*) &hash2[17], 64);
-   sph_tiger_close(&ctx.tiger, (void*) &hash2[18]);
+   sph_tiger (&ctx.tiger, (const void*) hash2[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash2[18]);
   sph_tiger_init(&ctx.tiger);
-   sph_tiger (&ctx.tiger, (const void*) &hash3[17], 64);
-   sph_tiger_close(&ctx.tiger, (void*) &hash3[18]);
+   sph_tiger (&ctx.tiger, (const void*) hash3[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash3[18]);

-	LYRA2RE( (void*)&hash0[19], 32, (const void*)&hash0[18], 32,
-            (const void*)&hash0[18], 32, 1, 4, 4 );
-   LYRA2RE( (void*)&hash1[19], 32, (const void*)&hash1[18], 32,
-            (const void*)&hash1[18], 32, 1, 4, 4 );
-   LYRA2RE( (void*)&hash2[19], 32, (const void*)&hash2[18], 32,
-            (const void*)&hash2[18], 32, 1, 4, 4 );
-   LYRA2RE( (void*)&hash3[19], 32, (const void*)&hash3[18], 32,
-            (const void*)&hash3[18], 32, 1, 4, 4 );
+	LYRA2RE( (void*)hash0[19], 32, (const void*)hash0[18], 32,
+            (const void*)hash0[18], 32, 1, 4, 4 );
+   LYRA2RE( (void*)hash1[19], 32, (const void*)hash1[18], 32,
+            (const void*)hash1[18], 32, 1, 4, 4 );
+   LYRA2RE( (void*)hash2[19], 32, (const void*)hash2[18], 32,
+            (const void*)hash2[18], 32, 1, 4, 4 );
+   LYRA2RE( (void*)hash3[19], 32, (const void*)hash3[18], 32,
+            (const void*)hash3[18], 32, 1, 4, 4 );

 	sph_gost512_init(&ctx.gost);
-	sph_gost512 (&ctx.gost, (const void*) &hash0[19], 64);
-	sph_gost512_close(&ctx.gost, (void*) &hash0[20]);
+	sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
+	sph_gost512_close(&ctx.gost, (void*) hash0[20]);
   sph_gost512_init(&ctx.gost);
-   sph_gost512 (&ctx.gost, (const void*) &hash1[19], 64);
-   sph_gost512_close(&ctx.gost, (void*) &hash1[20]);
+   sph_gost512 (&ctx.gost, (const void*) hash1[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash1[20]);
   sph_gost512_init(&ctx.gost);
-   sph_gost512 (&ctx.gost, (const void*) &hash2[19], 64);
-   sph_gost512_close(&ctx.gost, (void*) &hash2[20]);
+   sph_gost512 (&ctx.gost, (const void*) hash2[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash2[20]);
   sph_gost512_init(&ctx.gost);
-   sph_gost512 (&ctx.gost, (const void*) &hash3[19], 64);
-   sph_gost512_close(&ctx.gost, (void*) &hash3[20]);
+   sph_gost512 (&ctx.gost, (const void*) hash3[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash3[20]);

-   intrlv_4x32_512( vhashA, &hash0[20], &hash1[20], &hash2[20], &hash3[20] );
+   intrlv_4x32_512( vhashX[0], hash0[20], hash1[20], hash2[20], hash3[20] );
   memset( vhash, 0, 64*4 );

   sha256_4way_init( &ctx.sha256 );
-   sha256_4way( &ctx.sha256, vhashA, 64 );
+   sha256_4way( &ctx.sha256, vhashX[0], 64 );
   sha256_4way_close( &ctx.sha256, vhash );
-   dintrlv_4x32_512( &hash0[21], &hash1[21], &hash2[21], &hash3[21], vhash );
+   dintrlv_4x32_512( hash0[21], hash1[21], hash2[21], hash3[21], vhash );

   sph_panama_init(&ctx.panama);
-   sph_panama (&ctx.panama, (const void*) &hash0[21], 64 );
-   sph_panama_close(&ctx.panama, (void*) &hash0[22]);
+   sph_panama (&ctx.panama, (const void*) hash0[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash0[22]);
   sph_panama_init(&ctx.panama);
-   sph_panama (&ctx.panama, (const void*) &hash1[21], 64 );
-   sph_panama_close(&ctx.panama, (void*) &hash1[22]);
+   sph_panama (&ctx.panama, (const void*) hash1[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash1[22]);
   sph_panama_init(&ctx.panama);
-   sph_panama (&ctx.panama, (const void*) &hash2[21], 64 );
-   sph_panama_close(&ctx.panama, (void*) &hash2[22]);
+   sph_panama (&ctx.panama, (const void*) hash2[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash2[22]);
   sph_panama_init(&ctx.panama);
-   sph_panama (&ctx.panama, (const void*) &hash3[21], 64 );
-   sph_panama_close(&ctx.panama, (void*) &hash3[22]);
+   sph_panama (&ctx.panama, (const void*) hash3[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash3[22]);

-   laneHash(512, (const BitSequence*)&hash0[22], 512, (BitSequence*)&hash0[23]);
-   laneHash(512, (const BitSequence*)&hash1[22], 512, (BitSequence*)&hash1[23]);
-   laneHash(512, (const BitSequence*)&hash2[22], 512, (BitSequence*)&hash2[23]);
-   laneHash(512, (const BitSequence*)&hash3[22], 512, (BitSequence*)&hash3[23]);
+   laneHash(512, (const BitSequence*)hash0[22], 512, (BitSequence*)hash0[23]);
+   laneHash(512, (const BitSequence*)hash1[22], 512, (BitSequence*)hash1[23]);
+   laneHash(512, (const BitSequence*)hash2[22], 512, (BitSequence*)hash2[23]);
+   laneHash(512, (const BitSequence*)hash3[22], 512, (BitSequence*)hash3[23]);

   x25x_shuffle( hash0 );
   x25x_shuffle( hash1 );
   x25x_shuffle( hash2 );
   x25x_shuffle( hash3 );

-   intrlv_4x32_512( &vhashA[ 0], &hash0[ 0], &hash1[ 0], &hash2[ 0], &hash3[ 0] );
-   intrlv_4x32_512( &vhashA[ 1], &hash0[ 1], &hash1[ 1], &hash2[ 1], &hash3[ 1] );
-   intrlv_4x32_512( &vhashA[ 2], &hash0[ 2], &hash1[ 2], &hash2[ 2], &hash3[ 2] );
-   intrlv_4x32_512( &vhashA[ 3], &hash0[ 3], &hash1[ 3], &hash2[ 3], &hash3[ 3] );
-   intrlv_4x32_512( &vhashA[ 4], &hash0[ 4], &hash1[ 4], &hash2[ 4], &hash3[ 4] );
-   intrlv_4x32_512( &vhashA[ 5], &hash0[ 5], &hash1[ 5], &hash2[ 5], &hash3[ 5] );
-   intrlv_4x32_512( &vhashA[ 6], &hash0[ 6], &hash1[ 6], &hash2[ 6], &hash3[ 6] );
-   intrlv_4x32_512( &vhashA[ 7], &hash0[ 7], &hash1[ 7], &hash2[ 7], &hash3[ 7] );
-   intrlv_4x32_512( &vhashA[ 8], &hash0[ 8], &hash1[ 8], &hash2[ 8], &hash3[ 8] );
-   intrlv_4x32_512( &vhashA[ 9], &hash0[ 9], &hash1[ 9], &hash2[ 9], &hash3[ 9] );
-   intrlv_4x32_512( &vhashA[10], &hash0[10], &hash1[10], &hash2[10], &hash3[10] );
-   intrlv_4x32_512( &vhashA[11], &hash0[11], &hash1[11], &hash2[11], &hash3[11] );
-   intrlv_4x32_512( &vhashA[12], &hash0[12], &hash1[12], &hash2[12], &hash3[12] );
-   intrlv_4x32_512( &vhashA[13], &hash0[13], &hash1[13], &hash2[13], &hash3[13] );
-   intrlv_4x32_512( &vhashA[14], &hash0[14], &hash1[14], &hash2[14], &hash3[14] );
-   intrlv_4x32_512( &vhashA[15], &hash0[15], &hash1[15], &hash2[15], &hash3[15] );
-   intrlv_4x32_512( &vhashA[16], &hash0[16], &hash1[16], &hash2[16], &hash3[16] );
-   intrlv_4x32_512( &vhashA[17], &hash0[17], &hash1[17], &hash2[17], &hash3[17] );
-   intrlv_4x32_512( &vhashA[18], &hash0[18], &hash1[18], &hash2[18], &hash3[18] );
-   intrlv_4x32_512( &vhashA[19], &hash0[19], &hash1[19], &hash2[19], &hash3[19] );
-   intrlv_4x32_512( &vhashA[20], &hash0[20], &hash1[20], &hash2[20], &hash3[20] );
-   intrlv_4x32_512( &vhashA[21], &hash0[21], &hash1[21], &hash2[21], &hash3[21] );
-   intrlv_4x32_512( &vhashA[22], &hash0[22], &hash1[22], &hash2[22], &hash3[22] );
-   intrlv_4x32_512( &vhashA[23], &hash0[23], &hash1[23], &hash2[23], &hash3[23] );
+   intrlv_4x32_512( vhashX[ 0], hash0[ 0], hash1[ 0], hash2[ 0], hash3[ 0] );
+   intrlv_4x32_512( vhashX[ 1], hash0[ 1], hash1[ 1], hash2[ 1], hash3[ 1] );
+   intrlv_4x32_512( vhashX[ 2], hash0[ 2], hash1[ 2], hash2[ 2], hash3[ 2] );
+   intrlv_4x32_512( vhashX[ 3], hash0[ 3], hash1[ 3], hash2[ 3], hash3[ 3] );
+   intrlv_4x32_512( vhashX[ 4], hash0[ 4], hash1[ 4], hash2[ 4], hash3[ 4] );
+   intrlv_4x32_512( vhashX[ 5], hash0[ 5], hash1[ 5], hash2[ 5], hash3[ 5] );
+   intrlv_4x32_512( vhashX[ 6], hash0[ 6], hash1[ 6], hash2[ 6], hash3[ 6] );
+   intrlv_4x32_512( vhashX[ 7], hash0[ 7], hash1[ 7], hash2[ 7], hash3[ 7] );
+   intrlv_4x32_512( vhashX[ 8], hash0[ 8], hash1[ 8], hash2[ 8], hash3[ 8] );
+   intrlv_4x32_512( vhashX[ 9], hash0[ 9], hash1[ 9], hash2[ 9], hash3[ 9] );
+   intrlv_4x32_512( vhashX[10], hash0[10], hash1[10], hash2[10], hash3[10] );
+   intrlv_4x32_512( vhashX[11], hash0[11], hash1[11], hash2[11], hash3[11] );
+   intrlv_4x32_512( vhashX[12], hash0[12], hash1[12], hash2[12], hash3[12] );
+   intrlv_4x32_512( vhashX[13], hash0[13], hash1[13], hash2[13], hash3[13] );
+   intrlv_4x32_512( vhashX[14], hash0[14], hash1[14], hash2[14], hash3[14] );
+   intrlv_4x32_512( vhashX[15], hash0[15], hash1[15], hash2[15], hash3[15] );
+   intrlv_4x32_512( vhashX[16], hash0[16], hash1[16], hash2[16], hash3[16] );
+   intrlv_4x32_512( vhashX[17], hash0[17], hash1[17], hash2[17], hash3[17] );
+   intrlv_4x32_512( vhashX[18], hash0[18], hash1[18], hash2[18], hash3[18] );
+   intrlv_4x32_512( vhashX[19], hash0[19], hash1[19], hash2[19], hash3[19] );
+   intrlv_4x32_512( vhashX[20], hash0[20], hash1[20], hash2[20], hash3[20] );
+   intrlv_4x32_512( vhashX[21], hash0[21], hash1[21], hash2[21], hash3[21] );
+   intrlv_4x32_512( vhashX[22], hash0[22], hash1[22], hash2[22], hash3[22] );
+   intrlv_4x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23] );

   blake2s_4way_init( &ctx.blake2s, 32 );
-   blake2s_4way_full_blocks( &ctx.blake2s, vhash, vhashA, 64*24 );
-
-   dintrlv_4x32( &hash0[24], &hash1[24], &hash2[24], &hash3[24], vhash, 256 );
+   blake2s_4way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
+/*
+   dintrlv_4x32( hash0[24], hash1[24], hash2[24], hash3[24], vhash, 256 );
     
-	memcpy(output,    &hash0[24], 32);
-   memcpy(output+32, &hash1[24], 32);
-   memcpy(output+64, &hash2[24], 32);
-   memcpy(output+96, &hash3[24], 32);
+	memcpy(output,    hash0[24], 32);
+   memcpy(output+32, hash1[24], 32);
+   memcpy(output+64, hash2[24], 32);
+   memcpy(output+96, hash3[24], 32);
+*/
 }

 int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
@@ -365,11 +371,14 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
 {
   uint32_t hash[4*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 4;
   const int thr_id = mythr->id;
   const uint32_t Htarg = ptarget[7];

@@ -385,6 +394,16 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
      x25x_4way_hash( hash, vdata );

+      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
+      {
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+              }
+      }
+/*
      for ( int i = 0; i < 4; i++ )
      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
@@ -392,10 +411,11 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
         pdata[19] = n+i;
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
+*/
      n += 4;
-   } while ( likely( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ) );
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );

-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }