v3.10.2

2026-07-14 19:06:50 +00:00 · 2019-12-09 15:59:02 -05:00
parent 73430b13b1
commit a17ff6f189
48 changed files with 3561 additions and 1367 deletions
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -127,7 +127,7 @@ typedef struct {

 typedef blake_4way_big_context blake512_4way_context;

-void blake512_4way_init( void *cc );
+void blake512_4way_init( blake_4way_big_context *sc );
 void blake512_4way_update( void *cc, const void *data, size_t len );
 #define blake512_4way blake512_4way_update
 void blake512_4way_close( void *cc, void *dst );
@@ -136,6 +136,37 @@ void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+//Blake-256 16 way
+
+typedef struct {   // context shared by the blake256*_16way init/update/close functions below
+   __m512i buf[16];   // input block being filled; indexed with ptr>>2 by the update code
+   __m512i H[8];   // chaining value; byte-swapped into dst when the hash is closed
+   size_t ptr;   // bytes buffered so far (0..64, four per buf entry); zeroed by init
+   uint32_t T0, T1;   // message bit counter: T0 gains 512 per compressed block, T1 takes the carry
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
+} blake_16way_small_context __attribute__ ((aligned (128)));
+
+// Default 14 rounds
+typedef blake_16way_small_context blake256_16way_context;
+void blake256_16way_init(void *cc);
+void blake256_16way_update(void *cc, const void *data, size_t len);
+void blake256_16way_close(void *cc, void *dst);
+
+// 14 rounds, blake, decred
+typedef blake_16way_small_context blake256r14_16way_context;
+void blake256r14_16way_init(void *cc);
+void blake256r14_16way_update(void *cc, const void *data, size_t len);
+void blake256r14_16way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_16way_small_context blake256r8_16way_context;
+void blake256r8_16way_init(void *cc);
+void blake256r8_16way_update(void *cc, const void *data, size_t len);
+void blake256r8_16way_close(void *cc, void *dst);
+
+
+// Blake-512 8 way
+
 typedef struct {
   __m512i buf[16];
   __m512i H[8];
@@ -146,7 +177,7 @@ typedef struct {

 typedef blake_8way_big_context blake512_8way_context;

-void blake512_8way_init( void *cc );
+void blake512_8way_init( blake_8way_big_context *sc );
 void blake512_8way_update( void *cc, const void *data, size_t len );
 void blake512_8way_close( void *cc, void *dst );
 void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -680,6 +680,144 @@ do { \
 } while (0)


+#endif
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// Blake-256 16 way AVX512
+
+#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) /* one G step; a,b,c,d are updated in place, per 32-bit element */ \
+do { \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), /* a += b + (m0 ^ c1): constant index is crossed */ \
+                         _mm512_xor_si512( _mm512_set1_epi32( c1 ), m0 ) ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), /* a += b + (m1 ^ c0): second half uses the other pair */ \
+                         _mm512_xor_si512( _mm512_set1_epi32( c0 ), m1 ) ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
+} while (0)
+
+#define ROUND_S_16WAY(r)   do { \
+        GS_16WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
+        GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
+        GS_16WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
+        GS_16WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
+        GS_16WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
+        GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
+        GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
+        GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
+} while (0)
+
+#define DECL_STATE32_16WAY \
+   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
+   sph_u32 T0, T1;
+
+#define READ_STATE32_16WAY(state) \
+do { \
+   H0 = (state)->H[0]; \
+   H1 = (state)->H[1]; \
+   H2 = (state)->H[2]; \
+   H3 = (state)->H[3]; \
+   H4 = (state)->H[4]; \
+   H5 = (state)->H[5]; \
+   H6 = (state)->H[6]; \
+   H7 = (state)->H[7]; \
+   T0 = (state)->T0; \
+   T1 = (state)->T1; \
+} while (0)
+
+#define WRITE_STATE32_16WAY(state) \
+do { \
+   (state)->H[0] = H0; \
+   (state)->H[1] = H1; \
+   (state)->H[2] = H2; \
+   (state)->H[3] = H3; \
+   (state)->H[4] = H4; \
+   (state)->H[5] = H5; \
+   (state)->H[6] = H6; \
+   (state)->H[7] = H7; \
+   (state)->T0 = T0; \
+   (state)->T1 = T1; \
+} while (0)
+
+#define COMPRESS32_16WAY( rounds ) \
+do { \
+   __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
+   __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
+   __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
+   __m512i V8, V9, VA, VB, VC, VD, VE, VF; \
+   __m512i shuf_bswap32; \
+   V0 = H0; \
+   V1 = H1; \
+   V2 = H2; \
+   V3 = H3; \
+   V4 = H4; \
+   V5 = H5; \
+   V6 = H6; \
+   V7 = H7; \
+   V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
+   V9 = m512_const1_64( 0x85A308D385A308D3 ); \
+   VA = m512_const1_64( 0x13198A2E13198A2E ); \
+   VB = m512_const1_64( 0x0370734403707344 ); \
+   VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
+                              m512_const1_64( 0xA4093822A4093822 ) ); \
+   VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
+                              m512_const1_64( 0x299F31D0299F31D0 ) ); \
+   VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
+                              m512_const1_64( 0x082EFA98082EFA98 ) ); \
+   VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
+                              m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
+   shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
+                                 0x2c2d2e2f28292a2b, 0x2425262720212223, \
+                                 0x1c1d1e1f18191a1b, 0x1415161710111213, \
+                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+   M0 = _mm512_shuffle_epi8( * buf    , shuf_bswap32 ); \
+   M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
+   M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
+   M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
+   M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
+   M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
+   M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
+   M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
+   M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
+   M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
+   MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
+   MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
+   MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
+   MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
+   ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
+   MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
+   ROUND_S_16WAY(0); \
+   ROUND_S_16WAY(1); \
+   ROUND_S_16WAY(2); \
+   ROUND_S_16WAY(3); \
+   ROUND_S_16WAY(4); \
+   ROUND_S_16WAY(5); \
+   ROUND_S_16WAY(6); \
+   ROUND_S_16WAY(7); \
+   if (rounds == 14) \
+   { \
+      ROUND_S_16WAY(8); \
+      ROUND_S_16WAY(9); \
+      ROUND_S_16WAY(0); \
+      ROUND_S_16WAY(1); \
+      ROUND_S_16WAY(2); \
+      ROUND_S_16WAY(3); \
+   } \
+   H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
+   H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
+   H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
+   H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
+   H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
+   H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
+   H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
+   H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
+} while (0)
+
 #endif

 // Blake-256 4 way
@@ -916,6 +1054,179 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

 #endif

+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+//Blake-256 16 way AVX512
+
+static void
+blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
+                   const sph_u32 *salt, int rounds )
+{   // Resets sc for a new hash. iv and salt are never read here: H is set from the literals below.
+   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E6676A09E667 );   // each literal is one 32-bit word written twice
+   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE85BB67AE85 );
+   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF3723C6EF372 );
+   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53AA54FF53A );
+   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527F510E527F );
+   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C9B05688C );
+   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9AB1F83D9AB );
+   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD195BE0CD19 );
+   sc->T0 = sc->T1 = 0;   // bit counter starts at zero
+   sc->ptr = 0;   // nothing buffered yet
+   sc->rounds = rounds;   // passed to COMPRESS32_16WAY by blake32_16way
+}
+
+static void
+blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
+{   // Buffers len bytes-per-lane of data into sc->buf and compresses every time the 64-byte block fills.
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf;   // name is required: COMPRESS32_16WAY reads its message words from buf
+   size_t ptr;
+   const int buf_size = 64;   // number of elements, sizeof/4
+   DECL_STATE32_16WAY
+   buf = sc->buf;
+   ptr = sc->ptr;
+   if ( len < buf_size - ptr )   // input does not complete a block: store it and return without compressing
+   {
+        memcpy_512( buf + (ptr>>2), vdata, len>>2 );   // NOTE(review): len>>2 drops any remainder; presumably len is always a multiple of 4, confirm with callers
+        ptr += len;
+        sc->ptr = ptr;
+        return;
+   }
+   READ_STATE32_16WAY(sc);   // copy H[0..7], T0, T1 into the locals declared by DECL_STATE32_16WAY
+   while ( len > 0 )
+   {
+      size_t clen;
+
+      clen = buf_size - ptr;   // room left in the current block
+      if (clen > len)
+           clen = len;
+      memcpy_512( buf + (ptr>>2), vdata, clen>>2 );
+      ptr += clen;
+      vdata += (clen>>2);
+      len -= clen;
+      if ( ptr == buf_size )   // block is full
+      {
+          if ( ( T0 = T0 + 512 ) < 512 )   // add 512 bits; a result below 512 means T0 wrapped
+                T1 = T1 + 1;
+          COMPRESS32_16WAY( sc->rounds );
+          ptr = 0;
+      }
+   }
+   WRITE_STATE32_16WAY(sc);   // store the updated H and counters back into sc
+   sc->ptr = ptr;
+}
+
+static void
+blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
+                    void *dst, size_t out_size_w32 )
+{   // Pads the buffered data, runs the final compression(s) and writes the byte-swapped H to dst. ub and n are not read.
+   __m512i buf[16];
+   size_t ptr;
+   unsigned bit_len;
+   sph_u32 th, tl;
+
+   ptr = sc->ptr;
+   bit_len = ((unsigned)ptr << 3);   // buffered bytes converted to bits
+   buf[ptr>>2] = m512_const1_64( 0x0000008000000080ULL );   // first padding word: 0x00000080 in both 32-bit halves
+   tl = sc->T0 + bit_len;   // length words are captured before sc->T0 is adjusted below
+   th = sc->T1;
+
+   if ( ptr == 0 )
+   {
+        sc->T0 = 0xFFFFFE00UL;   // the +512 in blake32_16way wraps this to 0 and carries T1 from 0xFFFFFFFF to 0
+        sc->T1 = 0xFFFFFFFFUL;
+   }
+   else if ( sc->T0 == 0 )
+   {
+        sc->T0 = 0xFFFFFE00UL + bit_len;   // after the +512 and its carry the counter is (T1, bit_len)
+        sc->T1 = sc->T1 - 1;
+   }
+   else
+        sc->T0 -= 512 - bit_len;   // after the +512 the counter is the old T0 plus bit_len
+
+   if ( ptr <= 52 )   // entries 13..15 are still free, so padding and length fit in this block
+   {
+       memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
+       if ( out_size_w32 == 8 )   // only the 8-word output size adds this marker to entry 13
+           buf[52>>2] = _mm512_or_si512( buf[52>>2],
+                                m512_const1_64( 0x0100000001000000ULL ) );
+       buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );   // entry 14; the unary + has no effect
+       buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );   // entry 15
+       blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
+   }
+   else
+   {
+        memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );   // no room for the length: finish this block with zeros
+        blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
+        sc->T0 = 0xFFFFFE00UL;   // same pre-bias as the ptr == 0 case, for the extra padding-only block
+        sc->T1 = 0xFFFFFFFFUL;
+        memset_zero_512( buf, 56>>2 );
+       if ( out_size_w32 == 8 )
+           buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
+        buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
+        buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
+        blake32_16way( sc, buf, 64 );
+   }
+   mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );   // output is taken from sc->H
+}
+
+void
+blake256_16way_init(void *cc)
+{   // Default variant: 14 rounds, same arguments as blake256r14_16way_init.
+   blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );   // IV256 and the salt table are not read by blake32_16way_init
+}
+
+void
+blake256_16way_update(void *cc, const void *data, size_t len)
+{
+        blake32_16way(cc, data, len);
+}
+
+void
+blake256_16way_close(void *cc, void *dst)   // finalizes the default 16-way Blake-256 hash into dst; name matches the prototype in blake-hash-4way.h
+{
+        blake32_16way_close(cc, 0, 0, dst, 8);   // ub and n are unused by blake32_16way_close; 8 selects the out_size_w32 == 8 padding marker
+}
+
+void blake256r14_16way_init(void *cc)
+{
+   blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
+}
+
+void
+blake256r14_16way_update(void *cc, const void *data, size_t len)
+{
+   blake32_16way(cc, data, len);
+}
+
+void
+blake256r14_16way_close(void *cc, void *dst)
+{
+   blake32_16way_close(cc, 0, 0, dst, 8);
+}
+
+void blake256r8_16way_init(void *cc)
+{
+   blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
+}
+
+void
+blake256r8_16way_update(void *cc, const void *data, size_t len)
+{
+   blake32_16way(cc, data, len);
+}
+
+void
+blake256r8_16way_close(void *cc, void *dst)
+{
+   blake32_16way_close(cc, 0, 0, dst, 8);
+}
+
+#endif // AVX512
+
+
+
 // Blake-256 4 way

 // default 14 rounds, backward copatibility
--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -42,20 +42,13 @@
 extern "C"{
 #endif

-#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
-#define SPH_SMALL_FOOTPRINT_BLAKE   1
-#endif
-
-#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
-#define SPH_COMPACT_BLAKE_64   1
-#endif
-
 #ifdef _MSC_VER
 #pragma warning (disable: 4146)
 #endif

-// Blake-512
-
+// Blake-512 common
+   
+/*
 static const sph_u64 IV512[8] = {
 	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
 	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
@@ -65,10 +58,6 @@ static const sph_u64 IV512[8] = {

 static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };

-#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
-
-// Blake-256 4 & 8 way, Blake-512 4 way
-
 static const unsigned sigma[16][16] = {
 	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
 	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
@@ -88,7 +77,17 @@ static const unsigned sigma[16][16] = {
 	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
 };

-#endif
+static const sph_u64 CB[16] = {
+   SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
+   SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
+   SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
+   SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
+   SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
+   SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
+   SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
+   SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
+};
+*/

 #define Z00   0
 #define Z01   1
@@ -285,23 +284,6 @@ static const unsigned sigma[16][16] = {
 #define CBE   SPH_C64(0x0801F2E2858EFC16)
 #define CBF   SPH_C64(0x636920D871574E69)

-/*
-#if SPH_COMPACT_BLAKE_64
-// not used
-static const sph_u64 CB[16] = {
-	SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
-	SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
-	SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
-	SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
-	SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
-	SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
-	SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
-	SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
-};
-
-#endif
-*/
-
 #define READ_STATE64(state)   do { \
      H0 = (state)->H[0]; \
      H1 = (state)->H[1]; \
@@ -338,7 +320,7 @@ static const sph_u64 CB[16] = {

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-// Blake-512 8 way
+// Blake-512 8 way AVX512

 #define GB_8WAY(m0, m1, c0, c1, a, b, c, d)   do { \
   a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
@@ -364,7 +346,6 @@ static const sph_u64 CB[16] = {
   GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
   } while (0)

-
 #define DECL_STATE64_8WAY \
   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
        __m512i S0, S1, S2, S3; \
@@ -443,9 +424,7 @@ static const sph_u64 CB[16] = {
  H7 = mm512_xor4( VF, V7, S3, H7 ); \
 } while (0)

-static void
-blake64_8way_init( blake_8way_big_context *sc, const sph_u64 *iv,
-              const sph_u64 *salt )
+void blake512_8way_init( blake_8way_big_context *sc )
 {
   __m512i zero = m512_zero;
   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
@@ -511,20 +490,20 @@ blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )
 }

 static void
-blake64_8way_close( blake_8way_big_context *sc,
-   unsigned ub, unsigned n, void *dst, size_t out_size_w64)
+blake64_8way_close( blake_8way_big_context *sc, void *dst )
 {
   __m512i buf[16];
   size_t ptr;
   unsigned bit_len;
-   uint64_t z, zz;
+//   uint64_t z, zz;
   sph_u64 th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   z = 0x80 >> n;
-   zz = ((ub & -z) | z) & 0xFF;
-   buf[ptr>>3] = _mm512_set1_epi64( zz );
+//   z = 0x80 >> n;
+//   zz = ((ub & -z) | z) & 0xFF;
+//   buf[ptr>>3] = _mm512_set1_epi64( zz );
+   buf[ptr>>3] = m512_const1_64( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;
   if (ptr == 0 )
@@ -544,11 +523,10 @@ blake64_8way_close( blake_8way_big_context *sc,
   if ( ptr <= 104 )
   {
       memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
-       if ( out_size_w64 == 8 )
-          buf[(104>>3)] = _mm512_or_si512( buf[(104>>3)],
+       buf[104>>3] = _mm512_or_si512( buf[104>>3],
                                 m512_const1_64( 0x0100000000000000ULL ) );
-       *(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
-       *(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
+       buf[112>>3] = m512_const1_64( bswap_64( th ) );
+       buf[120>>3] = m512_const1_64( bswap_64( tl ) );

       blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
   }
@@ -560,22 +538,15 @@ blake64_8way_close( blake_8way_big_context *sc,
       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
       memset_zero_512( buf, 112>>3 );
-       if ( out_size_w64 == 8 )
-           buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
-       *(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
-       *(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
+       buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
+       buf[112>>3] = m512_const1_64( bswap_64( th ) );
+       buf[120>>3] = m512_const1_64( bswap_64( tl ) );

       blake64_8way( sc, buf, 128 );
   }
   mm512_block_bswap_64( (__m512i*)dst, sc->H );
 }

-void
-blake512_8way_init(void *cc)
-{
-   blake64_8way_init(cc, IV512, salt_zero_big);
-}
-
 void
 blake512_8way_update(void *cc, const void *data, size_t len)
 {
@@ -591,7 +562,7 @@ blake512_8way_close(void *cc, void *dst)
 void
 blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
-   blake64_8way_close(cc, ub, n, dst, 8);
+   blake64_8way_close(cc, dst);   // NOTE(review): ub and n are now ignored; blake64_8way_close always starts padding with 0x80
 }

 #endif  // AVX512
@@ -698,11 +669,8 @@ blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
  H7 = mm256_xor4( VF, V7, S3, H7 ); \
 } while (0)

-//static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };

-static void
-blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
-              const sph_u64 *salt )
+void blake512_4way_init( blake_4way_big_context *sc )
 {
   __m256i zero = m256_zero;
   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
@@ -713,12 +681,10 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
-
   casti_m256i( sc->S, 0 ) = zero;
   casti_m256i( sc->S, 1 ) = zero;
   casti_m256i( sc->S, 2 ) = zero;
   casti_m256i( sc->S, 3 ) = zero;
-
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
 }
@@ -768,20 +734,16 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
 }

 static void
-blake64_4way_close( blake_4way_big_context *sc,
-	unsigned ub, unsigned n, void *dst, size_t out_size_w64)
+blake64_4way_close( blake_4way_big_context *sc, void *dst )
 {
   __m256i buf[16];
   size_t ptr;
   unsigned bit_len;
-   uint64_t z, zz;
   sph_u64 th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   z = 0x80 >> n;
-   zz = ((ub & -z) | z) & 0xFF;
-   buf[ptr>>3] = _mm256_set1_epi64x( zz );
+   buf[ptr>>3] = m256_const1_64( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;
   if (ptr == 0 )
@@ -798,40 +760,41 @@ blake64_4way_close( blake_4way_big_context *sc,
   {
        sc->T0 -= 1024 - bit_len;
   }
+
   if ( ptr <= 104 )
   {
       memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
-       if ( out_size_w64 == 8 )
-          buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
+       buf[104>>3] = _mm256_or_si256( buf[104>>3],
                                 m256_const1_64( 0x0100000000000000ULL ) );
-       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
-       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
+       buf[112>>3] = m256_const1_64( bswap_64( th ) );
+       buf[120>>3] = m256_const1_64( bswap_64( tl ) );

       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
   }
   else
-  {
+   {
       memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
       memset_zero_256( buf, 112>>3 ); 
-       if ( out_size_w64 == 8 )
-           buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
-       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
-       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
+       buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
+       buf[112>>3] = m256_const1_64( bswap_64( th ) );
+       buf[120>>3] = m256_const1_64( bswap_64( tl ) );

       blake64_4way( sc, buf, 128 );
   }
   mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }

+/*
 void
 blake512_4way_init(void *cc)
 {
 	blake64_4way_init(cc, IV512, salt_zero_big);
 }
+*/

 void
 blake512_4way_update(void *cc, const void *data, size_t len)
@@ -842,15 +805,18 @@ blake512_4way_update(void *cc, const void *data, size_t len)
 void
 blake512_4way_close(void *cc, void *dst)
 {
-	blake512_4way_addbits_and_close(cc, 0, 0, dst);
+   blake64_4way_close( cc, dst );
+
+//   blake512_4way_addbits_and_close(cc, dst);
 }

+/*
 void
 blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
 	blake64_4way_close(cc, ub, n, dst, 8);
 }
-
+*/
 #ifdef __cplusplus
 }
 #endif
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -64,7 +64,8 @@ typedef bmw_4way_small_context bmw256_4way_context;

 void bmw256_4way_init( bmw256_4way_context *ctx );

-void bmw256_4way(void *cc, const void *data, size_t len);
+void bmw256_4way_update(void *cc, const void *data, size_t len);
+#define bmw256_4way bmw256_4way_update

 void bmw256_4way_close(void *cc, void *dst);

@@ -87,11 +88,33 @@ typedef struct {
 typedef bmw_8way_small_context bmw256_8way_context;

 void bmw256_8way_init( bmw256_8way_context *ctx );
-void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len );
+void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
+                         size_t len );
+#define bmw256_8way bmw256_8way_update
 void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );

 #endif

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// BMW-256 16 way 32
+
+typedef struct {
+   __m512i buf[16];
+   __m512i H[16];
+   size_t ptr;
+   uint32_t bit_count;  // assume bit_count fits in 32 bits
+} bmw_16way_small_context __attribute__ ((aligned (128)));
+
+typedef bmw_16way_small_context bmw256_16way_context;
+
+void bmw256_16way_init( bmw256_16way_context *ctx );
+void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
+                          size_t len );
+void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
+
+#endif
+

 #if defined(__SSE2__)

--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -564,7 +564,7 @@ bmw256_4way_init(void *cc)
 */

 void
-bmw256_4way(void *cc, const void *data, size_t len)
+bmw256_4way_update(void *cc, const void *data, size_t len)
 {
 	bmw32_4way(cc, data, len);
 }
@@ -1014,7 +1014,8 @@ void bmw256_8way_init( bmw256_8way_context *ctx )
   ctx->bit_count = 0;
 }

-void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
+void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
+                         size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   __m256i *buf;
@@ -1092,6 +1093,513 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )

 #endif // __AVX2__

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// BMW-256 16 way 32
+
+
+#define s16s0(x) \
+   mm512_xor4( _mm512_srli_epi32( (x), 1), \
+                _mm512_slli_epi32( (x), 3), \
+                mm512_rol_32( (x),  4), \
+                mm512_rol_32( (x), 19) )
+
+#define s16s1(x) \
+   mm512_xor4( _mm512_srli_epi32( (x), 1), \
+                _mm512_slli_epi32( (x), 2), \
+                mm512_rol_32( (x), 8), \
+                mm512_rol_32( (x), 23) )
+
+#define s16s2(x) \
+   mm512_xor4( _mm512_srli_epi32( (x), 2), \
+               _mm512_slli_epi32( (x), 1), \
+               mm512_rol_32( (x), 12), \
+               mm512_rol_32( (x), 25) )
+
+#define s16s3(x) \
+   mm512_xor4( _mm512_srli_epi32( (x), 2), \
+               _mm512_slli_epi32( (x), 2), \
+               mm512_rol_32( (x), 15), \
+               mm512_rol_32( (x), 29) )
+
+#define s16s4(x) \
+  _mm512_xor_si512( (x), _mm512_srli_epi32( (x), 1 ) )
+
+#define s16s5(x) \
+  _mm512_xor_si512( (x), _mm512_srli_epi32( (x), 2 ) )
+
+#define r16s1(x)    mm512_rol_32( x,  3 ) 
+#define r16s2(x)    mm512_rol_32( x,  7 ) 
+#define r16s3(x)    mm512_rol_32( x, 13 ) 
+#define r16s4(x)    mm512_rol_32( x, 16 ) 
+#define r16s5(x)    mm512_rol_32( x, 19 ) 
+#define r16s6(x)    mm512_rol_32( x, 23 ) 
+#define r16s7(x)    mm512_rol_32( x, 27 ) 
+
+#define mm512_rol_off_32( M, j, off ) \
+   mm512_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
+                  ( ( (j) + (off) ) & 0xF ) + 1 )
+
+#define add_elt_s16( M, H, j ) \
+   _mm512_xor_si512( \
+      _mm512_add_epi32( \
+            _mm512_sub_epi32( _mm512_add_epi32( mm512_rol_off_32( M, j, 0 ), \
+                                                mm512_rol_off_32( M, j, 3 ) ), \
+                             mm512_rol_off_32( M, j, 10 ) ), \
+            _mm512_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \
+       H[ ( (j)+7 ) & 0xF ] )
+
+#define expand1s16( qt, M, H, i ) \
+   _mm512_add_epi32( add_elt_s16( M, H, (i)-16 ), \
+                     mm512_add4_32( mm512_add4_32( s16s1( qt[ (i)-16 ] ), \
+                                                   s16s2( qt[ (i)-15 ] ), \
+                                                   s16s3( qt[ (i)-14 ] ), \
+                                                   s16s0( qt[ (i)-13 ] ) ), \
+                                    mm512_add4_32( s16s1( qt[ (i)-12 ] ), \
+                                                   s16s2( qt[ (i)-11 ] ), \
+                                                   s16s3( qt[ (i)-10 ] ), \
+                                                   s16s0( qt[ (i)- 9 ] ) ), \
+                                    mm512_add4_32( s16s1( qt[ (i)- 8 ] ), \
+                                                   s16s2( qt[ (i)- 7 ] ), \
+                                                   s16s3( qt[ (i)- 6 ] ), \
+                                                   s16s0( qt[ (i)- 5 ] ) ), \
+                                    mm512_add4_32( s16s1( qt[ (i)- 4 ] ), \
+                                                   s16s2( qt[ (i)- 3 ] ), \
+                                                   s16s3( qt[ (i)- 2 ] ), \
+                                                   s16s0( qt[ (i)- 1 ] ) ) ) )
+
+#define expand2s16( qt, M, H, i) \
+   _mm512_add_epi32( add_elt_s16( M, H, (i)-16 ), \
+      mm512_add4_32( mm512_add4_32( qt[ (i)-16 ], \
+                                    r16s1( qt[ (i)-15 ] ), \
+                                    qt[ (i)-14 ], \
+                                    r16s2( qt[ (i)-13 ] ) ), \
+                     mm512_add4_32( qt[ (i)-12 ], \
+                                    r16s3( qt[ (i)-11 ] ), \
+                                    qt[ (i)-10 ], \
+                                    r16s4( qt[ (i)- 9 ] ) ), \
+                     mm512_add4_32( qt[ (i)- 8 ], \
+                                    r16s5( qt[ (i)- 7 ] ), \
+                                    qt[ (i)- 6 ], \
+                                    r16s6( qt[ (i)- 5 ] ) ), \
+                     mm512_add4_32( qt[ (i)- 4 ], \
+                                    r16s7( qt[ (i)- 3 ] ), \
+                                    s16s4( qt[ (i)- 2 ] ), \
+                                    s16s5( qt[ (i)- 1 ] ) ) ) )
+
+
+#define W16s0 \
+   _mm512_add_epi32( \
+      _mm512_add_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 5], H[ 5] ), \
+                           _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+         _mm512_xor_si512( M[10], H[10] ) ), \
+      _mm512_add_epi32( _mm512_xor_si512( M[13], H[13] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+#define W16s1 \
+   _mm512_add_epi32( \
+       _mm512_add_epi32( \
+          _mm512_sub_epi32( _mm512_xor_si512( M[ 6], H[ 6] ), \
+                            _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+          _mm512_xor_si512( M[11], H[11] ) ), \
+       _mm512_sub_epi32( _mm512_xor_si512( M[14], H[14] ), \
+                         _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W16s2 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_add_epi32( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+         _mm512_xor_si512( M[ 9], H[ 9] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+#define W16s3 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 1], H[ 1] ) ), \
+         _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[10], H[10] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+// W4 = (M1^H1) + (M2^H2) + (M9^H9) - (M11^H11) - (M14^H14), 32-bit lanes.
+// All W16s* macros expect `M` and `H` arrays to be in scope at the use site.
+#define W16s4 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_add_epi32( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+         _mm512_xor_si512( M[ 9], H[ 9] ) ), \
+      _mm512_add_epi32( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+// W5 = (M3^H3) - (M2^H2) + (M10^H10) - (M12^H12) + (M15^H15)
+#define W16s5 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 3], H[ 3] ), \
+                           _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+         _mm512_xor_si512( M[10], H[10] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+// W6 = (M4^H4) - (M0^H0) - (M3^H3) - (M11^H11) + (M13^H13)
+#define W16s6 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 4], H[ 4] ), \
+                           _mm512_xor_si512( M[ 0], H[ 0] ) ), \
+         _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+// W7 = (M1^H1) - (M4^H4) - (M5^H5) - (M12^H12) - (M14^H14)
+#define W16s7 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+         _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+      _mm512_add_epi32( _mm512_xor_si512( M[12], H[12] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+// W8 = (M2^H2) - (M5^H5) - (M6^H6) + (M13^H13) - (M15^H15)
+#define W16s8 \
+   _mm512_add_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 2], H[ 2] ), \
+                           _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[13], H[13] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+// W9 = (M0^H0) - (M3^H3) + (M6^H6) - (M7^H7) + (M14^H14)
+#define W16s9 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 0], H[ 0] ), \
+                           _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[ 7], H[ 7] ), \
+                        _mm512_xor_si512( M[14], H[14] ) ) )
+
+// W10 = (M8^H8) - (M1^H1) - (M4^H4) - (M7^H7) + (M15^H15)
+#define W16s10 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 8], H[ 8] ), \
+                           _mm512_xor_si512( M[ 1], H[ 1] ) ), \
+         _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[ 7], H[ 7] ), \
+                        _mm512_xor_si512( M[15], H[15] ) ) )
+
+// W11 = (M8^H8) - (M0^H0) - (M2^H2) - (M5^H5) + (M9^H9)
+#define W16s11 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 8], H[ 8] ), \
+                           _mm512_xor_si512( M[ 0], H[ 0] ) ), \
+         _mm512_xor_si512( M[ 2], H[ 2] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[ 5], H[ 5] ), \
+                        _mm512_xor_si512( M[ 9], H[ 9] ) ) )
+
+// W12 = (M1^H1) + (M3^H3) - (M6^H6) - (M9^H9) + (M10^H10)
+#define W16s12 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_add_epi32( _mm512_xor_si512( M[ 1], H[ 1] ), \
+                           _mm512_xor_si512( M[ 3], H[ 3] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[ 9], H[ 9] ), \
+                        _mm512_xor_si512( M[10], H[10] ) ) )
+
+// W13 = (M2^H2) + (M4^H4) + (M7^H7) + (M10^H10) + (M11^H11)
+#define W16s13 \
+   _mm512_add_epi32( \
+      _mm512_add_epi32( \
+         _mm512_add_epi32( _mm512_xor_si512( M[ 2], H[ 2] ), \
+                           _mm512_xor_si512( M[ 4], H[ 4] ) ), \
+         _mm512_xor_si512( M[ 7], H[ 7] ) ), \
+      _mm512_add_epi32( _mm512_xor_si512( M[10], H[10] ), \
+                        _mm512_xor_si512( M[11], H[11] ) ) )
+
+// W14 = (M3^H3) - (M5^H5) + (M8^H8) - (M11^H11) - (M12^H12)
+#define W16s14 \
+   _mm512_sub_epi32( \
+      _mm512_add_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[ 3], H[ 3] ), \
+                           _mm512_xor_si512( M[ 5], H[ 5] ) ), \
+         _mm512_xor_si512( M[ 8], H[ 8] ) ), \
+      _mm512_add_epi32( _mm512_xor_si512( M[11], H[11] ), \
+                        _mm512_xor_si512( M[12], H[12] ) ) )
+
+// W15 = (M12^H12) - (M4^H4) - (M6^H6) - (M9^H9) + (M13^H13)
+#define W16s15 \
+   _mm512_sub_epi32( \
+      _mm512_sub_epi32( \
+         _mm512_sub_epi32( _mm512_xor_si512( M[12], H[12] ), \
+                           _mm512_xor_si512( M[ 4], H[4] ) ), \
+         _mm512_xor_si512( M[ 6], H[ 6] ) ), \
+      _mm512_sub_epi32( _mm512_xor_si512( M[ 9], H[ 9] ), \
+                        _mm512_xor_si512( M[13], H[13] ) ) )
+
+// Compression function used by bmw256_16way_update/close.  All arithmetic is
+// done with 32-bit element intrinsics on 512-bit vectors, i.e. one 32-bit
+// word for each of 16 lanes per vector.
+//
+//   M  : 16-vector message block (read only)
+//   H  : 16-vector input chaining value (read only)
+//   dH : 16-vector output chaining value (written; must not alias H because
+//        H is still read by the expand steps while dH is being produced)
+void compress_small_16way( const __m512i *M, const __m512i H[16],
+                     __m512i dH[16] )
+{
+   __m512i qt[32], xl, xh;
+
+   // First 16 quad-pipe words: qt[i] = s16s{i mod 5}( W16s{i} ) + H[(i+1) mod 16].
+   // The W16s*/s16s* macros read M and H from this scope.
+   qt[ 0] = _mm512_add_epi32( s16s0( W16s0 ), H[ 1] );
+   qt[ 1] = _mm512_add_epi32( s16s1( W16s1 ), H[ 2] );
+   qt[ 2] = _mm512_add_epi32( s16s2( W16s2 ), H[ 3] );
+   qt[ 3] = _mm512_add_epi32( s16s3( W16s3 ), H[ 4] );
+   qt[ 4] = _mm512_add_epi32( s16s4( W16s4 ), H[ 5] );
+   qt[ 5] = _mm512_add_epi32( s16s0( W16s5 ), H[ 6] );
+   qt[ 6] = _mm512_add_epi32( s16s1( W16s6 ), H[ 7] );
+   qt[ 7] = _mm512_add_epi32( s16s2( W16s7 ), H[ 8] );
+   qt[ 8] = _mm512_add_epi32( s16s3( W16s8 ), H[ 9] );
+   qt[ 9] = _mm512_add_epi32( s16s4( W16s9 ), H[10] );
+   qt[10] = _mm512_add_epi32( s16s0( W16s10), H[11] );
+   qt[11] = _mm512_add_epi32( s16s1( W16s11), H[12] );
+   qt[12] = _mm512_add_epi32( s16s2( W16s12), H[13] );
+   qt[13] = _mm512_add_epi32( s16s3( W16s13), H[14] );
+   qt[14] = _mm512_add_epi32( s16s4( W16s14), H[15] );
+   qt[15] = _mm512_add_epi32( s16s0( W16s15), H[ 0] );
+   // Remaining 16 words: expand1s16 for indices 16-17, expand2s16 for 18-31
+   // (both defined elsewhere in this file).  Each call receives the qt array
+   // filled so far, so these must stay in ascending index order.
+   qt[16] = expand1s16( qt, M, H, 16 );
+   qt[17] = expand1s16( qt, M, H, 17 );
+   qt[18] = expand2s16( qt, M, H, 18 );
+   qt[19] = expand2s16( qt, M, H, 19 );
+   qt[20] = expand2s16( qt, M, H, 20 );
+   qt[21] = expand2s16( qt, M, H, 21 );
+   qt[22] = expand2s16( qt, M, H, 22 );
+   qt[23] = expand2s16( qt, M, H, 23 );
+   qt[24] = expand2s16( qt, M, H, 24 );
+   qt[25] = expand2s16( qt, M, H, 25 );
+   qt[26] = expand2s16( qt, M, H, 26 );
+   qt[27] = expand2s16( qt, M, H, 27 );
+   qt[28] = expand2s16( qt, M, H, 28 );
+   qt[29] = expand2s16( qt, M, H, 29 );
+   qt[30] = expand2s16( qt, M, H, 30 );
+   qt[31] = expand2s16( qt, M, H, 31 );
+
+   // xl combines qt[16..23]; xh combines xl with qt[24..31].
+   // mm512_xor4 is defined elsewhere; presumably the XOR of its four arguments.
+   xl = _mm512_xor_si512(
+              mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
+              mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
+   xh = _mm512_xor_si512( xl,  _mm512_xor_si512(
+                 mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
+                 mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+
+// DH1L: ( M[m] ^ (xh << sl) ^ (qt[a] >> sr) ) + ( xl ^ qt[b] ^ qt[c] )
+#define DH1L( m, sl, sr, a, b, c ) \
+   _mm512_add_epi32( \
+               _mm512_xor_si512( M[m], \
+                  _mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
+                                    _mm512_srli_epi32( qt[a], sr ) ) ), \
+               _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+
+// DH1R: ( M[m] ^ (xh >> sl) ^ (qt[a] << sr) ) + ( xl ^ qt[b] ^ qt[c] )
+// Note the parameter names are swapped relative to the shift directions:
+// `sl` is the right-shift count for xh and `sr` the left-shift count for qt[a].
+#define DH1R( m, sl, sr, a, b, c ) \
+   _mm512_add_epi32( \
+               _mm512_xor_si512( M[m], \
+                  _mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \
+                                    _mm512_slli_epi32( qt[a], sr ) ) ), \
+               _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+
+// DH2L: rol32( dH[h], rl ) + ( xh ^ qt[a] ^ M[m] ) + ( (xl << sl) ^ qt[b] ^ qt[c] )
+// The macro body ends with ';', so each use below expands to an extra empty
+// statement; it cannot be used inside a larger expression.
+#define DH2L( m, rl, sl, h, a, b, c ) \
+   _mm512_add_epi32( _mm512_add_epi32( \
+       mm512_rol_32( dH[h], rl ), \
+          _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
+                 _mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
+                                   _mm512_xor_si512( qt[b], qt[c] ) ) );
+
+// DH2R: same as DH2L but with xl shifted right by sr.  Also ends with ';'.
+#define DH2R( m, rl, sr, h, a, b, c ) \
+   _mm512_add_epi32( _mm512_add_epi32( \
+       mm512_rol_32( dH[h], rl ), \
+          _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
+                 _mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
+                                   _mm512_xor_si512( qt[b], qt[c] ) ) );
+
+   // dH[0..7] depend only on M, qt, xl, xh.
+   dH[ 0] = DH1L(  0,  5,  5, 16, 24, 0 );
+   dH[ 1] = DH1R(  1,  7,  8, 17, 25, 1 );
+   dH[ 2] = DH1R(  2,  5,  5, 18, 26, 2 );
+   dH[ 3] = DH1R(  3,  1,  5, 19, 27, 3 );
+   dH[ 4] = DH1R(  4,  3,  0, 20, 28, 4 );
+   dH[ 5] = DH1L(  5,  6,  6, 21, 29, 5 );
+   dH[ 6] = DH1R(  6,  4,  6, 22, 30, 6 );
+   dH[ 7] = DH1R(  7, 11,  2, 23, 31, 7 );
+   // dH[8..15] additionally read the already computed dH[4..7] and dH[0..3],
+   // so they must be evaluated after the first eight.
+   dH[ 8] = DH2L(  8,  9,  8,  4, 24, 23,  8 );
+   dH[ 9] = DH2R(  9, 10,  6,  5, 25, 16,  9 );
+   dH[10] = DH2L( 10, 11,  6,  6, 26, 17, 10 );
+   dH[11] = DH2L( 11, 12,  4,  7, 27, 18, 11 );
+   dH[12] = DH2R( 12, 13,  3,  0, 28, 19, 12 );
+   dH[13] = DH2R( 13, 14,  4,  1, 29, 20, 13 );
+   dH[14] = DH2R( 14, 15,  7,  2, 30, 21, 14 );
+   dH[15] = DH2R( 15, 16,  2,  3, 31, 22, 15 );
+
+// The DH* helpers are local to this function body.
+#undef DH1L
+#undef DH1R
+#undef DH2L
+#undef DH2R
+
+}
+
+// Constant chaining value passed as the H argument of the last
+// compress_small_16way() call in bmw256_16way_close().
+// Entry i has every 32-bit word equal to 0xaaaaaaa0 + i; each initializer
+// lists eight 64-bit elements, each holding that word twice.
+static const __m512i final_s16[16] =
+{
+    { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0,
+      0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0,
+      0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0,
+      0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
+    { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1,
+      0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1,
+      0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1,
+      0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
+    { 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2,
+      0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2,
+      0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2,
+      0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
+    { 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3,
+      0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3,
+      0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3,
+      0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
+    { 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4,
+      0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4,
+      0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4,
+      0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
+    { 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5,
+      0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5,
+      0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5,
+      0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
+    { 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6,
+      0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6,
+      0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6,
+      0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
+    { 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7,
+      0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7,
+      0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7,
+      0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
+    { 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8,
+      0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8,
+      0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8,
+      0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
+    { 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9,
+      0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9,
+      0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9,
+      0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
+    { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+      0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+      0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+      0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
+    { 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab,
+      0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab,
+      0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab,
+      0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
+    { 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac,
+      0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac,
+      0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac,
+      0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
+    { 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad,
+      0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad,
+      0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad,
+      0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
+    { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae,
+      0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae,
+      0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae,
+      0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
+    { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf,
+      0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf,
+      0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf,
+      0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
+};
+
+
+// Reset a 16-way BMW-256 context: load the initial chaining value into
+// ctx->H and clear the buffer offset and the running bit count.
+// Each 64-bit constant repeats one 32-bit word twice; the words run
+// 0x40414243, 0x44454647, ... 0x7C7D7E7F for H[0]..H[15].
+// m512_const1_64 is defined elsewhere; presumably it broadcasts the 64-bit
+// value across the vector, giving every 32-bit lane the same word.
+void bmw256_16way_init( bmw256_16way_context *ctx )
+{
+   ctx->H[ 0] = m512_const1_64( 0x4041424340414243 );
+   ctx->H[ 1] = m512_const1_64( 0x4445464744454647 );
+   ctx->H[ 2] = m512_const1_64( 0x48494A4B48494A4B );
+   ctx->H[ 3] = m512_const1_64( 0x4C4D4E4F4C4D4E4F );
+   ctx->H[ 4] = m512_const1_64( 0x5051525350515253 );
+   ctx->H[ 5] = m512_const1_64( 0x5455565754555657 );
+   ctx->H[ 6] = m512_const1_64( 0x58595A5B58595A5B );
+   ctx->H[ 7] = m512_const1_64( 0x5C5D5E5F5C5D5E5F );
+   ctx->H[ 8] = m512_const1_64( 0x6061626360616263 );
+   ctx->H[ 9] = m512_const1_64( 0x6465666764656667 );
+   ctx->H[10] = m512_const1_64( 0x68696A6B68696A6B );
+   ctx->H[11] = m512_const1_64( 0x6C6D6E6F6C6D6E6F );
+   ctx->H[12] = m512_const1_64( 0x7071727370717273 );
+   ctx->H[13] = m512_const1_64( 0x7475767774757677 );
+   ctx->H[14] = m512_const1_64( 0x78797A7B78797A7B );
+   ctx->H[15] = m512_const1_64( 0x7C7D7E7F7C7D7E7F );
+   ctx->ptr       = 0;   // byte offset (per lane) into ctx->buf
+   ctx->bit_count = 0;   // total message bits absorbed per lane
+}
+
+// Absorb input into a 16-way BMW-256 context.
+//
+//   ctx  : context previously set up by bmw256_16way_init
+//   data : interleaved input, read as an array of __m512i (one vector per
+//          4 bytes of per-lane data, as implied by the `>> 2` indexing)
+//   len  : number of bytes for ONE lane, not the size of the whole buffer
+//
+// Offsets and lengths are converted to vector indices with `>> 2`, so any
+// remainder of len modulo 4 is dropped -- presumably callers always pass a
+// multiple of 4; TODO confirm.
+void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
+                          size_t len )
+{
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf;
+   __m512i htmp[16];
+   __m512i *h1, *h2;
+   size_t ptr;
+   const int buf_size = 64;  // bytes of one lane, compatible with len
+
+   ctx->bit_count += len << 3;
+   buf = ctx->buf;
+   ptr = ctx->ptr;
+   // h1 always points at the current chaining value, h2 at scratch space
+   // for the next one; they are swapped after every compressed block.
+   h1 = ctx->H;
+   h2 = htmp;
+
+   while ( len > 0 )
+   {
+      size_t clen;
+      // Take as much as fits in the remainder of the block buffer.
+      clen = buf_size - ptr;
+      if ( clen > len )
+         clen = len;
+      memcpy_512( buf + (ptr>>2), vdata, clen >> 2 );
+      vdata = vdata + (clen>>2);
+      len -= clen;
+      ptr += clen;
+      if ( ptr == buf_size )
+      {
+         // Buffer holds a full 64-byte (per lane) block: compress it.
+         __m512i *ht;
+         compress_small_16way( buf, h1, h2 );
+         ht = h1;
+         h1 = h2;
+         h2 = ht;
+         ptr = 0;
+      }
+   }
+   ctx->ptr = ptr;
+
+   // After an odd number of compressions the live state sits in htmp;
+   // copy it back into the context.
+   if ( h1 != ctx->H )
+        memcpy_512( ctx->H, h1, 16 );
+}
+
+// Finish a 16-way BMW-256 hash: pad the buffered data, append the message
+// bit length, run the final compressions and write the digest.
+//
+//   ctx : context that has absorbed the whole message via
+//         bmw256_16way_update
+//   dst : receives 8 __m512i vectors (the last 8 words of the final state,
+//         still interleaved across lanes)
+//
+// Padding layout per lane: one 32-bit word 0x00000080 right after the data,
+// zeros, then the bit count in the last 8 bytes of a block (low word from
+// ctx->bit_count, high word zero).
+void bmw256_16way_close( bmw256_16way_context *ctx, void *dst )
+{
+   __m512i *buf;
+   __m512i h1[16], h2[16], *h;
+   size_t ptr, u, v;
+   const int buf_size = 64;  // bytes of one lane, compatible with len
+
+   buf = ctx->buf;
+   ptr = ctx->ptr;
+   buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
+   ptr += 4;
+   h = ctx->H;
+
+   // The length field needs the last 8 bytes of the block.  If the padding
+   // word ended beyond offset buf_size - 8 there is no room left, so this
+   // block is zero filled and compressed and the length goes into a fresh
+   // block.  (The previous threshold of buf_size - 4 missed ptr == 60: the
+   // padding word stored at buf[14] was then overwritten by the bit count
+   // and the zero-fill length below underflowed.)
+   if (  ptr > (buf_size - 8) )
+   {
+      memset_zero_512( buf + (ptr>>2), (buf_size - ptr) >> 2 );
+      compress_small_16way( buf, h, h1 );
+      ptr = 0;
+      h = h1;
+   }
+   memset_zero_512( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
+   buf[ (buf_size - 8) >> 2 ] = _mm512_set1_epi32( ctx->bit_count );
+   buf[ (buf_size - 4) >> 2 ] = m512_zero;
+
+   compress_small_16way( buf, h, h2 );
+
+   // Final step: the state just produced becomes the message block and is
+   // compressed once more against the constant table final_s16.
+   for ( u = 0; u < 16; u ++ )
+      buf[u] = h2[u];
+
+   compress_small_16way( buf, final_s16, h1 );
+   // The digest is the upper half (words 8..15) of the final state.
+   for (u = 0, v = 16 - 8; u < 8; u ++, v ++)
+      casti_m512i(dst,u) = h1[v];
+}
+
+
+#endif // AVX512
+
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -18,16 +18,17 @@ void bmw512hash_8way(void *state, const void *input)
 int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
-   uint32_t hash[16*8] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash[16*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[49]);   // 3*16+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-//   const uint32_t Htarg = ptarget[7];
+   const uint32_t Htarg = ptarget[7];
   int thr_id = mythr->id;

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
@@ -39,7 +40,8 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
      bmw512hash_8way( hash, vdata );

      for ( int lane = 0; lane < 8; lane++ )
-      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
+      if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
+//      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
      {
          extr_lane_8x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
@@ -48,15 +50,14 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
              submit_lane_solution( work, lane_hash, mythr, lane );
          }
      }
-      n += 4;
+      n += 8;

-   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart) );

-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }
   
-
 #elif defined(BMW512_4WAY)

 //#ifdef BMW512_4WAY
@@ -72,16 +73,17 @@ void bmw512hash_4way(void *state, const void *input)
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-   uint32_t hash[16*4] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (128)));
+   uint32_t hash[16*4] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[25]);   // 3*8+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce -  4;
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-//   const uint32_t Htarg = ptarget[7];
+   const uint32_t Htarg = ptarget[7];
    int thr_id = mythr->id;  // thr_id arg is deprecated

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
@@ -92,7 +94,8 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
      bmw512hash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
+      if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
+//      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
@@ -103,9 +106,9 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
      }
      n += 4;

-   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -58,8 +58,7 @@ static const sph_u64 IV512[] = {

 #if defined(__SSE2__)

-// BMW-512 2 way 64
-
+// BMW-512 2 way 64 

 #define s2b0(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \
@@ -824,87 +823,57 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
           mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
           mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

-   dH[ 0] = _mm256_add_epi64(
-               _mm256_xor_si256( M[0],
-                  _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
-                                    _mm256_srli_epi64( qt[16], 5 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ) );
-   dH[ 1] = _mm256_add_epi64(
-               _mm256_xor_si256( M[1],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
-                                    _mm256_slli_epi64( qt[17], 8 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ) );
-   dH[ 2] = _mm256_add_epi64(
-               _mm256_xor_si256( M[2],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
-                                    _mm256_slli_epi64( qt[18], 5 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ) );
-   dH[ 3] = _mm256_add_epi64(
-               _mm256_xor_si256( M[3],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
-                                    _mm256_slli_epi64( qt[19], 5 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ) );
-   dH[ 4] = _mm256_add_epi64(
-               _mm256_xor_si256( M[4],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
-                                    _mm256_slli_epi64( qt[20], 0 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ) );
-   dH[ 5] = _mm256_add_epi64(
-               _mm256_xor_si256( M[5],
-                  _mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
-                                    _mm256_srli_epi64( qt[21], 6 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ) );
-   dH[ 6] = _mm256_add_epi64(
-               _mm256_xor_si256( M[6],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
-                                    _mm256_slli_epi64( qt[22], 6 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ) );
-   dH[ 7] = _mm256_add_epi64(
-               _mm256_xor_si256( M[7],
-                  _mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
-                                    _mm256_slli_epi64( qt[23], 2 ) ) ),
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ) );
-   dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[4], 9 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
-                 _mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
-                                   _mm256_xor_si256( qt[23], qt[ 8] ) ) );
-   dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[5], 10 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
-                                   _mm256_xor_si256( qt[16], qt[ 9] ) ) );
-   dH[10] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[6], 11 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
-                 _mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
-                                   _mm256_xor_si256( qt[17], qt[10] ) ) );
-   dH[11] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[7], 12 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
-                 _mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
-                                   _mm256_xor_si256( qt[18], qt[11] ) ) );
-   dH[12] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[0], 13 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
-                                   _mm256_xor_si256( qt[19], qt[12] ) ) );
-   dH[13] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[1], 14 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
-                                   _mm256_xor_si256( qt[20], qt[13] ) ) );
-   dH[14] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[2], 15 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
-                                   _mm256_xor_si256( qt[21], qt[14] ) ) );
-   dH[15] = _mm256_add_epi64( _mm256_add_epi64(
-              mm256_rol_64( dH[3], 16 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
-                 _mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
-                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
-} 
+
+#define DH1L( m, sl, sr, a, b, c ) \
+   _mm256_add_epi64( \
+               _mm256_xor_si256( M[m], \
+                  _mm256_xor_si256( _mm256_slli_epi64( xh, sl ), \
+                                    _mm256_srli_epi64( qt[a], sr ) ) ), \
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+
+#define DH1R( m, sl, sr, a, b, c ) \
+   _mm256_add_epi64( \
+               _mm256_xor_si256( M[m], \
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, sl ), \
+                                    _mm256_slli_epi64( qt[a], sr ) ) ), \
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+
+#define DH2L( m, rl, sl, h, a, b, c ) \
+   _mm256_add_epi64( _mm256_add_epi64( \
+       mm256_rol_64( dH[h], rl ), \
+          _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
+                 _mm256_xor_si256( _mm256_slli_epi64( xl, sl ), \
+                                   _mm256_xor_si256( qt[b], qt[c] ) ) );
+
+#define DH2R( m, rl, sr, h, a, b, c ) \
+   _mm256_add_epi64( _mm256_add_epi64( \
+       mm256_rol_64( dH[h], rl ), \
+          _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
+                 _mm256_xor_si256( _mm256_srli_epi64( xl, sr ), \
+                                   _mm256_xor_si256( qt[b], qt[c] ) ) );
+
+   dH[ 0] = DH1L(  0,  5,  5, 16, 24, 0 );
+   dH[ 1] = DH1R(  1,  7,  8, 17, 25, 1 );
+   dH[ 2] = DH1R(  2,  5,  5, 18, 26, 2 );
+   dH[ 3] = DH1R(  3,  1,  5, 19, 27, 3 );
+   dH[ 4] = DH1R(  4,  3,  0, 20, 28, 4 );
+   dH[ 5] = DH1L(  5,  6,  6, 21, 29, 5 );
+   dH[ 6] = DH1R(  6,  4,  6, 22, 30, 6 );
+   dH[ 7] = DH1R(  7, 11,  2, 23, 31, 7 );
+   dH[ 8] = DH2L(  8,  9,  8,  4, 24, 23,  8 );
+   dH[ 9] = DH2R(  9, 10,  6,  5, 25, 16,  9 );
+   dH[10] = DH2L( 10, 11,  6,  6, 26, 17, 10 );
+   dH[11] = DH2L( 11, 12,  4,  7, 27, 18, 11 );
+   dH[12] = DH2R( 12, 13,  3,  0, 28, 19, 12 );
+   dH[13] = DH2R( 13, 14,  4,  1, 29, 20, 13 );
+   dH[14] = DH2R( 14, 15,  7,  2, 30, 21, 14 );
+   dH[15] = DH2R( 15, 16,  2,  3, 31, 22, 15 );
+
+#undef DH1L
+#undef DH1R
+#undef DH2L
+#undef DH2R
+}

 static const __m256i final_b[16] =
 {
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -28,6 +28,10 @@ static const uint64_t IV512[] =

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+// 4 way 128 is handy to avoid reinterleaving in many algos.
+// If reinterleaving is necessary it may be more efficient to use
+// 2 way 256. The same transform code should work for both.
+
 static void transform_4way( cube_4way_context *sp )
 {
    int r;
@@ -201,6 +205,8 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,

 #endif // AVX512

+// 2 way 128 
+
 static void transform_2way( cube_2way_context *sp )
 {
    int r;
--- a/algo/cubehash/cube-hash-2way.c.save
+++ b/algo/cubehash/cube-hash-2way.c.save
@@ -1,203 +0,0 @@
-#if defined(__AVX2__)
-
-#include <stdbool.h>
-#include <unistd.h>
-#include <memory.h>
-#include "cube-hash-2way.h"
-
-// 2x128
-
-
-// The result of hashing 10 rounds of initial data which consists of params
-// zero padded.
-static const uint64_t IV256[] =
-{
-0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
-0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
-0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
-0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
-};
-
-static const uint64_t IV512[] =
-{
-0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
-0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
-0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
-0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
-};
-
-
-static void transform_2way( cube_2way_context *sp )
-{
-    int r;
-    const int rounds = sp->rounds;
-
-    __m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
-
-    x0 = _mm256_load_si256( (__m256i*)sp->h     );
-    x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
-    x2 = _mm256_load_si256( (__m256i*)sp->h + 2 );
-    x3 = _mm256_load_si256( (__m256i*)sp->h + 3 );
-    x4 = _mm256_load_si256( (__m256i*)sp->h + 4 );
-    x5 = _mm256_load_si256( (__m256i*)sp->h + 5 );
-    x6 = _mm256_load_si256( (__m256i*)sp->h + 6 );
-    x7 = _mm256_load_si256( (__m256i*)sp->h + 7 );
-
-    for ( r = 0; r < rounds; ++r )
-    {
-        x4 = _mm256_add_epi32( x0, x4 );
-        x5 = _mm256_add_epi32( x1, x5 );
-        x6 = _mm256_add_epi32( x2, x6 );
-        x7 = _mm256_add_epi32( x3, x7 );
-        y0 = x0;
-        y1 = x1;
-        x0 = mm256_rol_32( x2, 7 );
-        x1 = mm256_rol_32( x3, 7 );
-        x2 = mm256_rol_32( y0, 7 );
-        x3 = mm256_rol_32( y1, 7 );
-        x0 = _mm256_xor_si256( x0, x4 );
-        x1 = _mm256_xor_si256( x1, x5 );
-        x2 = _mm256_xor_si256( x2, x6 );
-        x3 = _mm256_xor_si256( x3, x7 );
-        x4 = mm256_swap64_128( x4 );
-        x5 = mm256_swap64_128( x5 );
-        x6 = mm256_swap64_128( x6 );
-        x7 = mm256_swap64_128( x7 );
-        x4 = _mm256_add_epi32( x0, x4 );
-        x5 = _mm256_add_epi32( x1, x5 );
-        x6 = _mm256_add_epi32( x2, x6 );
-        x7 = _mm256_add_epi32( x3, x7 );
-        y0 = x0;
-        y1 = x2;
-        x0 = mm256_rol_32( x1, 11 );
-        x1 = mm256_rol_32( y0, 11 );
-        x2 = mm256_rol_32( x3, 11 );
-        x3 = mm256_rol_32( y1, 11 );
-        x0 = _mm256_xor_si256( x0, x4 );
-        x1 = _mm256_xor_si256( x1, x5 );
-        x2 = _mm256_xor_si256( x2, x6 );
-        x3 = _mm256_xor_si256( x3, x7 );
-        x4 = mm256_swap32_64( x4 );
-        x5 = mm256_swap32_64( x5 );
-        x6 = mm256_swap32_64( x6 );
-        x7 = mm256_swap32_64( x7 );
-    }
-
-    _mm256_store_si256( (__m256i*)sp->h,     x0 );
-    _mm256_store_si256( (__m256i*)sp->h + 1, x1 );
-    _mm256_store_si256( (__m256i*)sp->h + 2, x2 );
-    _mm256_store_si256( (__m256i*)sp->h + 3, x3 );
-    _mm256_store_si256( (__m256i*)sp->h + 4, x4 );
-    _mm256_store_si256( (__m256i*)sp->h + 5, x5 );
-    _mm256_store_si256( (__m256i*)sp->h + 6, x6 );
-    _mm256_store_si256( (__m256i*)sp->h + 7, x7 );
-
-}
-
-int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
-                    int blockbytes )
-{
-    __m256i *h = (__m256i*)sp->h;
-    __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
-                                                : (__m128i*)IV256 );
-    sp->hashlen   = hashbitlen/128;
-    sp->blocksize = blockbytes/16;
-    sp->rounds    = rounds;
-    sp->pos       = 0;
-
-    h[ 0] = m256_const1_128( iv[0] );
-    h[ 1] = m256_const1_128( iv[1] );
-    h[ 2] = m256_const1_128( iv[2] );
-    h[ 3] = m256_const1_128( iv[3] );
-    h[ 4] = m256_const1_128( iv[4] );
-    h[ 5] = m256_const1_128( iv[5] );
-    h[ 6] = m256_const1_128( iv[6] );
-    h[ 7] = m256_const1_128( iv[7] );
-    h[ 0] = m256_const1_128( iv[0] );
-    h[ 1] = m256_const1_128( iv[1] );
-    h[ 2] = m256_const1_128( iv[2] );
-    h[ 3] = m256_const1_128( iv[3] );
-    h[ 4] = m256_const1_128( iv[4] );
-    h[ 5] = m256_const1_128( iv[5] );
-    h[ 6] = m256_const1_128( iv[6] );
-    h[ 7] = m256_const1_128( iv[7] );
-    
-    return 0;
-}
-
-
-int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
-{
-    const int len = size >> 4;
-    const __m256i *in = (__m256i*)data;
-    int i;
-
-    // It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
-    // Current usage sata is either 64 or 80 bytes.
-
-    for ( i = 0; i < len; i++ )
-    {
-        sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
-        sp->pos++;
-        if ( sp->pos == sp->blocksize )
-        {
-           transform_2way( sp );
-           sp->pos = 0;
-        }
-    }
-    return 0;
-}
-
-int cube_2way_close( cube_2way_context *sp, void *output )
-{
-    __m256i *hash = (__m256i*)output;
-    int i;
-
-    // pos is zero for 64 byte data, 1 for 80 byte data.
-    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
-                                   m256_const2_64( 0, 0x0000000000000080 ) );
-    transform_2way( sp );
-
-    sp->h[7] = _mm256_xor_si256( sp->h[7],
-                                   m256_const2_64( 0x0000000100000000, 0 ) );
-
-    for ( i = 0; i < 10; ++i )           transform_2way( sp );
-
-    memcpy( hash, sp->h, sp->hashlen<<5 );
-    return 0;
-}
-
-int cube_2way_update_close( cube_2way_context *sp, void *output,
-                               const void *data, size_t size )
-{
-    const int len = size >> 4;
-    const __m256i *in = (__m256i*)data;
-    __m256i *hash = (__m256i*)output;
-    int i;
-
-    for ( i = 0; i < len; i++ )
-    {
-        sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
-        sp->pos++;
-        if ( sp->pos == sp->blocksize )
-        {
-           transform_2way( sp );
-           sp->pos = 0;
-        }
-    }
-
-    // pos is zero for 64 byte data, 1 for 80 byte data.
-    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
-                                    m256_const2_64( 0, 0x0000000000000080 ) );
-    transform_2way( sp );
-
-    sp->h[7] = _mm256_xor_si256( sp->h[7],
-                                    m256_const2_64( 0x0000000100000000, 0 ) );
-
-    for ( i = 0; i < 10; ++i )    transform_2way( sp );
-
-    memcpy( hash, sp->h, sp->hashlen<<5 );
-    return 0;
-}
-
-#endif
--- a/algo/cubehash/cube-hash-2way.h.save
+++ b/algo/cubehash/cube-hash-2way.h.save
@@ -1,36 +0,0 @@
-#ifndef CUBE_HASH_2WAY_H__
-#define CUBE_HASH_2WAY_H__
-
-#if defined(__AVX2__)
-
-#include <stdint.h>
-#include "simd-utils.h"
-
-// 2x128, 2 way parallel SSE2
-
-struct _cube_2way_context
-{
-    __m256i h[8];
-    int hashlen;           // __m128i
-    int rounds;
-    int blocksize;         // __m128i
-    int pos;               // number of __m128i read into x from current block
-} __attribute__ ((aligned (64)));
-
-typedef struct _cube_2way_context cube_2way_context;
-
-int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
-                       int blockbytes );
-// reinitialize context with same parameters, much faster.
-int cube_2way_reinit( cube_2way_context *sp );
-
-int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
-
-int cube_2way_close( cube_2way_context *sp, void *output );
-
-int cube_2way_update_close( cube_2way_context *sp, void *output,
-                            const void *data, size_t size );
-
-
-#endif
-#endif
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -1,6 +1,7 @@
 #include <string.h>
 #include <immintrin.h>
 #include "luffa-hash-2way.h"
+#include <stdio.h>

 #if defined(__AVX2__)

@@ -318,22 +319,6 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
    chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
    chainv[5] = _mm512_xor_si512( chainv[5], msg1 );

-    MULT24W( chainv[2], chainv[3], MASK );
-    chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] );
-    chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );
-
-    MULT24W( chainv[0], chainv[1], MASK );
-    chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
-    chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
-
-    MULT24W( msg0, msg1, MASK );
-    chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
-    chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
-
-    MULT24W( msg0, msg1, MASK );
-    chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
-    chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
-
    MULT24W( msg0, msg1, MASK );
    chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
    chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
@@ -345,14 +330,10 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
    MULT24W( msg0, msg1, MASK );

    // replace with ror
-    chainv[3] = _mm512_or_si512( _mm512_slli_epi32( chainv[3],  1 ),
-                                 _mm512_srli_epi32( chainv[3], 31 ) );
-    chainv[5] = _mm512_or_si512( _mm512_slli_epi32( chainv[5],  2 ),
-                                 _mm512_srli_epi32( chainv[5], 30 ) );
-    chainv[7] = _mm512_or_si512( _mm512_slli_epi32( chainv[7],  3 ),
-                                 _mm512_srli_epi32( chainv[7], 29 ) );
-    chainv[9] = _mm512_or_si512( _mm512_slli_epi32( chainv[9],  4 ),
-                                 _mm512_srli_epi32( chainv[9], 28 ) );
+    chainv[3] = _mm512_rol_epi32( chainv[3], 1 );
+    chainv[5] = _mm512_rol_epi32( chainv[5], 2 );
+    chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
+    chainv[9] = _mm512_rol_epi32( chainv[9], 4 );

    NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
                x[0], x[1], x[2], x[3],
@@ -394,7 +375,7 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )

 void finalization512_4way( luffa_4way_context *state, uint32 *b )
 {
-    uint32 hash[8*4] __attribute((aligned(128)));
+    uint32_t hash[8*4] __attribute((aligned(128)));
    __m512i* chainv = state->chainv;
    __m512i t[2];
    __m512i zero[2];
@@ -424,7 +405,7 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
    t[1] = _mm512_shuffle_epi32( t[1], 27 );

    _mm512_store_si512( (__m512i*)&hash[0], t[0] );
-    _mm512_store_si512( (__m512i*)&hash[8], t[1] );
+    _mm512_store_si512( (__m512i*)&hash[16], t[1] );

    casti_m512i( b, 0 ) = _mm512_shuffle_epi8(
                                  casti_m512i( hash, 0 ), shuff_bswap32 );
@@ -448,7 +429,7 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
    t[1] = _mm512_shuffle_epi32( t[1], 27 );

    _mm512_store_si512( (__m512i*)&hash[0], t[0] );
-    _mm512_store_si512( (__m512i*)&hash[8], t[1] );
+    _mm512_store_si512( (__m512i*)&hash[16], t[1] );

    casti_m512i( b, 2 ) = _mm512_shuffle_epi8(
                                  casti_m512i( hash, 0 ), shuff_bswap32 );
@@ -493,8 +474,8 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
                                   0x2c2d2e2f28292a2b, 0x2425262720212223,
                                   0x1c1d1e1f18191a1b, 0x1415161710111213,
                                   0x0c0d0e0f08090a0b, 0x0405060700010203 );
-         
-    state-> rembytes = (int)len & 0x1F;
+
+    state->rembytes = (int)len & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
@@ -578,8 +559,9 @@ int luffa_4way_update_close( luffa_4way_context *state,
    }

    finalization512_4way( state, (uint32*)output );
+
    if ( state->hashbitlen > 512 )
-        finalization512_4way( state, (uint32*)( output+32 ) );
+        finalization512_4way( state, (uint32*)( output+64 ) );

    return 0;
 }
@@ -860,14 +842,10 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )

    MULT2( msg0, msg1, MASK );

-    chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3],  1 ),
-                                 _mm256_srli_epi32( chainv[3], 31 ) );
-    chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5],  2 ),
-                                 _mm256_srli_epi32( chainv[5], 30 ) );
-    chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7],  3 ),
-                                 _mm256_srli_epi32( chainv[7], 29 ) );
-    chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9],  4 ),
-                                 _mm256_srli_epi32( chainv[9], 28 ) );
+    chainv[3] = mm256_rol_32( chainv[3], 1 );
+    chainv[5] = mm256_rol_32( chainv[5], 2 );
+    chainv[7] = mm256_rol_32( chainv[7], 3 );
+    chainv[9] = mm256_rol_32( chainv[9], 4 );

    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
                x[0], x[1], x[2], x[3],
@@ -1093,6 +1071,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
    }

    finalization512_2way( state, (uint32*)output );
+
    if ( state->hashbitlen > 512 )
        finalization512_2way( state, (uint32*)( output+32 ) );

--- a/algo/luffa/luffa-hash-2way.c.save
+++ b/algo/luffa/luffa-hash-2way.c.save
@@ -1,573 +0,0 @@
-#include <string.h>
-#include <immintrin.h>
-#include "luffa-hash-2way.h"
-
-#if defined(__AVX2__)
-
-#include "simd-utils.h"
-
-#define cns(i)  m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
-
-#define ADD_CONSTANT(a,b,c0,c1)\
-    a = _mm256_xor_si256(a,c0);\
-    b = _mm256_xor_si256(b,c1);\
-
-#define MULT2( a0, a1, mask ) \
-do { \
-  __m256i b = _mm256_xor_si256( a0, \
-                   _mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
-  a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
-  a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) );  \
-} while(0)
-
-// confirm pointer arithmetic
-// ok but use array indexes
-#define STEP_PART(x,c0,c1,t)\
-    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
-    SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
-    MIXWORD(*x,*(x+4),*t,*(t+1));\
-    MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
-    MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
-    MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
-    ADD_CONSTANT(*x, *(x+4), c0, c1);
-
-#define SUBCRUMB(a0,a1,a2,a3,t)\
-    t  = _mm256_load_si256(&a0);\
-    a0 = _mm256_or_si256(a0,a1);\
-    a2 = _mm256_xor_si256(a2,a3);\
-    a1 = _mm256_andnot_si256(a1, m256_neg1 );\
-    a0 = _mm256_xor_si256(a0,a3);\
-    a3 = _mm256_and_si256(a3,t);\
-    a1 = _mm256_xor_si256(a1,a3);\
-    a3 = _mm256_xor_si256(a3,a2);\
-    a2 = _mm256_and_si256(a2,a0);\
-    a0 = _mm256_andnot_si256(a0, m256_neg1 );\
-    a2 = _mm256_xor_si256(a2,a1);\
-    a1 = _mm256_or_si256(a1,a3);\
-    t  = _mm256_xor_si256(t,a1);\
-    a3 = _mm256_xor_si256(a3,a2);\
-    a2 = _mm256_and_si256(a2,a1);\
-    a1 = _mm256_xor_si256(a1,a0);\
-    a0 = _mm256_load_si256(&t);\
-
-#define MIXWORD(a,b,t1,t2)\
-    b  = _mm256_xor_si256(a,b);\
-    t1 = _mm256_slli_epi32(a,2);\
-    t2 = _mm256_srli_epi32(a,30);\
-     a = _mm256_or_si256(t1,t2);\
-    a  = _mm256_xor_si256(a,b);\
-    t1 = _mm256_slli_epi32(b,14);\
-    t2 = _mm256_srli_epi32(b,18);\
-    b  = _mm256_or_si256(t1,t2);\
-    b  = _mm256_xor_si256(a,b);\
-    t1 = _mm256_slli_epi32(a,10);\
-    t2 = _mm256_srli_epi32(a,22);\
-    a  = _mm256_or_si256(t1,t2);\
-    a  = _mm256_xor_si256(a,b);\
-    t1 = _mm256_slli_epi32(b,1);\
-    t2 = _mm256_srli_epi32(b,31);\
-    b  = _mm256_or_si256(t1,t2);
-
-#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
-    a1 = _mm256_shuffle_epi32(a1,147);\
-    t0 = _mm256_load_si256(&a1);\
-    a1 = _mm256_unpacklo_epi32(a1,a0);\
-    t0 = _mm256_unpackhi_epi32(t0,a0);\
-    t1 = _mm256_shuffle_epi32(t0,78);\
-    a0 = _mm256_shuffle_epi32(a1,78);\
-    SUBCRUMB(t1,t0,a0,a1,tmp0);\
-    t0 = _mm256_unpacklo_epi32(t0,t1);\
-    a1 = _mm256_unpacklo_epi32(a1,a0);\
-    a0 = _mm256_load_si256(&a1);\
-    a0 = _mm256_unpackhi_epi64(a0,t0);\
-    a1 = _mm256_unpacklo_epi64(a1,t0);\
-    a1 = _mm256_shuffle_epi32(a1,57);\
-    MIXWORD(a0,a1,tmp0,tmp1);\
-    ADD_CONSTANT(a0,a1,c0,c1);
-
-#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
-    s2 = _mm256_load_si256(&r1);\
-    q2 = _mm256_load_si256(&p1);\
-    r2 = _mm256_shuffle_epi32(r2,216);\
-    p2 = _mm256_shuffle_epi32(p2,216);\
-    r1 = _mm256_unpacklo_epi32(r1,r0);\
-    p1 = _mm256_unpacklo_epi32(p1,p0);\
-    s2 = _mm256_unpackhi_epi32(s2,r0);\
-    q2 = _mm256_unpackhi_epi32(q2,p0);\
-    s0 = _mm256_load_si256(&r2);\
-    q0 = _mm256_load_si256(&p2);\
-    r2 = _mm256_unpacklo_epi64(r2,r1);\
-    p2 = _mm256_unpacklo_epi64(p2,p1);\
-    s1 = _mm256_load_si256(&s0);\
-    q1 = _mm256_load_si256(&q0);\
-    s0 = _mm256_unpackhi_epi64(s0,r1);\
-    q0 = _mm256_unpackhi_epi64(q0,p1);\
-    r2 = _mm256_shuffle_epi32(r2,225);\
-    p2 = _mm256_shuffle_epi32(p2,225);\
-    r0 = _mm256_load_si256(&s1);\
-    p0 = _mm256_load_si256(&q1);\
-    s0 = _mm256_shuffle_epi32(s0,225);\
-    q0 = _mm256_shuffle_epi32(q0,225);\
-    s1 = _mm256_unpacklo_epi64(s1,s2);\
-    q1 = _mm256_unpacklo_epi64(q1,q2);\
-    r0 = _mm256_unpackhi_epi64(r0,s2);\
-    p0 = _mm256_unpackhi_epi64(p0,q2);\
-    s2 = _mm256_load_si256(&r0);\
-    q2 = _mm256_load_si256(&p0);\
-    s3 = _mm256_load_si256(&r2);\
-    q3 = _mm256_load_si256(&p2);\
-
-#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
-    s0 = _mm256_load_si256(&r0);\
-    q0 = _mm256_load_si256(&p0);\
-    s1 = _mm256_load_si256(&r2);\
-    q1 = _mm256_load_si256(&p2);\
-    r0 = _mm256_unpackhi_epi32(r0,r1);\
-    p0 = _mm256_unpackhi_epi32(p0,p1);\
-    r2 = _mm256_unpackhi_epi32(r2,r3);\
-    p2 = _mm256_unpackhi_epi32(p2,p3);\
-    s0 = _mm256_unpacklo_epi32(s0,r1);\
-    q0 = _mm256_unpacklo_epi32(q0,p1);\
-    s1 = _mm256_unpacklo_epi32(s1,r3);\
-    q1 = _mm256_unpacklo_epi32(q1,p3);\
-    r1 = _mm256_load_si256(&r0);\
-    p1 = _mm256_load_si256(&p0);\
-    r0 = _mm256_unpackhi_epi64(r0,r2);\
-    p0 = _mm256_unpackhi_epi64(p0,p2);\
-    s0 = _mm256_unpackhi_epi64(s0,s1);\
-    q0 = _mm256_unpackhi_epi64(q0,q1);\
-    r1 = _mm256_unpacklo_epi64(r1,r2);\
-    p1 = _mm256_unpacklo_epi64(p1,p2);\
-    s2 = _mm256_load_si256(&r0);\
-    q2 = _mm256_load_si256(&p0);\
-    s1 = _mm256_load_si256(&r1);\
-    q1 = _mm256_load_si256(&p1);\
-
-#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
-    s1 = _mm256_load_si256(&r3);\
-    q1 = _mm256_load_si256(&p3);\
-    s3 = _mm256_load_si256(&r3);\
-    q3 = _mm256_load_si256(&p3);\
-    s1 = _mm256_unpackhi_epi32(s1,r2);\
-    q1 = _mm256_unpackhi_epi32(q1,p2);\
-    s3 = _mm256_unpacklo_epi32(s3,r2);\
-    q3 = _mm256_unpacklo_epi32(q3,p2);\
-    s0 = _mm256_load_si256(&s1);\
-    q0 = _mm256_load_si256(&q1);\
-    s2 = _mm256_load_si256(&s3);\
-    q2 = _mm256_load_si256(&q3);\
-    r3 = _mm256_load_si256(&r1);\
-    p3 = _mm256_load_si256(&p1);\
-    r1 = _mm256_unpacklo_epi32(r1,r0);\
-    p1 = _mm256_unpacklo_epi32(p1,p0);\
-    r3 = _mm256_unpackhi_epi32(r3,r0);\
-    p3 = _mm256_unpackhi_epi32(p3,p0);\
-    s0 = _mm256_unpackhi_epi64(s0,r3);\
-    q0 = _mm256_unpackhi_epi64(q0,p3);\
-    s1 = _mm256_unpacklo_epi64(s1,r3);\
-    q1 = _mm256_unpacklo_epi64(q1,p3);\
-    s2 = _mm256_unpackhi_epi64(s2,r1);\
-    q2 = _mm256_unpackhi_epi64(q2,p1);\
-    s3 = _mm256_unpacklo_epi64(s3,r1);\
-    q3 = _mm256_unpacklo_epi64(q3,p1);
-
-#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
-    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
-
-/* initial values of chaining variables */
-static const uint32 IV[40] __attribute((aligned(32))) = {
-    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
-    0xdef610bb,0xee058139,0x90152df4,0x6e292011,
-    0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
-    0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
-    0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
-    0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
-    0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
-    0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
-    0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
-    0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
-};
-
-/* Round Constants */
-static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
-    0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
-    0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
-    0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
-    0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
-    0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
-    0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
-    0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
-    0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
-    0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
-    0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
-    0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
-    0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
-    0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
-    0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
-    0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
-    0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
-    0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
-    0x00000000,0x00000000,0x00000000,0x5090d577,
-    0x00000000,0x00000000,0x00000000,0xac11d7fa,
-    0x00000000,0x00000000,0x00000000,0x2d1925ab,
-    0x00000000,0x00000000,0x00000000,0x1bcb66f2,
-    0x00000000,0x00000000,0x00000000,0xb46496ac,
-    0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
-    0x00000000,0x00000000,0x00000000,0xd1925ab0,
-    0x00000000,0x00000000,0x00000000,0x78602649,
-    0x00000000,0x00000000,0x00000000,0x29131ab6,
-    0x00000000,0x00000000,0x00000000,0x8edae952,
-    0x00000000,0x00000000,0x00000000,0x0fc053c3,
-    0x00000000,0x00000000,0x00000000,0x3b6ba548,
-    0x00000000,0x00000000,0x00000000,0x3f014f0c,
-    0x00000000,0x00000000,0x00000000,0xedae9520,
-    0x00000000,0x00000000,0x00000000,0xfc053c31
-};
-
-
-
-/***************************************************/
-/* Round function         */
-/* state: hash context    */
-
-void rnd512_2way( luffa_2way_context *state, __m256i *msg )
-{
-    __m256i t0, t1;
-    __m256i *chainv = state->chainv;
-    __m256i msg0, msg1;
-    __m256i tmp[2];
-    __m256i x[8];
-    const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff );
-
-    t0 = chainv[0];
-    t1 = chainv[1];
-
-    t0 = _mm256_xor_si256( t0, chainv[2] );
-    t1 = _mm256_xor_si256( t1, chainv[3] );
-    t0 = _mm256_xor_si256( t0, chainv[4] );
-    t1 = _mm256_xor_si256( t1, chainv[5] );
-    t0 = _mm256_xor_si256( t0, chainv[6] );
-    t1 = _mm256_xor_si256( t1, chainv[7] );
-    t0 = _mm256_xor_si256( t0, chainv[8] );
-    t1 = _mm256_xor_si256( t1, chainv[9] );
-
-    MULT2( t0, t1, MASK );
-
-    msg0 = _mm256_shuffle_epi32( msg[0], 27 );
-    msg1 = _mm256_shuffle_epi32( msg[1], 27 );
-
-    chainv[0] = _mm256_xor_si256( chainv[0], t0 );
-    chainv[1] = _mm256_xor_si256( chainv[1], t1 );
-    chainv[2] = _mm256_xor_si256( chainv[2], t0 );
-    chainv[3] = _mm256_xor_si256( chainv[3], t1 );
-    chainv[4] = _mm256_xor_si256( chainv[4], t0 );
-    chainv[5] = _mm256_xor_si256( chainv[5], t1 );
-    chainv[6] = _mm256_xor_si256( chainv[6], t0 );
-    chainv[7] = _mm256_xor_si256( chainv[7], t1 );
-    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
-    chainv[9] = _mm256_xor_si256( chainv[9], t1 );
-
-    t0 = chainv[0];
-    t1 = chainv[1];
-
-    MULT2( chainv[0], chainv[1], MASK );
-    chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
-    chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
-
-    MULT2( chainv[2], chainv[3], MASK );
-    chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
-    chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
-
-    MULT2( chainv[4], chainv[5], MASK );
-    chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
-    chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
-
-    MULT2( chainv[6], chainv[7], MASK );
-    chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
-    chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
-
-    MULT2( chainv[8], chainv[9], MASK );
-    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
-    chainv[9] = _mm256_xor_si256( chainv[9], t1 );
-
-    t0 = chainv[8];
-    t1 = chainv[9];
-
-    MULT2( chainv[8], chainv[9], MASK );
-    chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
-    chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
-
-    MULT2( chainv[6], chainv[7], MASK );
-    chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
-    chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
-
-    MULT2( chainv[4], chainv[5], MASK );
-    chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
-    chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
-
-    MULT2( chainv[2], chainv[3], MASK );
-    chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
-    chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
-
-    MULT2( chainv[0], chainv[1], MASK );
-    chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
-    chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
-
-    MULT2( msg0, msg1, MASK );
-    chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
-    chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
-
-    MULT2( msg0, msg1, MASK );
-    chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
-    chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
-
-    MULT2( msg0, msg1, MASK );
-    chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
-    chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
-
-    MULT2( msg0, msg1, MASK );
-    chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
-    chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
-
-    MULT2( msg0, msg1, MASK );
-
-    chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3],  1 ),
-                                 _mm256_srli_epi32( chainv[3], 31 ) );
-    chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5],  2 ),
-                                 _mm256_srli_epi32( chainv[5], 30 ) );
-    chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7],  3 ),
-                                 _mm256_srli_epi32( chainv[7], 29 ) );
-    chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9],  4 ),
-                                 _mm256_srli_epi32( chainv[9], 28 ) );
-
-    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
-                x[0], x[1], x[2], x[3],
-                chainv[1],chainv[3],chainv[5],chainv[7],
-                x[4], x[5], x[6], x[7] );
-
-    STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
-    STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
-    STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
-    STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
-    STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
-    STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
-    STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
-    STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
-
-    MIXTON1024( x[0], x[1], x[2], x[3],
-                chainv[0], chainv[2], chainv[4],chainv[6],
-                x[4], x[5], x[6], x[7],
-                chainv[1],chainv[3],chainv[5],chainv[7]);
-
-    /* Process last 256-bit block */
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
-                tmp[0], tmp[1] );
-}
-
-/***************************************************/
-/* Finalization function  */
-/* state: hash context    */
-/* b[8]: hash values      */
-
-void finalization512_2way( luffa_2way_context *state, uint32 *b )
-{
-    uint32 hash[8] __attribute((aligned(64)));
-    __m256i* chainv = state->chainv;
-    __m256i t[2];
-    __m256i zero[2];
-    zero[0] = zero[1] = m256_zero;
-    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
-                                                 0x1415161710111213,
-                                                 0x0c0d0e0f08090a0b,
-                                                 0x0405060700010203 );
-    /*---- blank round with m=0 ----*/
-    rnd512_2way( state, zero );
-
-    t[0] = chainv[0];
-    t[1] = chainv[1];
-
-    t[0] = _mm256_xor_si256( t[0], chainv[2] );
-    t[1] = _mm256_xor_si256( t[1], chainv[3] );
-    t[0] = _mm256_xor_si256( t[0], chainv[4] );
-    t[1] = _mm256_xor_si256( t[1], chainv[5] );
-    t[0] = _mm256_xor_si256( t[0], chainv[6] );
-    t[1] = _mm256_xor_si256( t[1], chainv[7] );
-    t[0] = _mm256_xor_si256( t[0], chainv[8] );
-    t[1] = _mm256_xor_si256( t[1], chainv[9] );
-
-    t[0] = _mm256_shuffle_epi32( t[0], 27 );
-    t[1] = _mm256_shuffle_epi32( t[1], 27 );
-
-    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
-    _mm256_store_si256( (__m256i*)&hash[8], t[1] );
-
-    casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
-                                  casti_m256i( hash, 0 ), shuff_bswap32 );
-    casti_m256i( b, 1 ) = _mm256_shuffle_epi8( 
-                                  casti_m256i( hash, 1 ), shuff_bswap32 );
-
-    rnd512_2way( state, zero );
-
-    t[0] = chainv[0];
-    t[1] = chainv[1];
-    t[0] = _mm256_xor_si256( t[0], chainv[2] );
-    t[1] = _mm256_xor_si256( t[1], chainv[3] );
-    t[0] = _mm256_xor_si256( t[0], chainv[4] );
-    t[1] = _mm256_xor_si256( t[1], chainv[5] );
-    t[0] = _mm256_xor_si256( t[0], chainv[6] );
-    t[1] = _mm256_xor_si256( t[1], chainv[7] );
-    t[0] = _mm256_xor_si256( t[0], chainv[8] );
-    t[1] = _mm256_xor_si256( t[1], chainv[9] );
-
-    t[0] = _mm256_shuffle_epi32( t[0], 27 );
-    t[1] = _mm256_shuffle_epi32( t[1], 27 );
-
-    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
-    _mm256_store_si256( (__m256i*)&hash[8], t[1] );
-
-    casti_m256i( b, 2 ) = _mm256_shuffle_epi8( 
-                                  casti_m256i( hash, 0 ), shuff_bswap32 );
-    casti_m256i( b, 3 ) = _mm256_shuffle_epi8( 
-                                  casti_m256i( hash, 1 ), shuff_bswap32 );
-}
-
-int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
-{
-    state->hashbitlen = hashbitlen;
-    __m128i *iv = (__m128i*)IV;
-    
-    state->chainv[0] = m256_const1_128( iv[0] );
-    state->chainv[1] = m256_const1_128( iv[1] );
-    state->chainv[2] = m256_const1_128( iv[2] );
-    state->chainv[3] = m256_const1_128( iv[3] );
-    state->chainv[4] = m256_const1_128( iv[4] );
-    state->chainv[5] = m256_const1_128( iv[5] );
-    state->chainv[6] = m256_const1_128( iv[6] );
-    state->chainv[7] = m256_const1_128( iv[7] );
-    state->chainv[8] = m256_const1_128( iv[8] );
-    state->chainv[9] = m256_const1_128( iv[9] );
-
-    ((__m256i*)state->buffer)[0] = m256_zero;
-    ((__m256i*)state->buffer)[1] = m256_zero;
-
-    return 0;
-}
-
-// Do not call luffa_update_close after having called luffa_update.
-// Once luffa_update has been called only call luffa_update or luffa_close.
-int luffa_2way_update( luffa_2way_context *state, const void *data,
-                       size_t len )
-{
-    __m256i *vdata  = (__m256i*)data;
-    __m256i *buffer = (__m256i*)state->buffer;
-    __m256i msg[2];
-    int i;
-    int blocks = (int)len >> 5;
-    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
-                                                 0x1415161710111213,
-                                                 0x0c0d0e0f08090a0b,
-                                                 0x0405060700010203 );
-    state-> rembytes = (int)len & 0x1F;
-
-    // full blocks
-    for ( i = 0; i < blocks; i++, vdata+=2 )
-    {
-       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
-       rnd512_2way( state, msg );
-    }
-
-    // 16 byte partial block exists for 80 byte len
-    // store in buffer for transform in final for midstate to work
-    if ( state->rembytes  )
-    {
-      // remaining data bytes
-      buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
-      buffer[1] = m256_const2_64( 0, 0x0000000080000000 );
-    }
-    return 0;
-}
-
-int luffa_2way_close( luffa_2way_context *state, void *hashval )
-{
-    __m256i *buffer = (__m256i*)state->buffer;
-    __m256i msg[2];
-
-    // transform pad block
-    if ( state->rembytes )
-      // not empty, data is in buffer
-      rnd512_2way( state, buffer );
-    else
-    {     // empty pad block, constant data
-      msg[0] = m256_const2_64( 0, 0x0000000080000000 );
-      msg[1] = m256_zero;
-      rnd512_2way( state, msg );
-    }
-    finalization512_2way( state, (uint32*)hashval );
-
-    if ( state->hashbitlen > 512 )
-        finalization512_2way( state, (uint32*)( hashval+32 ) );
-    return 0;
-}
-
-int luffa_2way_update_close( luffa_2way_context *state,
-                 void *output, const void *data, size_t inlen )
-{
-// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
-    const __m256i *vdata  = (__m256i*)data;
-    __m256i msg[2];
-    int i;
-    const int blocks = (int)( inlen >> 5 );
-    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
-                                                 0x1415161710111213,
-                                                 0x0c0d0e0f08090a0b,
-                                                 0x0405060700010203 );
-
-    state->rembytes = inlen & 0x1F;
-
-    // full blocks
-    for ( i = 0; i < blocks; i++, vdata+=2 )
-    {
-       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
-       rnd512_2way( state, msg );
-    }
-
-    // 16 byte partial block exists for 80 byte len
-    if ( state->rembytes  )
-    {
-       // padding of partial block
-       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = m256_const2_64( 0, 0x0000000080000000 );
-       rnd512_2way( state, msg );
-    }
-    else
-    {
-       // empty pad block
-       msg[0] = m256_const2_64( 0, 0x0000000080000000 );
-       msg[1] = m256_zero;
-       rnd512_2way( state, msg );
-    }
-
-    finalization512_2way( state, (uint32*)output );
-    if ( state->hashbitlen > 512 )
-        finalization512_2way( state, (uint32*)( output+32 ) );
-
-    return 0;
-}
-
-#endif
--- a/algo/luffa/luffa-hash-2way.h.save
+++ b/algo/luffa/luffa-hash-2way.h.save
@@ -1,69 +0,0 @@
-#if !defined(LUFFA_HASH_2WAY_H__)
-#define LUFFA_HASH_2WAY_H__ 1
-/*
- * luffa_for_sse2.h
- * Version 2.0 (Sep 15th 2009)
- *
- * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
- *
- * Hitachi, Ltd. is the owner of this software and hereby grant
- * the U.S. Government and any interested party the right to use
- * this software for the purposes of the SHA-3 evaluation process,
- * notwithstanding that this software is copyrighted.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#if defined(__AVX2__)
-
-#include <immintrin.h>
-#include "algo/sha/sha3-defs.h"
-#include "simd-utils.h"
-
-/* The length of digests*/
-#define DIGEST_BIT_LEN_224 224
-#define DIGEST_BIT_LEN_256 256
-#define DIGEST_BIT_LEN_384 384
-#define DIGEST_BIT_LEN_512 512
-
-/*********************************/
-/* The parameters of Luffa       */
-#define MSG_BLOCK_BIT_LEN 256  /*The bit length of a message block*/
-#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
-                                                     * of a message block*/
-
-/* The number of blocks in Luffa */
-#define WIDTH_224 3
-#define WIDTH_256 3
-#define WIDTH_384 4
-#define WIDTH_512 5
-
-/* The limit of the length of message */
-#define LIMIT_224 64
-#define LIMIT_256 64
-#define LIMIT_384 128
-#define LIMIT_512 128
-/*********************************/
-
-typedef struct {
-    uint32 buffer[8*2] __attribute((aligned(64)));
-    __m256i chainv[10] __attribute((aligned(32)));   /* Chaining values */
-    int hashbitlen;
-    int rembytes;
-} luffa_2way_context;
-
-int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
-int luffa_2way_update( luffa_2way_context *state, const void *data,
-                       size_t len );
-int luffa_2way_close( luffa_2way_context *state, void *hashval );
-int luffa_2way_update_close( luffa_2way_context *state, void *output,
-                                   const void *data, size_t inlen );
-
-#endif
-#endif
--- a/algo/lyra2/lyra2-hash-2way.c
+++ b/algo/lyra2/lyra2-hash-2way.c
@@ -0,0 +1,715 @@
+/**
+ * Implementation of the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <mm_malloc.h>
+#include "compat.h"
+#include "lyra2.h"
+#include "sponge.h"
+
+/**
+ * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
+ * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
+ * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
+ * integer parameters (each copied as a 64-bit value) in the order they are provided, plus the value
+ * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
+ * @param wholeMatrix Caller-supplied buffer used as the memory matrix (nothing is allocated here)
+ * @param K The derived key to be output by the algorithm
+ * @param kLen Desired key length
+ * @param pwd User password
+ * @param pwdlen Password length
+ * @param salt Salt
+ * @param saltlen Salt length
+ * @param timeCost Parameter to determine the processing time (T)
+ * @param nRows Number of rows of the memory matrix (R)
+ * @param nCols Number of columns of the memory matrix (C)
+ *
+ * @return always 0 (there is no error path in this function)
+ */
+
+int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
+               const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+               const uint64_t timeCost, const uint64_t nRows,
+               const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[16];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+//   int64_t i; //auxiliary iteration counter
+   int64_t v64; // 64bit var for memcpy
+   //====================================================================/
+
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //The memory matrix is the caller's wholeMatrix buffer; no allocation happens here
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+//   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+   uint64_t *ptrWord = wholeMatrix;
+
+//   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
+   //=== Getting the password + salt + basil padded with 10*1 ==========//
+   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+   //but this ensures that the password copied locally will be overwritten as soon as possible
+
+   //Number of blocks needed to hold the password, salt, basil and padding
+   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   byte *ptrByte = (byte*) wholeMatrix;
+
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;
+
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;
+   //Zeroes the rest of the input blocks, from the end of the salt onwards
+   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+                       - (saltlen + pwdlen) );
+
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = pwdlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = saltlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = timeCost;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nRows;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nCols;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the basil
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+// from here on it's all simd access to state and matrix
+// define vector pointers and adjust sizes and pointer offsets
+
+   //================= Initializing the Sponge State ====================//
+   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+
+//   initState( state );
+
+   //========================= Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+   
+   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+/*
+   for (i = 0; i < nBlocksInput; i++)
+   {
+       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+   }
+*/
+
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+
+   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
+                      nCols);
+
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+
+      //updates the value of row* (deterministically picked during Setup)
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+       //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+       step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+       do
+       {
+           //Selects a pseudorandom index row*
+           //-----------------------------------------------
+           rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+
+           //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //-------------------------------------------
+
+           //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+           reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+           //update prev: it now points to the last row ever computed
+           prev = row;
+
+           //updates row: goes to the next row to be computed
+           //----------------------------------------------------
+           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //----------------------------------------------------
+
+       } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+   //Squeezes the key
+   squeeze(state, K, (unsigned int) kLen);
+
+   return 0;
+}
+
+/////////////////////////////////////////////////
+
+// 2 way 256: Lyra2REv3 with a separate row* (rowa0/rowa1) per lane when Wandering.
+// drop salt, salt len arguments, hard code some others (still parameters for now).
+// Data is interleaved 2x256. wholeMatrix is caller-supplied; always returns 0.
+
+int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
+      const void *pwd, const uint64_t pwdlen, const void *salt,
+      const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
+      const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[16];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+//   int64_t i; //auxiliary iteration counter
+   int64_t v64; // 64bit var for memcpy
+   uint64_t instance0 = 0; // Separate instance for each lane
+   uint64_t instance1 = 0;
+   //====================================================================/
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
+
+   uint64_t *ptrWord = wholeMatrix;
+
+//  2 way 256 rewrite. Salt always == password, and data is interleaved,
+//  need to build in parallel:
+//  {   password,    (64 or 80 bytes)
+//      salt,        (64 or 80 bytes) =  same as password
+//      Klen,        (u64)  = 32 bytes
+//      pwdlen,      (u64)
+//      saltlen,     (u64)
+//      timecost,    (u64)
+//      nrows,       (u64)
+//      ncols,       (u64)
+//      0x80,        (byte)
+//      { 0 .. 0 },
+//      1            (byte)
+//   }
+   
+//   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
+   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   byte *ptrByte = (byte*) wholeMatrix;
+
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;
+
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;
+   //Zeroes the rest of the input blocks, from the end of the salt onwards
+   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+                       - (saltlen + pwdlen) );
+
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = pwdlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = saltlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = timeCost;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nRows;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nCols;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the basil
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+// from here on it's all simd access to state and matrix
+// define vector pointers and adjust sizes and pointer offsets
+
+   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+   reducedSqueezeRow0( state, &wholeMatrix[0], nCols );
+
+   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
+                      nCols);
+   // Setup phase: fills rows 2..nRows-1, revisiting earlier rows via rowa
+   do
+   {
+
+      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+
+      rowa = (rowa + step) & (window - 1);
+
+      prev = row;
+      row++;
+
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+   // Wandering phase: timeCost passes, each ending when row wraps back to 0
+   row = 0;
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+      step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
+      do
+      {
+        // This part is not parallel, rowa will be different for each lane.
+        // state (u64[16]) is interleaved 2x256, need to extract separately.
+        // NOTE(review): the shifts below do not obviously implement this
+        // formula, confirm: index = 2 * instance / 4 * 4 + instance % 4
+        uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 )
+                           + ( instance0 & 0x3 );
+        uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 )
+                           + ( instance1 & 0x3 );
+        // NOTE(review): (state+4)[i] with i up to 0xf reads beyond state[16].
+        instance0 = state[ index0 ] & 0xf;
+        instance1 = (state+4)[ index1 ] & 0xf;
+        // Mask with nRows-1 (power-of-2 nRows) like the scalar code below.
+        const uint64_t rowa0 = state[ instance0 ] & (unsigned int)(nRows-1);
+        const uint64_t rowa1 = (state+4)[ instance1 ] & (unsigned int)(nRows-1);
+
+        reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                                      &wholeMatrix[rowa0*ROW_LEN_INT64],
+                                      &wholeMatrix[rowa1*ROW_LEN_INT64],
+                                      &wholeMatrix[row*ROW_LEN_INT64], nCols );
+/*
+           instance = state[instance & 0xF];
+           rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
+
+           reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+*/
+        // End of divergence.
+
+        prev = row;
+        row = (row + step) & (unsigned int)(nRows-1); 
+
+       } while ( row != 0 );
+   }
+   // NOTE(review): rowa still holds its Setup-phase value here; confirm intent.
+   absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] );
+   squeeze( state, K, (unsigned int) kLen );
+
+   return 0;
+}
+
+
+//////////////////////////////////////////////////
+// Lyra2Z: picks rows with "% nRows"; the memory matrix is caller-supplied. Returns 0.
+int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
+            const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+            const uint64_t timeCost, const uint64_t nRows,
+            const uint64_t nCols )
+{
+    //========================== Basic variables ============================//
+    uint64_t _ALIGN(256) state[16];
+    int64_t row = 2; //index of row to be processed
+    int64_t prev = 1; //index of prev (last row ever computed/modified)
+    int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+    int64_t tau; //Time Loop iterator
+    int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+    int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+//    int64_t i; //auxiliary iteration counter
+    //=======================================================================/
+
+    //======= Initializing the Memory Matrix and pointers to it =============//
+    //The memory matrix is the caller's wholeMatrix buffer; no allocation happens here
+
+    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+//    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+//    memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
+    //==== Getting the password + salt + basil padded with 10*1 ============//
+    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+    //but this ensures that the password copied locally will be overwritten as soon as possible
+
+    //First, we clean enough blocks for the password, salt, basil and padding
+    uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
+                       sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+    byte *ptrByte = (byte*) wholeMatrix;
+    memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
+
+    //Prepends the password
+    memcpy(ptrByte, pwd, pwdlen);
+    ptrByte += pwdlen;
+
+    //Concatenates the salt
+    memcpy(ptrByte, salt, saltlen);
+    ptrByte += saltlen;
+    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+    memcpy(ptrByte, &kLen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &saltlen, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &timeCost, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nRows, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+    memcpy(ptrByte, &nCols, sizeof (uint64_t));
+    ptrByte += sizeof (uint64_t);
+
+    //Now comes the padding
+    *ptrByte = 0x80; //first byte of padding: right after the basil
+    ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+    ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+    *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+    //=================== Initializing the Sponge State ====================//
+    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+//        uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
+//        if (state == NULL) {
+//                return -1;
+//        }
+//    initState( state );
+
+    //============================== Setup Phase =============================//
+    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+    uint64_t *ptrWord = wholeMatrix;
+
+    absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
+                           BLOCK_LEN_BLAKE2_SAFE_INT64 );
+/*
+    for ( i = 0; i < nBlocksInput; i++ )
+    {
+      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+      ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
+    }
+*/
+    //Initializes M[0] and M[1]
+        reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
+        reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
+
+        do {
+                //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+                reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
+
+                //updates the value of row* (deterministically picked during Setup)
+                rowa = (rowa + step) & (window - 1);
+                //update prev: it now points to the last row ever computed
+                prev = row;
+                //updates row: goes to the next row to be computed
+                row++;
+
+                //Checks if all rows in the window were visited.
+                if (rowa == 0) {
+                        step = window + gap; //changes the step: approximately doubles its value
+                        window *= 2; //doubles the size of the re-visitation window
+                        gap = -gap; //inverts the modifier to the step
+                }
+
+        } while (row < nRows);
+
+    //======================== Wandering Phase =============================//
+    row = 0; //Resets the visitation to the first row of the memory matrix
+    for ( tau = 1; tau <= timeCost; tau++ )
+    {
+        //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+        step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+        do {
+        //Selects a pseudorandom index row*
+        //----------------------------------------------------------------------
+        //rowa = ((unsigned int)state[0]) & (nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+        rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+        //-----------------------------------------------------------------
+
+        //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+                reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
+
+        //update prev: it now points to the last row ever computed
+        prev = row;
+
+        //updates row: goes to the next row to be computed
+        //---------------------------------------------------------------
+        //row = (row + step) & (nRows-1);       //(USE THIS IF nRows IS A POWER OF 2)
+        row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+        //--------------------------------------------------------------------
+
+      } while (row != 0);
+    }
+
+    //========================= Wrap-up Phase ===============================//
+    //Absorbs the last block of the memory matrix
+    absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+
+    //Squeezes the key
+    squeeze( state, K, kLen );
+
+    return 0;
+}
+
+// Lyra2RE doesn't like the new wholeMatrix implementation: it allocates, zeroes and frees its own matrix. Returns -1 if the allocation fails, else 0.
+int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
+             const void *salt, const uint64_t saltlen, const uint64_t timeCost,
+             const uint64_t nRows, const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[16];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t i; //holds the matrix size in bytes (the loop that used it as a counter is commented out)
+   int64_t v64; // 64bit var for memcpy
+   //====================================================================/
+
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //Tries to allocate enough space for the whole memory matrix
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+
+   i = (int64_t)ROW_LEN_BYTES * nRows;
+   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
+   if (wholeMatrix == NULL)
+      return -1;
+
+#if defined(__AVX2__)
+   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
+#elif defined(__SSE2__)
+   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );   
+#else
+   memset( wholeMatrix, 0, i );
+#endif
+
+   uint64_t *ptrWord = wholeMatrix;
+
+   //=== Getting the password + salt + basil padded with 10*1 ==========//
+   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+   //but this ensures that the password copied locally will be overwritten as soon as possible
+
+   //Number of blocks for the password, salt, basil and padding (the matrix was zeroed above)
+   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   byte *ptrByte = (byte*) wholeMatrix;
+
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;
+
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;
+
+//   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+//                       - (saltlen + pwdlen) );
+
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = pwdlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = saltlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = timeCost;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nRows;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nCols;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the basil
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+   //================= Initializing the Sponge State ====================//
+   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+
+//   initState( state );
+
+   //========================= Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+
+   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+/*
+   for (i = 0; i < nBlocksInput; i++)
+   {
+       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+   }
+*/
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+
+   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
+                      nCols);
+
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+
+      //updates the value of row* (deterministically picked during Setup)
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+       //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+       step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+       do
+       {
+           //Selects a pseudorandom index row*
+           //-----------------------------------------------
+           rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+
+           //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //-------------------------------------------
+
+           //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+           reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+           //update prev: it now points to the last row ever computed
+           prev = row;
+
+           //updates row: goes to the next row to be computed
+           //----------------------------------------------------
+           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //----------------------------------------------------
+
+       } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+   //Squeezes the key
+   squeeze(state, K, (unsigned int) kLen);
+
+   //================== Freeing the memory =============================//
+   _mm_free(wholeMatrix);
+
+   return 0;
+}
+
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -0,0 +1,319 @@
+/**
+ * A simple implementation of Blake2b's internal permutation
+ * in the form of a sponge.
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "algo-gate.h"
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+#include <immintrin.h>
+#include "sponge.h"
+#include "lyra2.h"
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// Copies sponge state vectors to Out (len / 32 gives the 256-bit vector
+// count), applying one reduced round to the state after each full block.
+inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
+{
+    const int len_m256i = len / 32;
+    const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
+    __m512i* state = (__m512i*)State;
+    __m512i* out   = (__m512i*)Out;
+    //Squeezes full blocks
+    for ( int i = 0; i < fullBlocks; i++ )
+    {
+       memcpy_512( out, state, BLOCK_LEN_M256I*2 ); // NOTE(review): confirm "*2" matches memcpy_512 units
+       LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] ); // was the undefined ..._AVX2 macro
+       out += BLOCK_LEN_M256I*2;
+    }
+    //Squeezes remaining bytes
+    memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
+}
+
+// Absorbs one block: XORs the first three vectors of In into the sponge
+// state, applies LYRA_12_ROUNDS_2WAY_AVX512 and writes the state back.
+inline void absorbBlock_2way( uint64_t *State, const uint64_t *In )
+{
+    __m512i *sponge = (__m512i*)State;
+    __m512i *block  = (__m512i*)In;
+    register __m512i s0, s1, s2, s3;
+
+    // Only three of the four state vectors take input; the fourth is loaded as is.
+    s0 = _mm512_xor_si512( _mm512_load_si512( sponge     ), block[0] );
+    s1 = _mm512_xor_si512( _mm512_load_si512( sponge + 1 ), block[1] );
+    s2 = _mm512_xor_si512( _mm512_load_si512( sponge + 2 ), block[2] );
+    s3 = _mm512_load_si512( sponge + 3 );
+
+    LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 );
+
+    // Write all four vectors back to the caller's state.
+    _mm512_store_si512( sponge,     s0 );
+    _mm512_store_si512( sponge + 1, s1 );
+    _mm512_store_si512( sponge + 2, s2 );
+    _mm512_store_si512( sponge + 3, s3 );
+}
+
+// Starts from a fresh state (two zero vectors + Blake2b IV words), then for
+// each of nBlocks blocks XORs in[0..1] and applies 12 rounds; stores State.
+inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
+                      const uint64_t nBlocks, const uint64_t block_len )
+{
+  register __m512i state0, state1, state2, state3;
+
+  state0 = m512_zero;
+  state1 = m512_zero;
+  state2 = m512_const4_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
+                           0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state3 = m512_const4_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
+                           0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+
+  for ( uint64_t i = 0; i < nBlocks; i++ )  // unsigned counter matches nBlocks
+  {
+    __m512i *in = (__m512i*)In;
+    state0 = _mm512_xor_si512( state0, in[0] );
+    state1 = _mm512_xor_si512( state1, in[1] );
+    LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
+    In += block_len * 2;   // In is a uint64_t pointer; step is doubled for the 2-way data
+  }
+
+  _mm512_store_si512( (__m512i*)State,     state0 );
+  _mm512_store_si512( (__m512i*)State + 1, state1 );
+  _mm512_store_si512( (__m512i*)State + 2, state2 );
+  _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
+                                     uint64_t nCols )
+{
+    int i;
+
+    //M[row][C-1-col] = H.reduced_squeeze()
+    // Fills rowOut from its last column down to column 0: each iteration
+    // stores three state vectors, then applies one reduced round to the state.
+    register __m512i state0, state1, state2, state3;
+    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 ); // start at the last column
+
+    state0 = _mm512_load_si512( (__m512i*)State     );
+    state1 = _mm512_load_si512( (__m512i*)State + 1 );
+    state2 = _mm512_load_si512( (__m512i*)State + 2 );
+    state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+    for ( i = 0; i < 9; i += 3) // warm-up prefetch of out-0,-2,-3,-5,-6,-8
+    {
+        _mm_prefetch( out - i,     _MM_HINT_T0 );
+        _mm_prefetch( out - i - 2, _MM_HINT_T0 );
+    }
+
+    for ( i = 0; i < nCols; i++ )
+    {
+       _mm_prefetch( out -  9, _MM_HINT_T0 ); // prefetch below the descending write pointer
+       _mm_prefetch( out - 11, _MM_HINT_T0 );
+                   
+       out[0] = state0; // only state0..state2 are written; state3 stays internal
+       out[1] = state1;
+       out[2] = state2;
+
+       //Goes to next block (column) that will receive the squeezed data
+       out -= BLOCK_LEN_M256I * 2;
+
+       LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+    }
+
+    _mm512_store_si512( (__m512i*)State,     state0 ); // persist the updated sponge state
+    _mm512_store_si512( (__m512i*)State + 1, state1 );
+    _mm512_store_si512( (__m512i*)State + 2, state2 );
+    _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+// Duplexes rowIn into rowOut: per column, absorb in[0..2], apply one reduced
+// round, and store rand XOR in to rowOut in reverse column order.
+// NOTE(review): the original note said this function gathers two 256-bit
+// rowIn vectors from non-contiguous memory (extra work, slower); this body
+// reads a single contiguous rowIn as the signature implies - confirm.
+inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
+                 uint64_t *rowOut, uint64_t nCols )
+{
+    register __m512i state0, state1, state2, state3;
+    __m512i *in  = (__m512i*)rowIn;
+    __m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
+
+    state0 = _mm512_load_si512( (__m512i*)State     );
+    state1 = _mm512_load_si512( (__m512i*)State + 1 );
+    state2 = _mm512_load_si512( (__m512i*)State + 2 );
+    state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+    for ( uint64_t i = 0; i < nCols; i++ )
+    {
+         state0 = _mm512_xor_si512( state0, in[0] );
+         state1 = _mm512_xor_si512( state1, in[1] );
+         state2 = _mm512_xor_si512( state2, in[2] );
+
+         LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+         out[0] = _mm512_xor_si512( state0, in[0] );
+         out[1] = _mm512_xor_si512( state1, in[1] );
+         out[2] = _mm512_xor_si512( state2, in[2] );
+
+         //Input: next column (i.e., next block in sequence)
+         in  += BLOCK_LEN_M256I * 2;  // same stride as out; NOTE(review): confirm row layout
+         //Output: goes to previous column
+         out -= BLOCK_LEN_M256I * 2;
+    }
+
+    _mm512_store_si512( (__m512i*)State,     state0 );
+    _mm512_store_si512( (__m512i*)State + 1, state1 );
+    _mm512_store_si512( (__m512i*)State + 2, state2 );
+    _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+// Setup-phase duplex: absorbs in[col] + inout[col], writes rand XOR in to
+// rowOut (reverse column order) and XORs rotW(rand) into rowInOut.
+inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
+                       uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
+{
+    register __m512i state0, state1, state2, state3;
+    __m512i* in    = (__m512i*)rowIn;
+    __m512i* inout = (__m512i*)rowInOut;
+    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
+    __m512i  t0, t1, t2;
+
+    state0 = _mm512_load_si512( (__m512i*)State     );
+    state1 = _mm512_load_si512( (__m512i*)State + 1 );
+    state2 = _mm512_load_si512( (__m512i*)State + 2 );
+    state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+    for ( uint64_t i = 0; i < nCols; i++ )
+    {
+       state0 = _mm512_xor_si512( state0,
+                                  _mm512_add_epi64( in[0], inout[0] ) );
+       state1 = _mm512_xor_si512( state1,
+                                  _mm512_add_epi64( in[1], inout[1] ) );
+       state2 = _mm512_xor_si512( state2,
+                                  _mm512_add_epi64( in[2], inout[2] ) );
+
+       LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+       out[0] = _mm512_xor_si512( state0, in[0] );
+       out[1] = _mm512_xor_si512( state1, in[1] );
+       out[2] = _mm512_xor_si512( state2, in[2] );
+
+       //M[row*][col] = M[row*][col] XOR rotW(rand)
+       t0 = _mm512_permutex_epi64( state0, 0x93 );
+       t1 = _mm512_permutex_epi64( state1, 0x93 );
+       t2 = _mm512_permutex_epi64( state2, 0x93 );
+       // Mask first; 0x0303 takes the low 64 bits of each 256-bit half from the 2nd source.
+       inout[0] = _mm512_xor_si512( inout[0],
+                                 _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
+       inout[1] = _mm512_xor_si512( inout[1],
+                                 _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
+       inout[2] = _mm512_xor_si512( inout[2],
+                                 _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
+
+       //Inputs: next column (i.e., next block in sequence)
+       in    += BLOCK_LEN_M256I * 2;
+       inout += BLOCK_LEN_M256I * 2;
+       //Output: goes to previous column
+       out   -= BLOCK_LEN_M256I * 2;
+    }
+
+    _mm512_store_si512( (__m512i*)State,     state0 );
+    _mm512_store_si512( (__m512i*)State + 1, state1 );
+    _mm512_store_si512( (__m512i*)State + 2, state2 );
+    _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+// Wandering-phase duplex: per column, absorbs concat(in1, in0) + inout, applies
+// one reduced round, then XORs rand into rowOut and rotW(rand) into rowInOut.
+// rowIn0/rowIn1 supply the two 256-bit halves of each absorbed vector.
+inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
+                uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
+                uint64_t nCols )
+{
+   register __m512i state0, state1, state2, state3;
+   __m256i *in0   = (__m256i*)rowIn0;
+   __m256i *in1   = (__m256i*)rowIn1;
+   __m512i *inout = (__m512i*)rowInOut;
+   __m512i *out   = (__m512i*)rowOut;
+   __m512i  t0, t1, t2;
+
+   _mm_prefetch( in0,     _MM_HINT_T0 );
+   _mm_prefetch( in1,     _MM_HINT_T0 );
+   _mm_prefetch( in0 + 2, _MM_HINT_T0 );
+   _mm_prefetch( in1 + 2, _MM_HINT_T0 );
+   _mm_prefetch( in0 + 4, _MM_HINT_T0 );
+   _mm_prefetch( in1 + 4, _MM_HINT_T0 );
+   _mm_prefetch( in0 + 6, _MM_HINT_T0 );
+   _mm_prefetch( in1 + 6, _MM_HINT_T0 );
+
+   state0 = _mm512_load_si512( (__m512i*)State     );
+   state1 = _mm512_load_si512( (__m512i*)State + 1 );
+   state2 = _mm512_load_si512( (__m512i*)State + 2 );
+   state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+   for ( uint64_t i = 0; i < nCols; i++ )
+   {
+      //Absorbing "M[prev] [+] M[row*]"
+      t0 = mm512_concat_256( in1[0], in0[0] );
+      t1 = mm512_concat_256( in1[1], in0[1] );
+      t2 = mm512_concat_256( in1[2], in0[2] );
+
+      state0 = _mm512_xor_si512( state0,
+                                     _mm512_add_epi64( t0, inout[0] ) );
+      state1 = _mm512_xor_si512( state1,
+                                     _mm512_add_epi64( t1, inout[1] ) );
+      state2 = _mm512_xor_si512( state2,
+                                     _mm512_add_epi64( t2, inout[2] ) );
+
+      //Applies the reduced-round transformation f to the sponge's state
+      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+      //M[rowOut][col] = M[rowOut][col] XOR rand
+      out[0] = _mm512_xor_si512( out[0], state0 );
+      out[1] = _mm512_xor_si512( out[1], state1 );
+      out[2] = _mm512_xor_si512( out[2], state2 );
+
+      //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+      t0 = _mm512_permutex_epi64( state0, 0x93 );
+      t1 = _mm512_permutex_epi64( state1, 0x93 );
+      t2 = _mm512_permutex_epi64( state2, 0x93 );
+      // Mask first; 0x0303 takes the low 64 bits of each 256-bit half from the 2nd source.
+      inout[0] = _mm512_xor_si512( inout[0],
+                                   _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
+      inout[1] = _mm512_xor_si512( inout[1],
+                                   _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
+      inout[2] = _mm512_xor_si512( inout[2],
+                                   _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
+
+      //Goes to next block
+      // NOTE(review): in0/in1 are indexed [0..2] per column, so they step one block; confirm layout.
+      in0   += BLOCK_LEN_M256I;
+      in1   += BLOCK_LEN_M256I;
+      out   += BLOCK_LEN_M256I * 2;
+      inout += BLOCK_LEN_M256I * 2;
+   }
+
+   _mm512_store_si512( (__m512i*)State,     state0 );
+   _mm512_store_si512( (__m512i*)State + 1, state1 );
+   _mm512_store_si512( (__m512i*)State + 2, state2 );
+   _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+#endif // AVX512
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -52,8 +52,46 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // However, 2 way parallel looks trivial to code for AVX512 except for
 // a data dependency with rowa.

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define G2W_4X64(a,b,c,d) /* Blake2b-style G step on 64-bit elements of __m512i a,b,c,d; all four are updated in place */ \
+   a = _mm512_add_epi64( a, b ); \
+   d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); /* mm512_ror_64 is a project helper; presumably rotates each 64-bit element right - confirm */ \
+   c = _mm512_add_epi64( c, d ); \
+   b = mm512_ror_64( _mm512_xor_si512( b, c ), 24 ); \
+   a = _mm512_add_epi64( a, b ); \
+   d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
+   c = _mm512_add_epi64( c, d ); \
+   b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 ); /* expansion ends with ';' - arguments are evaluated several times */
+
+#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) /* one reduced round on four __m512i state vectors, updated in place */ \
+   G2W_4X64( s0, s1, s2, s3 ); /* 512-bit G; the 256-bit G_4X64 does not fit __m512i operands */ \
+   s1 = mm512_ror1x64_256( s1 ); /* per-256-bit-lane helper, matching the inverse rol below */ \
+   s2 = mm512_swap128_256( s2 ); \
+   s3 = mm512_rol1x64_256( s3 ); \
+   G2W_4X64( s0, s1, s2, s3 ); \
+   s1 = mm512_rol1x64_256( s1 ); /* undo the three lane rotations applied before the second G */ \
+   s2 = mm512_swap128_256( s2 ); \
+   s3 = mm512_ror1x64_256( s3 );
+
+#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) /* twelve consecutive reduced rounds; relies on each round expansion ending in ';' */ \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 )
+
+
+#endif  // AVX512
+
 #if defined __AVX2__
-// only available with avx2

 // process 4 columns in parallel
 // returns void, updates all args
@@ -89,9 +127,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
-   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   LYRA_ROUND_AVX2( s0, s1, s2, s3 )

-#elif defined(__SSE2__)
+#endif
+
+#if defined(__SSE2__)

 // process 2 columns in parallel
 // returns void, all args updated
@@ -129,7 +169,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
-   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7)


 #endif // AVX2 else SSE2
@@ -161,6 +201,30 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    G(r,7,v[ 3],v[ 4],v[ 9],v[14]);


+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+//---- Housekeeping (2-way AVX512 counterparts of the sponge API declared below)
+void initState_2way( uint64_t state[/*16*/] ); // NOTE(review): no definition appears in the sponge-2way.c added by this commit - confirm
+
+//---- Squeezes (copy state vectors out; defined in sponge-2way.c)
+void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len );
+void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );
+
+//---- Absorbs (XOR input into the state, then 12 rounds)
+void absorbBlock_2way( uint64_t *state, const uint64_t *in );
+void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in,
+                            const uint64_t nBlocks, const uint64_t block_len );
+
+//---- Duplexes (absorb a row, one reduced round per column, write a row)
+void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
+                             uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
+                    uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
+void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); // note: rowIn1 comes before rowIn0
+
+#endif
+
+
 //---- Housekeeping
 void initState(uint64_t state[/*16*/]);

@@ -178,20 +242,4 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint6
 void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
 void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);

-//---- Misc
-void printArray(unsigned char *array, unsigned int size, char *name);
-
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-////TESTS////
-//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
-//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
-/////////////
-
-
 #endif /* SPONGE_H_ */
--- a/algo/quark/hmq1725-gate.h
+++ b/algo/quark/hmq1725-gate.h
@@ -5,7 +5,7 @@
 #include <stdint.h>

 #if defined(__AVX2__) && defined(__AES__)
-//  #define HMQ1725_4WAY
+//  #define HMQ1725_4WAY 1
 #endif

 bool register_hmq1725_algo( algo_gate_t* gate );
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -4,7 +4,8 @@
 #include <string.h>
 #include <stdio.h>
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/cubehash_sse2.h" 
+#include "algo/cubehash/cube-hash-2way.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -13,73 +14,70 @@

 typedef struct
 {
-        luffa_4way_context      luffa;
-        cubehashParam           cube;
-        sph_shavite512_context  shavite;
-        simd_4way_context       simd;
-        hashState_echo          echo;
+    luffa_4way_context      luffa;
+    cube_4way_context       cube;
+    sph_shavite512_context  shavite;
+    simd_4way_context       simd;
+    simd_2way_context       simd2;
+    hashState_echo          echo;
 } qubit_4way_ctx_holder;

 qubit_4way_ctx_holder qubit_4way_ctx;

 void init_qubit_4way_ctx()
 {
-        cubehashInit(&qubit_4way_ctx.cube,512,16,32);
-        sph_shavite512_init(&qubit_4way_ctx.shavite);
-        simd_4way_init( &qubit_4way_ctx.simd, 512 );
-        init_echo(&qubit_4way_ctx.echo, 512);
+    cube_4way_init( &qubit_4way_ctx.cube, 512, 16, 32 );
+    sph_shavite512_init(&qubit_4way_ctx.shavite);
+    simd_4way_init( &qubit_4way_ctx.simd, 512 );
+    simd_2way_init( &qubit_4way_ctx.simd2, 512 );
+    init_echo(&qubit_4way_ctx.echo, 512);
 };

 void qubit_4way_hash( void *output, const void *input )
 {
-     uint64_t vhash[8*4] __attribute__ ((aligned (128)));
-     uint64_t hash0[8] __attribute__ ((aligned (64)));
-     uint64_t hash1[8] __attribute__ ((aligned (64)));
-     uint64_t hash2[8] __attribute__ ((aligned (64)));
-     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint32_t vhash[16*4] __attribute__ ((aligned (128)));
+     uint32_t hash0[16] __attribute__ ((aligned (64)));
+     uint32_t hash1[16] __attribute__ ((aligned (64)));
+     uint32_t hash2[16] __attribute__ ((aligned (64)));
+     uint32_t hash3[16] __attribute__ ((aligned (64)));
     qubit_4way_ctx_holder ctx;

     memcpy( &ctx, &qubit_4way_ctx, sizeof(qubit_4way_ctx) );
+
     luffa_4way_update( &ctx.luffa, input + (64<<2), 16 );
     luffa_4way_close( &ctx.luffa, vhash );
-     dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
-
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
-     memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
-     memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
-     memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
-
+     
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
-     memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
+     memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
-     memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
+     memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
-     memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
+     memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );

-     intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
-     dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );

     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
-     memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
+     memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
-     memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
+     memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
-     memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
+     memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

@@ -92,71 +90,40 @@ void qubit_4way_hash( void *output, const void *input )
 int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*16] __attribute__ ((aligned (128)));
-     uint32_t vdata[4*24] __attribute__ ((aligned (64)));
+     uint32_t hash[4*8] __attribute__ ((aligned (128)));
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *noncep = vdata + 64+3;   // 4*16 + 3
-     int thr_id = mythr->id;  
+     int thr_id = mythr->id;
     const uint32_t Htarg = ptarget[7];
-     uint64_t htmax[] = {          0,        0xF,       0xFF,
-                               0xFFF,     0xFFFF, 0x10000000  };
-     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
-                          0xFFFFF000, 0xFFFF0000,          0  };
-
-     casti_m512i( endiandata, 0 ) = mm512_bswap_32( casti_m512i( pdata, 0 ) );
-     casti_m512i( endiandata, 1 ) = mm512_bswap_32( casti_m512i( pdata, 1 ) );
-     casti_m512i( endiandata, 4 ) = mm512_bswap_32( casti_m512i( pdata, 4 ) );
-
-     uint64_t *edata = (uint64_t*)endiandata;
-     intrlv_4x128( (uint64_t*)vdata, edata, edata, 640 );

+     mm512_bswap32_intrlv80_4x128( vdata, pdata );
     luffa_4way_init( &qubit_4way_ctx.luffa, 512 );
     luffa_4way_update( &qubit_4way_ctx.luffa, vdata, 64 );

-     for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+     do
     {
-        uint32_t mask = masks[m];
-        do
-        {
-            be32enc( noncep,   n   );
-            be32enc( noncep+4, n+1 );
-            be32enc( noncep+8, n+2 );
-            be32enc( noncep+12, n+3 );
-            qubit_4way_hash( hash, vdata );
-            pdata[19] = n;
+        be32enc( noncep,    n   );
+        be32enc( noncep+ 4, n+1 );
+        be32enc( noncep+ 8, n+2 );
+        be32enc( noncep+12, n+3 );

-            if ( !( hash[7] & mask ) )
-            if ( fulltest( hash, ptarget) && !opt_benchmark )
-            {
-                pdata[19] = n;
-                submit_lane_solution( work, hash, mythr, 0 );
-            }
-            if ( !( (hash+8)[7] & mask ) )
-            if ( fulltest( hash+8, ptarget) && !opt_benchmark )
-            {
-               pdata[19] = n+1;
-               submit_lane_solution( work, hash+8, mythr, 1 );
-            }
-            if ( !( hash+16[7] & mask ) )
-            if ( fulltest( hash, ptarget) && !opt_benchmark )
-            {
-                pdata[19] = n+2;
-                submit_lane_solution( work, hash, mythr, 2 );
-            }
-            if ( !( (hash+24)[7] & mask ) )
-            if ( fulltest( hash+8, ptarget) && !opt_benchmark )
-            {
-               pdata[19] = n+3;
-               submit_lane_solution( work, hash+8, mythr, 3 );
-            }
-            n += 4;
-         } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
-         break;
-     }
+        qubit_4way_hash( hash, vdata );
+        pdata[19] = n;
+
+        for ( int lane = 0; lane < 4; lane++ )
+        if ( ( hash+(lane<<3) )[7] < Htarg )
+        if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
+        {
+           pdata[19] = n + lane;
+           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
+        }
+        n += 4;
+     } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
     *hashes_done = n - first_nonce;
     return 0;
 }
--- a/algo/qubit/qubit-gate.c
+++ b/algo/qubit/qubit-gate.c
@@ -2,14 +2,12 @@

 bool register_qubit_algo( algo_gate_t* gate )
 {
-/*   
+   
 #if defined (QUBIT_4WAY)
-  init_qubit_2way_ctx();
+  init_qubit_4way_ctx();
  gate->scanhash  = (void*)&scanhash_qubit_4way;
  gate->hash      = (void*)&qubit_4way_hash;
-#elif defined (QUBIT_4WAY)
-*/
-#if defined (QUBIT_2WAY)
+#elif defined (QUBIT_2WAY)
  init_qubit_2way_ctx();
  gate->scanhash  = (void*)&scanhash_qubit_2way;
  gate->hash      = (void*)&qubit_2way_hash;
@@ -18,7 +16,7 @@ bool register_qubit_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_qubit;
  gate->hash      = (void*)&qubit_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/qubit/qubit-gate.h
+++ b/algo/qubit/qubit-gate.h
@@ -4,17 +4,15 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-/*
+
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define QUBIT_2WAY 1
+  #define QUBIT_4WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
-*/
-#if defined(__AVX2__) && defined(__AES__)
  #define QUBIT_2WAY 1
 #endif

 bool register_qubit_algo( algo_gate_t* gate );
-/*
+
 #if defined(QUBIT_4WAY)

 void qubit_4way_hash( void *state, const void *input );
@@ -23,8 +21,6 @@ int scanhash_qubit_4way( struct work *work, uint32_t max_nonce,
 void init_qubit_4way_ctx();

 #elif defined(QUBIT_2WAY)
-*/
-#if defined(QUBIT_2WAY)

 void qubit_2way_hash( void *state, const void *input );
 int scanhash_qubit_2way( struct work *work, uint32_t max_nonce,
--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -735,7 +735,7 @@ do { \
  fft128_4way( a+512 );
 }

-#define c1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
+#define c1_16_512( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}

 void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
 {
@@ -744,8 +744,12 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
  __m512i *S = (__m512i*) state;
  __m512i *M = (__m512i*) msg;
  __m512i *W = (__m512i*) fft;
-  static const m512_v16 code[] = { c1_16(185), c1_16(233),
-                                   c1_16(185), c1_16(233) };
+
+  static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) };
+
+
+//  static const m512_v16 code[] = { c1_16(185), c1_16(233),
+//                                   c1_16(185), c1_16(233) };


  S0l = _mm512_xor_si512( S[0], M[0] );
@@ -999,7 +1003,9 @@ void SIMD_4way_Compress( simd_4way_context *state, const void *m, int final )
 {
   m512_v16 Y[32];
   uint16_t *y = (uint16_t*) Y[0].u16;
+
   fft256_4way_msg( y, m, final );
+
   rounds512_4way( state->A, m, y );
 }

@@ -1340,7 +1346,8 @@ do { \
  DO_REDUCE_FULL_S( 6 );
  DO_REDUCE_FULL_S( 7 );

-#undef BUTTERFLY
+#undef BUTTERFLY_0
+#undef BUTTERFLY_N
 #undef DO_REDUCE

  A[0] = X0;
@@ -1491,6 +1498,7 @@ do { \

  fft128_2way( a );
  fft128_2way( a+256 );
+
 }

 #define c1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
@@ -1751,7 +1759,9 @@ void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final )
 {
   m256_v16 Y[32];
   uint16_t *y = (uint16_t*) Y[0].u16;
+
   fft256_2way_msg( y, m, final );
+
   rounds512_2way( state->A, m, y );
 }

@@ -1864,6 +1874,7 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval,
    {
      // We can hash the data directly from the input buffer.
      SIMD_2way_Compress( state, data, 0 );
+
      databitlen -= bs;
      data += 2*( bs/8 );
      state->count += bs;
@@ -1874,7 +1885,8 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval,
      int len = bs - current;
      if ( databitlen < len )
      {
-        memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) );
+
+         memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) );
        state->count += databitlen;
        break;
      }
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -1,11 +1,7 @@
 #include "cpuminer-config.h"
 #include "c11-gate.h"
-
-#if defined (C11_4WAY)
-
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -13,11 +9,237 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined (C11_8WAY)
+
+typedef struct {                        // every context used by c11_8way_hash, grouped so one memcpy clones them all
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;    // used one lane at a time
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;      // 4-way: run twice per hash (lanes 0-3, then 4-7)
+    cube_4way_context       cube;       // 4-way, as luffa
+    sph_shavite512_context  shavite;    // used one lane at a time
+    simd_4way_context       simd;       // 4-way, as luffa
+    hashState_echo          echo;       // used one lane at a time
+} c11_8way_ctx_holder;
+
+c11_8way_ctx_holder c11_8way_ctx;       // template filled by init_c11_8way_ctx(); only read by c11_8way_hash()
+
+void init_c11_8way_ctx()                // fill the global c11_8way_ctx template with freshly initialised contexts
+{
+     blake512_8way_init( &c11_8way_ctx.blake );
+     bmw512_8way_init( &c11_8way_ctx.bmw );
+     init_groestl( &c11_8way_ctx.groestl, 64 );   // 64: presumably the digest length in bytes - TODO confirm
+     skein512_8way_init( &c11_8way_ctx.skein );
+     jh512_8way_init( &c11_8way_ctx.jh );
+     keccak512_8way_init( &c11_8way_ctx.keccak );
+     luffa_4way_init( &c11_8way_ctx.luffa, 512 );
+     cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 );   // same arguments are reused when c11_8way_hash re-inits cube
+     sph_shavite512_init( &c11_8way_ctx.shavite );
+     simd_4way_init( &c11_8way_ctx.simd, 512 );
+     init_echo( &c11_8way_ctx.echo, 512 );
+}
+
+void c11_8way_hash( void *state, const void *input )   // run the 11-stage C11 chain on 8 lanes at once
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));   // interleaved working buffer shared by the vector stages
+     uint64_t hash0[8] __attribute__ ((aligned (64)));      // hash0..hash7: one 64-byte hash per lane for the serial stages
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     c11_8way_ctx_holder ctx;
+     memcpy( &ctx, &c11_8way_ctx, sizeof(c11_8way_ctx) );   // clone the pre-initialised template instead of re-initialising
+
+     // 1 Blake 8way (input is expected already interleaved 8x64 - see scanhash_c11_8way)
+     blake512_8way_update( &ctx.blake, input, 80 );
+     blake512_8way_close( &ctx.blake, vhash );
+
+     // 2 Bmw 8way
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serial: split vhash into the eight per-lane buffers
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     // 3 Groestl, one lane at a time; the context is restored from the template between lanes
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // 8way: re-interleave the eight lanes into vhash
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     // 4 JH 8way
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     // 5 Keccak 8way
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     // 6 Skein 8way
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     // Serial: split vhash into the eight per-lane buffers again
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     // 7 Luffa + 8 Cube, four lanes per pass (4x128 interleave); contexts are re-initialised for lanes 4-7
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     luffa_4way_init( &ctx.luffa, 512 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     // 9 Shavite, one lane at a time; the context is restored from the template between lanes
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     // 10 Simd, four lanes per pass; the context is re-initialised for lanes 4-7
+     intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
+     intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 512 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 512 );
+
+     // 11 Echo, one lane at a time; the context is restored from the template between lanes
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     memcpy( state,     hash0, 32 );   // output: first 32 bytes of each lane, lane i at byte offset 32*i (256 bytes total)
+     memcpy( state+ 32, hash1, 32 );   // NOTE: arithmetic on void* relies on the GNU C extension (byte-sized steps)
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_c11_8way( struct work *work, uint32_t max_nonce,   // scan nonces 8 at a time, starting at work->data[19]
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));   // 8 lanes x 8 words written by c11_8way_hash
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));  // block header interleaved for 8 lanes
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     int thr_id = mythr->id;   
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; the vector inside vdata that carries the per-lane nonces
+     const uint32_t Htarg = ptarget[7];
+
+     max_nonce -= 8;   // leave room for a full batch of 8; NOTE(review): wraps around if max_nonce < 8
+
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     do                // each pass hashes nonces n..n+7, one per lane
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+        _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                          n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );   // nonces occupy the high 32 bits of each 64-bit lane
+
+        c11_8way_hash( hash, vdata );
+        pdata[19] = n;
+
+        for ( int i = 0; i < 8; i++ )   // lane i's result is the 8 words at hash + 8*i
+        if ( ( ( hash+(i<<3) )[7] < Htarg )
+             && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )   // cheap top-word test first, then the full comparison
+        {
+           pdata[19] = n+i;
+           submit_lane_solution( work, hash+(i<<3), mythr, i );
+        }
+        n += 8;
+     } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
+     return 0;         // always 0; solutions are reported through submit_lane_solution()
+}
+     
+#elif defined (C11_4WAY)
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
--- a/algo/x11/c11-gate.c
+++ b/algo/x11/c11-gate.c
@@ -2,7 +2,11 @@

 bool register_c11_algo( algo_gate_t* gate )
 {
-#if defined (C11_4WAY)
+#if defined (C11_8WAY)
+  init_c11_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_c11_8way;
+  gate->hash      = (void*)&c11_8way_hash;
+#elif defined (C11_4WAY)
  init_c11_4way_ctx();
  gate->scanhash  = (void*)&scanhash_c11_4way;
  gate->hash      = (void*)&c11_4way_hash;
@@ -11,7 +15,7 @@ bool register_c11_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_c11;
  gate->hash      = (void*)&c11_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x11/c11-gate.h
+++ b/algo/x11/c11-gate.h
@@ -4,29 +4,36 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define C11_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define C11_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define C11_4WAY 1
 #endif

+
 bool register_c11_algo( algo_gate_t* gate );
+#if defined(C11_8WAY)

-#if defined(C11_4WAY)
+void c11_8way_hash( void *state, const void *input );
+int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_c11_8way_ctx();
+
+#elif defined(C11_4WAY)

 void c11_4way_hash( void *state, const void *input );
-
 int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_c11_4way_ctx();

-#endif
+#else

 void c11_hash( void *state, const void *input );
-
 int scanhash_c11( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_c11_ctx();

 #endif

+#endif
+
--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -1,8 +1,5 @@
 #include "cpuminer-config.h"
 #include "x11-gate.h"
-
-#if defined (X11_4WAY)
-
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/blake-hash-4way.h"
@@ -12,11 +9,235 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined (X11_8WAY)
+
+typedef struct {                        // every context used by x11_8way_hash, grouped so one memcpy clones them all
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;    // used one lane at a time
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;      // 4-way: run twice per hash (lanes 0-3, then 4-7)
+    cube_4way_context       cube;       // 4-way, as luffa
+    sph_shavite512_context  shavite;    // used one lane at a time
+    simd_4way_context       simd;       // 4-way, as luffa
+    hashState_echo          echo;       // used one lane at a time
+} x11_8way_ctx_holder;
+
+x11_8way_ctx_holder x11_8way_ctx;       // template filled by init_x11_8way_ctx(); only read by x11_8way_hash()
+
+void init_x11_8way_ctx()                // fill the global x11_8way_ctx template with freshly initialised contexts
+{
+     blake512_8way_init( &x11_8way_ctx.blake );
+     bmw512_8way_init( &x11_8way_ctx.bmw );
+     init_groestl( &x11_8way_ctx.groestl, 64 );   // 64: presumably the digest length in bytes - TODO confirm
+     skein512_8way_init( &x11_8way_ctx.skein );
+     jh512_8way_init( &x11_8way_ctx.jh );
+     keccak512_8way_init( &x11_8way_ctx.keccak );
+     luffa_4way_init( &x11_8way_ctx.luffa, 512 );
+     cube_4way_init( &x11_8way_ctx.cube, 512, 16, 32 );   // same arguments are reused when x11_8way_hash re-inits cube
+     sph_shavite512_init( &x11_8way_ctx.shavite );
+     simd_4way_init( &x11_8way_ctx.simd, 512 );
+     init_echo( &x11_8way_ctx.echo, 512 );
+}
+
+void x11_8way_hash( void *state, const void *input )   // run the 11-stage X11 chain on 8 lanes at once
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));   // interleaved working buffer shared by the vector stages
+     uint64_t hash0[8] __attribute__ ((aligned (64)));      // hash0..hash7: one 64-byte hash per lane for the serial stages
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+
+     x11_8way_ctx_holder ctx;
+     memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) );   // clone the pre-initialised template instead of re-initialising
+     blake512_8way_update( &ctx.blake, input, 80 );         // 1 Blake 8way (input already interleaved - see scanhash_x11_8way)
+     blake512_8way_close( &ctx.blake, vhash );
+
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );             // 2 Bmw 8way
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serial: split vhash into the eight per-lane buffers
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );   // 3 Groestl, lane by lane
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );                   // restore the context from the template between lanes
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // 8way: re-interleave the eight lanes into vhash
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     skein512_8way_update( &ctx.skein, vhash, 64 );         // 4 Skein 8way
+     skein512_8way_close( &ctx.skein, vhash );
+
+     jh512_8way_update( &ctx.jh, vhash, 64 );               // 5 JH 8way
+     jh512_8way_close( &ctx.jh, vhash );
+
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );       // 6 Keccak 8way
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );                                 // back to per-lane buffers
+
+     // 7 Luffa + 8 Cube, four lanes per pass (4x128 interleave); contexts are re-initialised for lanes 4-7
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     luffa_4way_init( &ctx.luffa, 512 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     sph_shavite512( &ctx.shavite, hash0, 64 );             // 9 Shavite, lane by lane
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );              // restore the context from the template between lanes
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );   // 10 Simd, four lanes per pass
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     simd_4way_init( &ctx.simd, 512 );                        // fresh context for lanes 4-7
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );    // 11 Echo, lane by lane
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     memcpy( state,     hash0, 32 );   // output: first 32 bytes of each lane, lane i at byte offset 32*i (256 bytes total)
+     memcpy( state+ 32, hash1, 32 );   // NOTE: arithmetic on void* relies on the GNU C extension (byte-sized steps)
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_x11_8way( struct work *work, uint32_t max_nonce,   // scan nonces 8 at a time, starting at work->data[19]
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));   // 8 lanes x 8 words written by x11_8way_hash
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));  // block header interleaved for 8 lanes
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     int thr_id = mythr->id;
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned; the vector inside vdata that carries the per-lane nonces
+     const uint32_t Htarg = ptarget[7];
+
+     const uint32_t last_nonce = max_nonce -8;   // leave room for a full batch; NOTE(review): wraps around if max_nonce < 8
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     do                // each pass hashes nonces n..n+7, one per lane
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+         _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                           n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );   // nonces occupy the high 32 bits of each 64-bit lane
+
+         x11_8way_hash( hash, vdata );
+         pdata[19] = n;
+
+         for ( int i = 0; i < 8; i++ )   // lane i's result is the 8 words at hash + 8*i
+         if ( ( hash+(i<<3) )[7] < Htarg
+              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )   // cheap top-word test first, then the full comparison
+         {
+             pdata[19] = n+i;
+             submit_lane_solution( work, hash+(i<<3), mythr, i );
+         }
+         n += 8;
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
+     return 0;         // always 0; solutions are reported through submit_lane_solution()
+}
+
+
+#elif defined (X11_4WAY)
+
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
--- a/algo/x11/x11-gate.c
+++ b/algo/x11/x11-gate.c
@@ -1,8 +1,12 @@
 #include "x11-gate.h"

-bool register_x11_algo( algo_gate_t* gate )
+bool register_x11_algo( algo_gate_t *gate )
 {
-#if defined (X11_4WAY)
+#if defined (X11_8WAY)
+  init_x11_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_x11_8way;
+  gate->hash      = (void*)&x11_8way_hash;
+#elif defined (X11_4WAY)
  init_x11_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x11_4way;
  gate->hash      = (void*)&x11_4way_hash;
@@ -11,7 +15,7 @@ bool register_x11_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11;
  gate->hash      = (void*)&x11_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x11/x11-gate.h
+++ b/algo/x11/x11-gate.h
@@ -4,29 +4,35 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X11_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X11_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X11_4WAY 1
 #endif

 bool register_x11_algo( algo_gate_t* gate );
+#if defined(X11_8WAY)

-#if defined(X11_4WAY)
+void x11_8way_hash( void *state, const void *input );
+int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_x11_8way_ctx();
+
+#elif defined(X11_4WAY)

 void x11_4way_hash( void *state, const void *input );
-
 int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11_4way_ctx();

-#endif
+#else

 void x11_hash( void *state, const void *input );
-
 int scanhash_x11( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11_ctx();

 #endif

+#endif
+
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -1,11 +1,7 @@
 #include "cpuminer-config.h"
 #include "x11gost-gate.h"
-
-#if defined (X11GOST_4WAY)
-
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -14,18 +10,269 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined (X11GOST_8WAY)
+
+typedef struct {                        // every context used by x11gost_8way_hash, grouped so one memcpy clones them all
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    hashState_groestl       groestl;    // used one lane at a time
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;    
+    keccak512_8way_context  keccak;    
+    sph_gost512_context     gost;       // used one lane at a time
+    luffa_4way_context      luffa;      // 4-way: run twice per hash (lanes 0-3, then 4-7)
+    cube_4way_context       cube;       // 4-way, as luffa
+    sph_shavite512_context  shavite;    // used one lane at a time
+    simd_4way_context       simd;       // 4-way, as luffa
+    hashState_echo          echo;       // used one lane at a time
+} x11gost_8way_ctx_holder;
+
+x11gost_8way_ctx_holder x11gost_8way_ctx;   // template filled by init_x11gost_8way_ctx(); copied by x11gost_8way_hash()
+
+void init_x11gost_8way_ctx()            // fill the global x11gost_8way_ctx template with freshly initialised contexts
+{
+     blake512_8way_init( &x11gost_8way_ctx.blake );
+     bmw512_8way_init( &x11gost_8way_ctx.bmw );
+     init_groestl( &x11gost_8way_ctx.groestl, 64 );   // 64: presumably the digest length in bytes - TODO confirm
+     skein512_8way_init( &x11gost_8way_ctx.skein );
+     jh512_8way_init( &x11gost_8way_ctx.jh );
+     keccak512_8way_init( &x11gost_8way_ctx.keccak );
+     sph_gost512_init( &x11gost_8way_ctx.gost );
+     luffa_4way_init( &x11gost_8way_ctx.luffa, 512 );
+     cube_4way_init( &x11gost_8way_ctx.cube, 512, 16, 32 );   // same arguments are reused when the hash re-inits cube
+     sph_shavite512_init( &x11gost_8way_ctx.shavite );
+     simd_4way_init( &x11gost_8way_ctx.simd, 512 );
+     init_echo( &x11gost_8way_ctx.echo, 512 );
+}
+
+void x11gost_8way_hash( void *state, const void *input )
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+
+     x11gost_8way_ctx_holder ctx;
+     memcpy( &ctx, &x11gost_8way_ctx, sizeof(x11gost_8way_ctx) );
+
+     blake512_8way_update( &ctx.blake, input, 80 );
+     blake512_8way_close( &ctx.blake, vhash );
+
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serial
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl, 
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl, 
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // 4way
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                  hash7 );
+
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     // Serial
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     sph_gost512( &ctx.gost, hash0, 64 );
+     sph_gost512_close( &ctx.gost, hash0 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash1, 64 );
+     sph_gost512_close( &ctx.gost, hash1 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash2, 64 );
+     sph_gost512_close( &ctx.gost, hash2 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash3, 64 );
+     sph_gost512_close( &ctx.gost, hash3 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash4, 64 );
+     sph_gost512_close( &ctx.gost, hash4 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash5, 64 );
+     sph_gost512_close( &ctx.gost, hash5 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash6, 64 );
+     sph_gost512_close( &ctx.gost, hash6 );
+     memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash7, 64 );
+     sph_gost512_close( &ctx.gost, hash7 );
+
+
+     // Luffa + Cube
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     luffa_4way_init( &ctx.luffa, 512 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     memcpy( state,     hash0, 32 );
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+// Scan nonces 8 per pass from pdata[19]; submit lanes that pass fulltest. Always returns 0.
+int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));   // 8 words per lane
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     int thr_id = mythr->id;
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+     const uint32_t Htarg = ptarget[7];
+
+     max_nonce -= 8;   // each pass consumes nonces n..n+7
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     do
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+         _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                           n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+         x11gost_8way_hash( hash, vdata );
+         pdata[19] = n;
+         // Pre-filter on word 7 uses <= so a hash tying the target there still reaches fulltest.
+         for ( int i = 0; i < 8; i++ )
+         if ( ( hash+(i<<3) )[7] <= Htarg
+              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+         {
+             pdata[19] = n+i;   // nonce of the winning lane
+             submit_lane_solution( work, hash+(i<<3), mythr, i );
+         }
+         n += 8;
+     } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;   // nonces tried by this call
+     return 0;
+}
+
+#elif defined (X11GOST_4WAY)
+
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
-    jh512_4way_context      jh;    
-    keccak512_4way_context  keccak;    
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
    sph_gost512_context     gost;
    luffa_2way_context      luffa;
    cubehashParam           cube;
@@ -76,10 +323,10 @@ void x11gost_4way_hash( void *state, const void *input )
     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, 
+     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, 
+     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

@@ -175,7 +422,7 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
+     int thr_id = mythr->id;
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
@@ -185,7 +432,7 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,

     mm256_bswap32_intrlv80_4x64( vdata, pdata );

-     for (int m=0; m < 6; m++) 
+     for (int m=0; m < 6; m++)
       if (Htarg <= htmax[m])
       {
         uint32_t mask = masks[m];
--- a/algo/x11/x11gost-gate.c
+++ b/algo/x11/x11gost-gate.c
@@ -2,7 +2,11 @@

 bool register_x11gost_algo( algo_gate_t* gate )
 {
-#if defined (X11GOST_4WAY)
+#if defined (X11GOST_8WAY)
+  init_x11gost_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_x11gost_8way;
+  gate->hash      = (void*)&x11gost_8way_hash;
+#elif defined (X11GOST_4WAY)
  init_x11gost_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x11gost_4way;
  gate->hash      = (void*)&x11gost_4way_hash;
@@ -11,7 +15,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11gost;
  gate->hash      = (void*)&x11gost_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x11/x11gost-gate.h
+++ b/algo/x11/x11gost-gate.h
@@ -4,29 +4,36 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define X11GOST_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X11GOST_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X11GOST_4WAY 1
 #endif

 bool register_x11gost_algo( algo_gate_t* gate );

-#if defined(X11GOST_4WAY)
+#if defined(X11GOST_8WAY)
+
+void x11gost_8way_hash( void *state, const void *input );
+int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+void init_x11gost_8way_ctx();
+
+#elif defined(X11GOST_4WAY)

 void x11gost_4way_hash( void *state, const void *input );
-
 int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11gost_4way_ctx();

-#endif
+#else

 void x11gost_hash( void *state, const void *input );
-
 int scanhash_x11gost( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_x11gost_ctx();

 #endif

+#endif
+
--- a/algo/x13/phi1612-4way.c
+++ b/algo/x13/phi1612-4way.c
@@ -1,7 +1,4 @@
 #include "phi1612-gate.h"
-
-#if defined(PHI1612_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,10 +6,193 @@
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/echo/aes_ni/hash_api.h"

+#if defined(PHI1612_8WAY)
+// One context per hash stage used by phi1612_8way_hash.
+typedef struct {
+    skein512_8way_context   skein;   // 8 lanes per call
+    jh512_8way_context      jh;      // 8 lanes per call
+    cube_4way_context       cube;    // 4 lanes per call, used twice per hash
+    sph_fugue512_context    fugue;   // single lane, reused for each of the 8 lanes
+    sph_gost512_context     gost;    // single lane, reused for each of the 8 lanes
+    hashState_echo          echo;    // single lane, reused for each of the 8 lanes
+} phi1612_8way_ctx_holder;
+// Template filled by init_phi1612_8way_ctx(); phi1612_8way_hash copies it on every call.
+phi1612_8way_ctx_holder phi1612_8way_ctx __attribute__ ((aligned (64)));
+// Initialise every stage of the shared template context phi1612_8way_ctx.
+void init_phi1612_8way_ctx()
+{
+     skein512_8way_init( &phi1612_8way_ctx.skein );
+     jh512_8way_init( &phi1612_8way_ctx.jh );
+     cube_4way_init( &phi1612_8way_ctx.cube, 512, 16, 32 );
+     sph_fugue512_init( &phi1612_8way_ctx.fugue );
+     sph_gost512_init( &phi1612_8way_ctx.gost );
+     init_echo( &phi1612_8way_ctx.echo, 512 );
+}
+// Phi1612 over 8 lanes: Skein, JH, Cubehash, Fugue, Gost, Echo; writes 32 bytes per lane to state.
+void phi1612_8way_hash( void *state, const void *input )
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     phi1612_8way_ctx_holder ctx;   // working copy of the template context
+     memcpy( &ctx, &phi1612_8way_ctx, sizeof(phi1612_8way_ctx) );
+
+     // Skein, 8 lanes in parallel
+     skein512_8way_update( &ctx.skein, input, 80 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     // JH, 8 lanes in parallel, then split vhash into the per-lane buffers
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );
+
+     // Cubehash: cube_4way takes 4 lanes per call, so lanes 0-3 then lanes 4-7
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );   // fresh state for lanes 4-7
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+
+     // Fugue: one lane at a time, context re-initialised between lanes
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+
+     // Gost: one lane at a time, context re-initialised between lanes
+     sph_gost512( &ctx.gost, hash0, 64 );
+     sph_gost512_close( &ctx.gost, hash0 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash1, 64 );
+     sph_gost512_close( &ctx.gost, hash1 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash2, 64 );
+     sph_gost512_close( &ctx.gost, hash2 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash3, 64 );
+     sph_gost512_close( &ctx.gost, hash3 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash4, 64 );
+     sph_gost512_close( &ctx.gost, hash4 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash5, 64 );
+     sph_gost512_close( &ctx.gost, hash5 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash6, 64 );
+     sph_gost512_close( &ctx.gost, hash6 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash7, 64 );
+     sph_gost512_close( &ctx.gost, hash7 );
+
+     // Echo: one lane at a time, context re-initialised between lanes
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     memcpy( state,     hash0, 32 );   // only the first 32 bytes of each lane are kept
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+// Scan nonces 8 per pass from pdata[19]; submit lanes that pass fulltest. Always returns 0.
+int scanhash_phi1612_8way( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr )
+{
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));   // 8 words per lane
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     const uint32_t first_nonce = pdata[19];
+     uint32_t n = first_nonce;
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+     int thr_id = mythr->id;
+
+     if ( opt_benchmark )
+          ptarget[7] = 0x0cff;
+     const uint32_t Htarg = ptarget[7];   // read after the benchmark override so both agree
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     do {
+           *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                 n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+        phi1612_8way_hash( hash, vdata );
+        pdata[19] = n;
+
+        for ( int i = 0; i < 8; i++ )
+        if ( (hash+(i<<3))[7] <= Htarg )   // cheap word-7 filter before fulltest
+        if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n+i;   // nonce of the winning lane
+           submit_lane_solution( work, hash+(i<<3), mythr, i );
+        }
+        n += 8;
+     } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;   // nonces tried by this call
+     return 0;
+}
+
+#elif defined(PHI1612_4WAY)
+
+
 typedef struct {
    skein512_4way_context   skein;
    jh512_4way_context      jh;
--- a/algo/x13/phi1612-gate.c
+++ b/algo/x13/phi1612-gate.c
@@ -2,7 +2,11 @@

 bool register_phi1612_algo( algo_gate_t* gate )
 {
-#if defined(PHI1612_4WAY)
+#if defined(PHI1612_8WAY)
+  init_phi1612_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_phi1612_8way;
+  gate->hash      = (void*)&phi1612_8way_hash;
+#elif defined(PHI1612_4WAY)
  init_phi1612_4way_ctx();
  gate->scanhash  = (void*)&scanhash_phi1612_4way;
  gate->hash      = (void*)&phi1612_4way_hash;
@@ -11,7 +15,7 @@ bool register_phi1612_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_phi1612;
  gate->hash      = (void*)&phi1612_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };

--- a/algo/x13/phi1612-gate.h
+++ b/algo/x13/phi1612-gate.h
@@ -4,29 +4,35 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
-  #define PHI1612_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define PHI1612_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define PHI1612_4WAY 1
 #endif

 bool register_phi1612_algo( algo_gate_t* gate );

-#if defined(PHI1612_4WAY)
+#if defined(PHI1612_8WAY)
+
+void phi1612_8way_hash( void *state, const void *input );
+int scanhash_phi1612_8way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );
+void init_phi1612_8way_ctx();
+
+#elif defined(PHI1612_4WAY)

 void phi1612_4way_hash( void *state, const void *input );
-
 int scanhash_phi1612_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_phi1612_4way_ctx();

-#endif
+#else

 void phi1612_hash( void *state, const void *input );
-
 int scanhash_phi1612( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
-
 void init_phi1612_ctx();

 #endif
+#endif

--- a/algo/x13/skunk-4way.c
+++ b/algo/x13/skunk-4way.c
@@ -1,7 +1,4 @@
 #include "skunk-gate.h"
-
-#if defined(SKUNK_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -10,6 +7,146 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/cubehash/cube-hash-2way.h"
+
+#if defined(SKUNK_8WAY)
+// One context per hash stage used by skunk_8way_hash.
+typedef struct {
+    skein512_8way_context skein;   // 8 lanes per call
+    cube_4way_context     cube;    // 4 lanes per call, used twice per hash
+    sph_fugue512_context  fugue;   // single lane, reused for each of the 8 lanes
+    sph_gost512_context   gost;    // single lane, reused for each of the 8 lanes
+} skunk_8way_ctx_holder;
+// Per-thread template set up by skunk_8way_thread_init(); copied on every hash call.
+static __thread skunk_8way_ctx_holder skunk_8way_ctx;
+// Skunk over 8 lanes: Skein, Cubehash, Fugue, Gost; Gost writes each lane at output + 32*lane.
+void skunk_8way_hash( void *output, const void *input )
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     skunk_8way_ctx_holder ctx __attribute__ ((aligned (64)));   // working copy of the template
+     memcpy( &ctx, &skunk_8way_ctx, sizeof(skunk_8way_ctx) );
+
+     skein512_8way_update( &ctx.skein, input, 80 );
+     skein512_8way_close( &ctx.skein, vhash );
+     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                        hash7, vhash, 512 );
+     // Cubehash: cube_4way takes 4 lanes per call, so lanes 0-3 then lanes 4-7
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
+     cube_4way_init( &ctx.cube, 512, 16, 32 );   // fresh state for lanes 4-7
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+     // Fugue: one lane at a time, context re-initialised between every pair of lanes
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     sph_fugue512_init( &ctx.fugue );   // was missing between lanes 3 and 4
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+     // Gost: one lane at a time, closing straight into the caller's output buffer
+     sph_gost512( &ctx.gost, hash0, 64 );
+     sph_gost512_close( &ctx.gost, output );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash1, 64 );
+     sph_gost512_close( &ctx.gost, output+ 32 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash2, 64 );
+     sph_gost512_close( &ctx.gost, output+ 64 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash3, 64 );
+     sph_gost512_close( &ctx.gost, output+ 96 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash4, 64 );
+     sph_gost512_close( &ctx.gost, output+128 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash5, 64 );
+     sph_gost512_close( &ctx.gost, output+160 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash6, 64 );
+     sph_gost512_close( &ctx.gost, output+192 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash7, 64 );
+     sph_gost512_close( &ctx.gost, output+224 );
+}
+// Scan nonces 8 per pass from pdata[19]; submit lanes that pass fulltest. Always returns 0.
+int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));   // 8 words per lane
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   int thr_id = mythr->id;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+   const uint32_t Htarg = ptarget[7];   // read after the benchmark override so both agree
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+
+      skunk_8way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )   // cheap word-7 filter before fulltest
+      if ( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
+      {
+         pdata[19] = n+i;   // nonce of the winning lane
+         submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n +=8;
+   } while ( likely( ( n < max_nonce-8 ) && !(*restart) ) );
+
+   *hashes_done = n - first_nonce;   // nonces tried by this call
+   return 0;
+}
+// Initialise the calling thread's skunk_8way_ctx template (it is __thread); always returns true.
+bool skunk_8way_thread_init()
+{
+   skein512_8way_init( &skunk_8way_ctx.skein );
+   cube_4way_init( &skunk_8way_ctx.cube, 512, 16, 32 );
+   sph_fugue512_init( &skunk_8way_ctx.fugue );
+   sph_gost512_init( &skunk_8way_ctx.gost );
+   return true;
+}
+
+#elif defined(SKUNK_4WAY)

 typedef struct {
    skein512_4way_context skein;
--- a/algo/x13/skunk-gate.c
+++ b/algo/x13/skunk-gate.c
@@ -2,12 +2,15 @@

 bool register_skunk_algo( algo_gate_t* gate )
 {
-   gate->optimizations = SSE2_OPT | AVX2_OPT;
-#if defined (SKUNK_4WAY)
+   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+#if defined (SKUNK_8WAY)
+   gate->miner_thread_init = (void*)&skunk_8way_thread_init;
+   gate->scanhash = (void*)&scanhash_skunk_8way;
+   gate->hash     = (void*)&skunk_8way_hash;
+#elif defined (SKUNK_4WAY)
   gate->miner_thread_init = (void*)&skunk_4way_thread_init;
   gate->scanhash = (void*)&scanhash_skunk_4way;
   gate->hash     = (void*)&skunk_4way_hash;
-//   init_skunk_4way_ctx();
 #else
   gate->miner_thread_init = (void*)&skunk_thread_init;
   gate->scanhash = (void*)&scanhash_skunk;
--- a/algo/x13/skunk-gate.h
+++ b/algo/x13/skunk-gate.h
@@ -4,29 +4,33 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
-  #define SKUNK_4WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define SKUNK_8WAY 1
+#elif defined(__AVX2__)
+  #define SKUNK_4WAY 1
 #endif

 bool register_skunk_algo( algo_gate_t* gate );

-#if defined(SKUNK_4WAY)
+#if defined(SKUNK_8WAY)
+
+void skunk_8way_hash( void *state, const void *input );
+int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+bool skunk_8way_thread_init();
+
+#elif defined(SKUNK_4WAY)

 void skunk_4way_hash( void *state, const void *input );
-
 int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
-
 bool skunk_4way_thread_init();
-//void init_skunk_4way_ctx();

 #endif

 void skunkhash( void *state, const void *input );
-
 int scanhash_skunk( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
-
 bool skunk_thread_init();

 #endif
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -127,6 +127,7 @@ void x17_4way_hash( void *state, const void *input )
     dintrlv_2x128_512( hash0, hash1, vhashA );
     dintrlv_2x128_512( hash2, hash3, vhashB );

+
     // 11 Echo serial
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,