This commit is contained in:
Jay D Dee
2019-12-05 19:09:23 -05:00
parent 40039386a0
commit 73430b13b1
52 changed files with 4515 additions and 874 deletions

View File

@@ -21,7 +21,7 @@
#include "argon2.h"
#include "core.h"
#include "simd-utils.h"
#include "../blake2/blake2.h"
#include "../blake2/blamka-round-opt.h"
@@ -37,24 +37,28 @@
#if defined(__AVX512F__)
static void fill_block(__m512i *state, const block *ref_block,
block *next_block, int with_xor) {
static void fill_block( __m512i *state, const block *ref_block,
block *next_block, int with_xor )
{
__m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
unsigned int i;
if (with_xor) {
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
state[i] = _mm512_xor_si512(
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
block_XY[i] = _mm512_xor_si512(
state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
}
} else {
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm512_xor_si512(
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
if ( with_xor )
{
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
{
state[i] = _mm512_xor_si512( state[i],
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
block_XY[i] = _mm512_xor_si512( state[i],
_mm512_load_si512( (const __m512i*)next_block->v + i ) );
}
}
else
{
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
block_XY[i] = state[i] = _mm512_xor_si512( state[i],
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
}
BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
state[ 4], state[ 5], state[ 6], state[ 7] );
@@ -66,23 +70,10 @@ static void fill_block(__m512i *state, const block *ref_block,
BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
state[ 9], state[11], state[13], state[15] );
/*
for (i = 0; i < 2; ++i) {
BLAKE2_ROUND_1(
state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
}
for (i = 0; i < 2; ++i) {
BLAKE2_ROUND_2(
state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
}
*/
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
state[i] = _mm512_xor_si512(state[i], block_XY[i]);
_mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
{
state[i] = _mm512_xor_si512( state[i], block_XY[i] );
_mm512_store_si512( (__m512i*)next_block->v + i, state[i] );
}
}
@@ -125,18 +116,6 @@ static void fill_block(__m256i *state, const block *ref_block,
BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
state[19], state[23], state[27], state[31] );
/*
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
}
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
}
*/
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
_mm256_store_si256((__m256i *)next_block->v + i, state[i]);
@@ -153,14 +132,14 @@ static void fill_block(__m128i *state, const block *ref_block,
if (with_xor) {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
block_XY[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
state[i], _mm_load_si128((const __m128i *)next_block->v + i));
}
} else {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm_xor_si128(
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
}
}
@@ -198,22 +177,9 @@ static void fill_block(__m128i *state, const block *ref_block,
BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],
state[39], state[47], state[55], state[63] );
/*
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
state[8 * i + 6], state[8 * i + 7]);
}
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
state[8 * 6 + i], state[8 * 7 + i]);
}
*/
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(state[i], block_XY[i]);
_mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
_mm_store_si128((__m128i *)next_block->v + i, state[i]);
}
}
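Editorial note on the loadu -> load changes throughout this file: the aligned variants (_mm512_load_si512, _mm_load_si128) require 64- and 16-byte aligned pointers respectively, so the hunks above assume Argon2 block buffers are allocated with suitable alignment. A minimal debug-build guard sketch (the helper name is hypothetical, not part of this commit):

#include <assert.h>
#include <stdint.h>

/* Hypothetical guard: aligned SIMD loads fault on unaligned pointers,
   so assert the alignment the aligned-load calls above now rely on,
   e.g. assert_aligned( ref_block->v, 64 ) before _mm512_load_si512. */
static inline void assert_aligned( const void *p, uintptr_t align )
{
   assert( ( (uintptr_t)p & ( align - 1 ) ) == 0 );
}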

View File

@@ -427,14 +427,14 @@ static __m512i muladd(__m512i x, __m512i y)
#define SWAP_QUARTERS(A0, A1) \
do { \
SWAP_HALVES(A0, A1); \
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
} while((void)0, 0)
#define UNSWAP_QUARTERS(A0, A1) \
do { \
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
SWAP_HALVES(A0, A1); \
} while((void)0, 0)
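Editorial note: _mm512_shuffle_i64x2( A, A, 0xd8 ) reorders the four 128-bit lanes as 0,2,1,3, which is the same permutation as _mm512_permutexvar_epi64 with index (0,1,4,5,2,3,6,7), but without materializing an index vector. A small self-check sketch (assumes an AVX512F build; not part of this commit):

#include <immintrin.h>
#include <stdio.h>

int main( void )
{
   __m512i a = _mm512_setr_epi64( 0, 1, 2, 3, 4, 5, 6, 7 );
   __m512i p = _mm512_permutexvar_epi64(
                  _mm512_setr_epi64( 0, 1, 4, 5, 2, 3, 6, 7 ), a );
   __m512i s = _mm512_shuffle_i64x2( a, a, 0xd8 );
   /* both give 0,1,4,5,2,3,6,7; prints 1 */
   printf( "%d\n", _mm512_cmpeq_epi64_mask( p, s ) == 0xff );
   return 0;
}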

View File

@@ -118,20 +118,42 @@ void blake256r8_8way_close(void *cc, void *dst);
// Blake-512 4 way
typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
__m256i buf[16];
__m256i H[8];
__m256i S[4];
size_t ptr;
sph_u64 T0, T1;
} blake_4way_big_context;
} blake_4way_big_context __attribute__ ((aligned (128)));
typedef blake_4way_big_context blake512_4way_context;
void blake512_4way_init(void *cc);
void blake512_4way(void *cc, const void *data, size_t len);
void blake512_4way_close(void *cc, void *dst);
void blake512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void blake512_4way_init( void *cc );
void blake512_4way_update( void *cc, const void *data, size_t len );
#define blake512_4way blake512_4way_update
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
__m512i buf[16];
__m512i H[8];
__m512i S[4];
size_t ptr;
sph_u64 T0, T1;
} blake_8way_big_context __attribute__ ((aligned (128)));
typedef blake_8way_big_context blake512_8way_context;
void blake512_8way_init( void *cc );
void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#endif // AVX512
#endif // AVX2
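A hedged usage sketch of the 8-way interface declared above: vdata is assumed to already hold eight inputs interleaved 64 bits at a time (as the miner prepares data for the *_8way hash functions), with len counted in bytes per lane:

blake512_8way_context ctx;
blake512_8way_init( &ctx );
blake512_8way_update( &ctx, vdata, 80 );  // 80 bytes per lane
blake512_8way_close( &ctx, hash );        // 8 interleaved 64-byte digests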

View File

@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
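Editorial gloss on the 25 -> 49 change above: with N-way 64-bit interleaving, 64-bit hash word w of lane l sits at 64-bit index w*N + l, so the high 32-bit half of word 3 of lane 0 is at 32-bit index 2*3*N + 1: 25 for the old 4-way layout, 49 (3*16+1) for 8-way. A hypothetical helper spelling out the formula:

#include <stdint.h>

/* Hypothetical: top 32 bits of 64-bit hash word 3, lane 0, in an
   n_lanes-way interleaved buffer. 2*3*4+1 = 25, 2*3*8+1 = 49. */
static inline uint32_t *hash7_ptr( uint32_t *hash, int n_lanes )
{
   return &hash[ 2 * 3 * n_lanes + 1 ];
}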

View File

@@ -54,7 +54,6 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
// Blake-512
static const sph_u64 IV512[8] = {
@@ -64,6 +63,7 @@ static const sph_u64 IV512[8] = {
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
@@ -264,8 +264,6 @@ static const unsigned sigma[16][16] = {
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
// Blake-512 4 way
#define CBx(r, i) CBx_(Z ## r ## i)
#define CBx_(n) CBx__(n)
#define CBx__(n) CB ## n
@@ -287,6 +285,7 @@ static const unsigned sigma[16][16] = {
#define CBE SPH_C64(0x0801F2E2858EFC16)
#define CBF SPH_C64(0x636920D871574E69)
/*
#if SPH_COMPACT_BLAKE_64
// not used
static const sph_u64 CB[16] = {
@@ -301,7 +300,301 @@ static const sph_u64 CB[16] = {
};
#endif
*/
#define READ_STATE64(state) do { \
H0 = (state)->H[0]; \
H1 = (state)->H[1]; \
H2 = (state)->H[2]; \
H3 = (state)->H[3]; \
H4 = (state)->H[4]; \
H5 = (state)->H[5]; \
H6 = (state)->H[6]; \
H7 = (state)->H[7]; \
S0 = (state)->S[0]; \
S1 = (state)->S[1]; \
S2 = (state)->S[2]; \
S3 = (state)->S[3]; \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE64(state) do { \
(state)->H[0] = H0; \
(state)->H[1] = H1; \
(state)->H[2] = H2; \
(state)->H[3] = H3; \
(state)->H[4] = H4; \
(state)->H[5] = H5; \
(state)->H[6] = H6; \
(state)->H[7] = H7; \
(state)->S[0] = S0; \
(state)->S[1] = S1; \
(state)->S[2] = S2; \
(state)->S[3] = S3; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Blake-512 8 way
#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
_mm512_set1_epi64( c1 ), m0 ), b ), a ); \
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
_mm512_set1_epi64( c0 ), m1 ), b ), a ); \
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
} while (0)
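For reference (editorial, not part of this commit): GB_8WAY is the standard BLAKE-512 G function vectorized eight ways. A scalar sketch of one G evaluation, with m0/m1 the two message words and c0/c1 their paired round constants:

#include <stdint.h>

static inline uint64_t ror64( uint64_t x, int r )
{ return ( x >> r ) | ( x << ( 64 - r ) ); }

/* One BLAKE-512 G on scalar words, mirroring GB_8WAY above. */
static inline void g512( uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
                         uint64_t m0, uint64_t m1, uint64_t c0, uint64_t c1 )
{
   *a += *b + ( m0 ^ c1 );  *d = ror64( *d ^ *a, 32 );
   *c += *d;                *b = ror64( *b ^ *c, 25 );
   *a += *b + ( m1 ^ c0 );  *d = ror64( *d ^ *a, 16 );
   *c += *d;                *b = ror64( *b ^ *c, 11 );
}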
#define ROUND_B_8WAY(r) do { \
GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
GB_8WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
#define DECL_STATE64_8WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
__m512i S0, S1, S2, S3; \
sph_u64 T0, T1;
#define COMPRESS64_8WAY do \
{ \
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
__m512i shuf_bswap64; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \
V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \
VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \
VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
m512_const1_64( CB4 ) ); \
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
m512_const1_64( CB5 ) ); \
VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
m512_const1_64( CB6 ) ); \
VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
m512_const1_64( CB7 ) ); \
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
ROUND_B_8WAY(0); \
ROUND_B_8WAY(1); \
ROUND_B_8WAY(2); \
ROUND_B_8WAY(3); \
ROUND_B_8WAY(4); \
ROUND_B_8WAY(5); \
ROUND_B_8WAY(6); \
ROUND_B_8WAY(7); \
ROUND_B_8WAY(8); \
ROUND_B_8WAY(9); \
ROUND_B_8WAY(0); \
ROUND_B_8WAY(1); \
ROUND_B_8WAY(2); \
ROUND_B_8WAY(3); \
ROUND_B_8WAY(4); \
ROUND_B_8WAY(5); \
H0 = mm512_xor4( V8, V0, S0, H0 ); \
H1 = mm512_xor4( V9, V1, S1, H1 ); \
H2 = mm512_xor4( VA, V2, S2, H2 ); \
H3 = mm512_xor4( VB, V3, S3, H3 ); \
H4 = mm512_xor4( VC, V4, S0, H4 ); \
H5 = mm512_xor4( VD, V5, S1, H5 ); \
H6 = mm512_xor4( VE, V6, S2, H6 ); \
H7 = mm512_xor4( VF, V7, S3, H7 ); \
} while (0)
static void
blake64_8way_init( blake_8way_big_context *sc, const sph_u64 *iv,
const sph_u64 *salt )
{
__m512i zero = m512_zero;
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->S, 0 ) = zero;
casti_m512i( sc->S, 1 ) = zero;
casti_m512i( sc->S, 2 ) = zero;
casti_m512i( sc->S, 3 ) = zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
static void
blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
size_t ptr;
DECL_STATE64_8WAY
const int buf_size = 128;   // sizeof(sc->buf) / 8, bytes per lane
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE64(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 )
T1 = SPH_T64(T1 + 1);
COMPRESS64_8WAY;
ptr = 0;
}
}
WRITE_STATE64(sc);
sc->ptr = ptr;
}
static void
blake64_8way_close( blake_8way_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
{
__m512i buf[16];
size_t ptr;
unsigned bit_len;
uint64_t z, zz;
sph_u64 th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>3] = _mm512_set1_epi64( zz );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
}
else
{
sc->T0 -= 1024 - bit_len;
}
if ( ptr <= 104 )
{
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
buf[(104>>3)] = _mm512_or_si512( buf[(104>>3)],
m512_const1_64( 0x0100000000000000ULL ) );
*(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
*(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_512( buf, 112>>3 );
if ( out_size_w64 == 8 )
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
*(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
*(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
blake64_8way( sc, buf, 128 );
}
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
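Editorial gloss on the counter fixups in blake64_8way_close above: blake64_8way() adds 1024 to T0 for every block it compresses, so the special seed values back the counter out and the final block compresses with the bit count BLAKE-512 specifies.

/* Worked example (editorial): closing when ptr == 0, i.e. the final
   block is padding only and must compress with a counter of 0.
   T0 = 0xFFFFFFFFFFFFFC00;            // -1024 mod 2^64
   ... blake64_8way() adds 1024 for the padding block ...
   0xFFFFFFFFFFFFFC00 + 1024 == 0      // wraps: no message bits counted */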
void
blake512_8way_init(void *cc)
{
blake64_8way_init(cc, IV512, salt_zero_big);
}
void
blake512_8way_update(void *cc, const void *data, size_t len)
{
blake64_8way(cc, data, len);
}
void
blake512_8way_close(void *cc, void *dst)
{
blake512_8way_addbits_and_close(cc, 0, 0, dst);
}
void
blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
blake64_8way_close(cc, ub, n, dst, 8);
}
#endif // AVX512
// Blake-512 4 way
@@ -318,29 +611,6 @@ static const sph_u64 CB[16] = {
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
} while (0)
#if SPH_COMPACT_BLAKE_64
// not used
#define ROUND_B_4WAY(r) do { \
GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
} while (0)
#else
//current_impl
#define ROUND_B_4WAY(r) do { \
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
@@ -352,120 +622,11 @@ static const sph_u64 CB[16] = {
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
#endif
// Blake-512 4 way
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
sph_u64 T0, T1;
#define READ_STATE64_4WAY(state) do { \
H0 = (state)->H[0]; \
H1 = (state)->H[1]; \
H2 = (state)->H[2]; \
H3 = (state)->H[3]; \
H4 = (state)->H[4]; \
H5 = (state)->H[5]; \
H6 = (state)->H[6]; \
H7 = (state)->H[7]; \
S0 = (state)->S[0]; \
S1 = (state)->S[1]; \
S2 = (state)->S[2]; \
S3 = (state)->S[3]; \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE64_4WAY(state) do { \
(state)->H[0] = H0; \
(state)->H[1] = H1; \
(state)->H[2] = H2; \
(state)->H[3] = H3; \
(state)->H[4] = H4; \
(state)->H[5] = H5; \
(state)->H[6] = H6; \
(state)->H[7] = H7; \
(state)->S[0] = S0; \
(state)->S[1] = S1; \
(state)->S[2] = S2; \
(state)->S[3] = S3; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
#if SPH_COMPACT_BLAKE_64
// not used
#define COMPRESS64_4WAY do { \
__m256i M[16]; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ); \
unsigned r; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set1_epi64x( CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set1_epi64x( CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set1_epi64x( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set1_epi64x( CB7 ) ); \
M[0x0] = _mm256_shuffle_epi8( *(buf+ 0), shuff_bswap64 ); \
M[0x1] = _mm256_shuffle_epi8( *(buf+ 1), shuff_bswap64 ); \
M[0x2] = _mm256_shuffle_epi8( *(buf+ 2), shuff_bswap64 ); \
M[0x3] = _mm256_shuffle_epi8( *(buf+ 3), shuff_bswap64 ); \
M[0x4] = _mm256_shuffle_epi8( *(buf+ 4), shuff_bswap64 ); \
M[0x5] = _mm256_shuffle_epi8( *(buf+ 5), shuff_bswap64 ); \
M[0x6] = _mm256_shuffle_epi8( *(buf+ 6), shuff_bswap64 ); \
M[0x7] = _mm256_shuffle_epi8( *(buf+ 7), shuff_bswap64 ); \
M[0x8] = _mm256_shuffle_epi8( *(buf+ 8), shuff_bswap64 ); \
M[0x9] = _mm256_shuffle_epi8( *(buf+ 9), shuff_bswap64 ); \
M[0xA] = _mm256_shuffle_epi8( *(buf+10), shuff_bswap64 ); \
M[0xB] = _mm256_shuffle_epi8( *(buf+11), shuff_bswap64 ); \
M[0xC] = _mm256_shuffle_epi8( *(buf+12), shuff_bswap64 ); \
M[0xD] = _mm256_shuffle_epi8( *(buf+13), shuff_bswap64 ); \
M[0xE] = _mm256_shuffle_epi8( *(buf+14), shuff_bswap64 ); \
M[0xF] = _mm256_shuffle_epi8( *(buf+15), shuff_bswap64 ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
#else
//current impl
#define COMPRESS64_4WAY do \
{ \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -493,7 +654,8 @@ static const sph_u64 CB[16] = {
m256_const1_64( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
m256_const1_64( CB7 ) ); \
shuf_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -536,9 +698,7 @@ static const sph_u64 CB[16] = {
H7 = mm256_xor4( VF, V7, S3, H7 ); \
} while (0)
#endif
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
//static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
static void
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
@@ -583,7 +743,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
return;
}
READ_STATE64_4WAY(sc);
READ_STATE64(sc);
while ( len > 0 )
{
size_t clen;
@@ -603,7 +763,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
ptr = 0;
}
}
WRITE_STATE64_4WAY(sc);
WRITE_STATE64(sc);
sc->ptr = ptr;
}
@@ -674,7 +834,7 @@ blake512_4way_init(void *cc)
}
void
blake512_4way(void *cc, const void *data, size_t len)
blake512_4way_update(void *cc, const void *data, size_t len)
{
blake64_4way(cc, data, len);
}

View File

@@ -107,7 +107,8 @@ typedef struct {
typedef bmw_2way_big_context bmw512_2way_context;
void bmw512_2way_init( bmw512_2way_context *ctx );
void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len );
void bmw512_2way_update( bmw512_2way_context *ctx, const void *data,
size_t len );
void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );
#endif // __SSE2__
@@ -128,7 +129,8 @@ typedef bmw_4way_big_context bmw512_4way_context;
void bmw512_4way_init(void *cc);
void bmw512_4way(void *cc, const void *data, size_t len);
void bmw512_4way_update(void *cc, const void *data, size_t len);
#define bmw512_4way bmw512_4way_update
void bmw512_4way_close(void *cc, void *dst);

View File

@@ -561,13 +561,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
#endif // __SSE2__
#if defined(__AVX2__)
// BMW-512 4 way 64
#define sb0(x) \
mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 3), \
mm256_rol_64( (x), 4), mm256_rol_64( (x),37) )
@@ -1047,7 +1044,7 @@ bmw512_4way_init(void *cc)
}
void
bmw512_4way(void *cc, const void *data, size_t len)
bmw512_4way_update(void *cc, const void *data, size_t len)
{
bmw64_4way(cc, data, len);
}
@@ -1137,8 +1134,6 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
add_elt_b8( M, H, (i)-16 ) )
#define W8b0 \
_mm512_add_epi64( \
_mm512_add_epi64( \
@@ -1328,21 +1323,28 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
#define DH1( m, sl, sr, a, b, c ) \
#define DH1L( m, sl, sr, a, b, c ) \
_mm512_add_epi64( \
_mm512_xor_si512( M[m], \
_mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
_mm512_srli_epi64( qt[a], sr ) ) ), \
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
#define DHL( m, rl, sl, h, a, b, c ) \
#define DH1R( m, sl, sr, a, b, c ) \
_mm512_add_epi64( \
_mm512_xor_si512( M[m], \
_mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \
_mm512_slli_epi64( qt[a], sr ) ) ), \
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
#define DH2L( m, rl, sl, h, a, b, c ) \
_mm512_add_epi64( _mm512_add_epi64( \
mm512_rol_64( dH[h], rl ), \
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
_mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
_mm512_xor_si512( qt[b], qt[c] ) ) );
#define DHR( m, rl, sr, h, a, b, c ) \
#define DH2R( m, rl, sr, h, a, b, c ) \
_mm512_add_epi64( _mm512_add_epi64( \
mm512_rol_64( dH[h], rl ), \
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
@@ -1350,26 +1352,27 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
_mm512_xor_si512( qt[b], qt[c] ) ) );
dH[ 0] = DH1( 0, 5, 5, 16, 24, 0 );
dH[ 1] = DH1( 1, 7, 8, 17, 25, 1 );
dH[ 2] = DH1( 2, 5, 5, 18, 26, 2 );
dH[ 3] = DH1( 3, 1, 5, 19, 27, 3 );
dH[ 4] = DH1( 4, 3, 0, 20, 28, 4 );
dH[ 5] = DH1( 5, 6, 6, 21, 29, 5 );
dH[ 6] = DH1( 6, 4, 6, 22, 30, 6 );
dH[ 7] = DH1( 7, 11, 2, 23, 31, 7 );
dH[ 8] = DHL( 8, 9, 8, 4, 24, 23, 8 );
dH[ 9] = DHR( 9, 10, 6, 5, 25, 16, 9 );
dH[10] = DHL( 10, 11, 6, 6, 26, 17, 10 );
dH[11] = DHL( 11, 12, 4, 7, 27, 18, 11 );
dH[12] = DHR( 12, 13, 3, 0, 28, 19, 12 );
dH[13] = DHR( 13, 14, 4, 1, 29, 20, 13 );
dH[14] = DHR( 14, 15, 7, 2, 30, 21, 14 );
dH[15] = DHR( 15, 16, 2, 3, 31, 22, 15 );
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
#undef DH1
#undef DHL
#undef DHR
#undef DH1L
#undef DH1R
#undef DH2L
#undef DH2R
}

View File

@@ -26,6 +26,180 @@ static const uint64_t IV512[] =
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
static void transform_4way( cube_4way_context *sp )
{
int r;
const int rounds = sp->rounds;
__m512i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
x0 = _mm512_load_si512( (__m512i*)sp->h );
x1 = _mm512_load_si512( (__m512i*)sp->h + 1 );
x2 = _mm512_load_si512( (__m512i*)sp->h + 2 );
x3 = _mm512_load_si512( (__m512i*)sp->h + 3 );
x4 = _mm512_load_si512( (__m512i*)sp->h + 4 );
x5 = _mm512_load_si512( (__m512i*)sp->h + 5 );
x6 = _mm512_load_si512( (__m512i*)sp->h + 6 );
x7 = _mm512_load_si512( (__m512i*)sp->h + 7 );
for ( r = 0; r < rounds; ++r )
{
x4 = _mm512_add_epi32( x0, x4 );
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
x7 = _mm512_add_epi32( x3, x7 );
y0 = x0;
y1 = x1;
x0 = mm512_rol_32( x2, 7 );
x1 = mm512_rol_32( x3, 7 );
x2 = mm512_rol_32( y0, 7 );
x3 = mm512_rol_32( y1, 7 );
x0 = _mm512_xor_si512( x0, x4 );
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap64_128( x4 );
x5 = mm512_swap64_128( x5 );
x6 = mm512_swap64_128( x6 );
x7 = mm512_swap64_128( x7 );
x4 = _mm512_add_epi32( x0, x4 );
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
x7 = _mm512_add_epi32( x3, x7 );
y0 = x0;
y1 = x2;
x0 = mm512_rol_32( x1, 11 );
x1 = mm512_rol_32( y0, 11 );
x2 = mm512_rol_32( x3, 11 );
x3 = mm512_rol_32( y1, 11 );
x0 = _mm512_xor_si512( x0, x4 );
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap32_64( x4 );
x5 = mm512_swap32_64( x5 );
x6 = mm512_swap32_64( x6 );
x7 = mm512_swap32_64( x7 );
}
_mm512_store_si512( (__m512i*)sp->h, x0 );
_mm512_store_si512( (__m512i*)sp->h + 1, x1 );
_mm512_store_si512( (__m512i*)sp->h + 2, x2 );
_mm512_store_si512( (__m512i*)sp->h + 3, x3 );
_mm512_store_si512( (__m512i*)sp->h + 4, x4 );
_mm512_store_si512( (__m512i*)sp->h + 5, x5 );
_mm512_store_si512( (__m512i*)sp->h + 6, x6 );
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
}
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
int blockbytes )
{
__m512i *h = (__m512i*)sp->h;
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
: (__m128i*)IV256 );
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
sp->pos = 0;
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
h[ 3] = m512_const1_128( iv[3] );
h[ 4] = m512_const1_128( iv[4] );
h[ 5] = m512_const1_128( iv[5] );
h[ 6] = m512_const1_128( iv[6] );
h[ 7] = m512_const1_128( iv[7] );
return 0;
}
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size )
{
const int len = size >> 4;
const __m512i *in = (__m512i*)data;
int i;
for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_4way( sp );
sp->pos = 0;
}
}
return 0;
}
int cube_4way_close( cube_4way_context *sp, void *output )
{
__m512i *hash = (__m512i*)output;
int i;
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
m512_const2_64( 0, 0x0000000000000080 ) );
transform_4way( sp );
sp->h[7] = _mm512_xor_si512( sp->h[7],
m512_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i )
transform_4way( sp );
memcpy( hash, sp->h, sp->hashlen<<6 );
return 0;
}
int cube_4way_update_close( cube_4way_context *sp, void *output,
const void *data, size_t size )
{
const int len = size >> 4;
const __m512i *in = (__m512i*)data;
__m512i *hash = (__m512i*)output;
int i;
for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_4way( sp );
sp->pos = 0;
}
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
m512_const2_64( 0, 0x0000000000000080 ) );
transform_4way( sp );
sp->h[7] = _mm512_xor_si512( sp->h[7],
m512_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i )
transform_4way( sp );
memcpy( hash, sp->h, sp->hashlen<<6);
return 0;
}
#endif // AVX512
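A hedged usage sketch of the 4-way interface above, with the CubeHash16/32-512 parameters (16 rounds, 32-byte blocks) typically passed by the miner's algos; vdata is assumed to hold four inputs interleaved 128 bits at a time:

cube_4way_context ctx;
cube_4way_init( &ctx, 512, 16, 32 );   // 512-bit hash, 16 rounds, 32B block
cube_4way_update_close( &ctx, hash, vdata, 80 );   // 80 bytes per lane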
static void transform_2way( cube_2way_context *sp )
{
@@ -91,7 +265,6 @@ static void transform_2way( cube_2way_context *sp )
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );
}
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
@@ -132,9 +305,6 @@ int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
const __m256i *in = (__m256i*)data;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
// In current usage, data is either 64 or 80 bytes.
for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );

View File

@@ -0,0 +1,203 @@
#if defined(__AVX2__)
#include <stdbool.h>
#include <unistd.h>
#include <memory.h>
#include "cube-hash-2way.h"
// 2x128
// The result of hashing 10 rounds of initial data which consists of params
// zero padded.
static const uint64_t IV256[] =
{
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
};
static const uint64_t IV512[] =
{
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
static void transform_2way( cube_2way_context *sp )
{
int r;
const int rounds = sp->rounds;
__m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
x0 = _mm256_load_si256( (__m256i*)sp->h );
x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
x2 = _mm256_load_si256( (__m256i*)sp->h + 2 );
x3 = _mm256_load_si256( (__m256i*)sp->h + 3 );
x4 = _mm256_load_si256( (__m256i*)sp->h + 4 );
x5 = _mm256_load_si256( (__m256i*)sp->h + 5 );
x6 = _mm256_load_si256( (__m256i*)sp->h + 6 );
x7 = _mm256_load_si256( (__m256i*)sp->h + 7 );
for ( r = 0; r < rounds; ++r )
{
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x0;
y1 = x1;
x0 = mm256_rol_32( x2, 7 );
x1 = mm256_rol_32( x3, 7 );
x2 = mm256_rol_32( y0, 7 );
x3 = mm256_rol_32( y1, 7 );
x0 = _mm256_xor_si256( x0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_128( x4 );
x5 = mm256_swap64_128( x5 );
x6 = mm256_swap64_128( x6 );
x7 = mm256_swap64_128( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x0;
y1 = x2;
x0 = mm256_rol_32( x1, 11 );
x1 = mm256_rol_32( y0, 11 );
x2 = mm256_rol_32( x3, 11 );
x3 = mm256_rol_32( y1, 11 );
x0 = _mm256_xor_si256( x0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap32_64( x4 );
x5 = mm256_swap32_64( x5 );
x6 = mm256_swap32_64( x6 );
x7 = mm256_swap32_64( x7 );
}
_mm256_store_si256( (__m256i*)sp->h, x0 );
_mm256_store_si256( (__m256i*)sp->h + 1, x1 );
_mm256_store_si256( (__m256i*)sp->h + 2, x2 );
_mm256_store_si256( (__m256i*)sp->h + 3, x3 );
_mm256_store_si256( (__m256i*)sp->h + 4, x4 );
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );
}
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
int blockbytes )
{
__m256i *h = (__m256i*)sp->h;
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
: (__m128i*)IV256 );
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
sp->pos = 0;
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
return 0;
}
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
{
const int len = size >> 4;
const __m256i *in = (__m256i*)data;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
// In current usage, data is either 64 or 80 bytes.
for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_2way( sp );
sp->pos = 0;
}
}
return 0;
}
int cube_2way_close( cube_2way_context *sp, void *output )
{
__m256i *hash = (__m256i*)output;
int i;
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
m256_const2_64( 0, 0x0000000000000080 ) );
transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7],
m256_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i ) transform_2way( sp );
memcpy( hash, sp->h, sp->hashlen<<5 );
return 0;
}
int cube_2way_update_close( cube_2way_context *sp, void *output,
const void *data, size_t size )
{
const int len = size >> 4;
const __m256i *in = (__m256i*)data;
__m256i *hash = (__m256i*)output;
int i;
for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_2way( sp );
sp->pos = 0;
}
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
m256_const2_64( 0, 0x0000000000000080 ) );
transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7],
m256_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i ) transform_2way( sp );
memcpy( hash, sp->h, sp->hashlen<<5 );
return 0;
}
#endif

View File

@@ -1,11 +1,38 @@
#ifndef CUBE_HASH_2WAY_H__
#define CUBE_HASH_2WAY_H__
#if defined(__AVX2__)
#define CUBE_HASH_2WAY_H__ 1
#include <stdint.h>
#include "simd-utils.h"
#if defined(__AVX2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
struct _cube_4way_context
{
__m512i h[8];
int hashlen;
int rounds;
int blocksize;
int pos;
} __attribute__ ((aligned (128)));
typedef struct _cube_4way_context cube_4way_context;
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
int blockbytes );
// reinitialize context with same parameters, much faster.
int cube_4way_reinit( cube_4way_context *sp );
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
int cube_4way_close( cube_4way_context *sp, void *output );
int cube_4way_update_close( cube_4way_context *sp, void *output,
const void *data, size_t size );
#endif
// 2x128, 2 way parallel SSE2
struct _cube_2way_context
@@ -15,7 +42,7 @@ struct _cube_2way_context
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
} __attribute__ ((aligned (64)));
} __attribute__ ((aligned (128)));
typedef struct _cube_2way_context cube_2way_context;

View File

@@ -0,0 +1,36 @@
#ifndef CUBE_HASH_2WAY_H__
#define CUBE_HASH_2WAY_H__
#if defined(__AVX2__)
#include <stdint.h>
#include "simd-utils.h"
// 2x128, 2 way parallel SSE2
struct _cube_2way_context
{
__m256i h[8];
int hashlen; // __m128i
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
} __attribute__ ((aligned (64)));
typedef struct _cube_2way_context cube_2way_context;
int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
int blockbytes );
// reinitialize context with same parameters, much faster.
int cube_2way_reinit( cube_2way_context *sp );
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
int cube_2way_close( cube_2way_context *sp, void *output );
int cube_2way_update_close( cube_2way_context *sp, void *output,
const void *data, size_t size );
#endif
#endif

View File

@@ -92,6 +92,38 @@ extern "C"{
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define Sb_8W(x0, x1, x2, x3, c) \
do { \
__m512i cc = _mm512_set1_epi64( c ); \
x3 = mm512_not( x3 ); \
x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \
tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \
x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \
x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \
x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \
x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \
x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \
x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \
x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \
x2 = _mm512_xor_si512( x2, tmp ); \
} while (0)
#define Lb_8W(x0, x1, x2, x3, x4, x5, x6, x7) \
do { \
x4 = _mm512_xor_si512( x4, x1 ); \
x5 = _mm512_xor_si512( x5, x2 ); \
x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \
x7 = _mm512_xor_si512( x7, x0 ); \
x0 = _mm512_xor_si512( x0, x5 ); \
x1 = _mm512_xor_si512( x1, x6 ); \
x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \
x3 = _mm512_xor_si512( x3, x4 ); \
} while (0)
#endif
#define Sb(x0, x1, x2, x3, c) \
do { \
__m256i cc = _mm256_set1_epi64x( c ); \
@@ -226,6 +258,48 @@ static const sph_u64 C[] = {
x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
} while (0)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define S_8W(x0, x1, x2, x3, cb, r) do { \
Sb_8W(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
Sb_8W(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
} while (0)
#define L_8W(x0, x1, x2, x3, x4, x5, x6, x7) do { \
Lb_8W(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
Lb_8W(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
} while (0)
#define Wz_8W(x, c, n) \
do { \
__m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \
x ## h = _mm512_or_si512( _mm512_and_si512( \
_mm512_srli_epi64(x ## h, (n)), (c)), t ); \
t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \
x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \
} while (0)
#define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
#define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
#define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
#define W83(x) Wz_8W(x, m512_const1_64( 0x00FF00FF00FF00FF ), 8 )
#define W84(x) Wz_8W(x, m512_const1_64( 0x0000FFFF0000FFFF ), 16 )
#define W85(x) Wz_8W(x, m512_const1_64( 0x00000000FFFFFFFF ), 32 )
#define W86(x) \
do { \
__m512i t = x ## h; \
x ## h = x ## l; \
x ## l = t; \
} while (0)
#define DECL_STATE_8W \
__m512i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
__m512i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
__m512i tmp;
#endif
#define Wz(x, c, n) \
do { \
@@ -236,16 +310,6 @@ do { \
x ## l = _mm256_or_si256( _mm256_and_si256((x ## l >> (n)), (c)), t ); \
} while (0)
/*
#define Wz(x, c, n) do { \
sph_u64 t = (x ## h & (c)) << (n); \
x ## h = ((x ## h >> (n)) & (c)) | t; \
t = (x ## l & (c)) << (n); \
x ## l = ((x ## l >> (n)) & (c)) | t; \
} while (0)
*/
#define W0(x) Wz(x, m256_const1_64( 0x5555555555555555 ), 1 )
#define W1(x) Wz(x, m256_const1_64( 0x3333333333333333 ), 2 )
#define W2(x) Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
@@ -259,25 +323,12 @@ do { \
x ## l = t; \
} while (0)
/*
#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1)
#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2)
#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4)
#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8)
#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
#define W6(x) do { \
sph_u64 t = x ## h; \
x ## h = x ## l; \
x ## l = t; \
} while (0)
*/
#define DECL_STATE \
__m256i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
__m256i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
__m256i tmp;
#define READ_STATE(state) do { \
h0h = (state)->H[ 0]; \
h0l = (state)->H[ 1]; \
@@ -316,6 +367,38 @@ do { \
(state)->H[15] = h7l; \
} while (0)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define INPUT_BUF1_8W \
__m512i m0h = buf[0]; \
__m512i m0l = buf[1]; \
__m512i m1h = buf[2]; \
__m512i m1l = buf[3]; \
__m512i m2h = buf[4]; \
__m512i m2l = buf[5]; \
__m512i m3h = buf[6]; \
__m512i m3l = buf[7]; \
h0h = _mm512_xor_si512( h0h, m0h ); \
h0l = _mm512_xor_si512( h0l, m0l ); \
h1h = _mm512_xor_si512( h1h, m1h ); \
h1l = _mm512_xor_si512( h1l, m1l ); \
h2h = _mm512_xor_si512( h2h, m2h ); \
h2l = _mm512_xor_si512( h2l, m2l ); \
h3h = _mm512_xor_si512( h3h, m3h ); \
h3l = _mm512_xor_si512( h3l, m3l ); \
#define INPUT_BUF2_8W \
h4h = _mm512_xor_si512( h4h, m0h ); \
h4l = _mm512_xor_si512( h4l, m0l ); \
h5h = _mm512_xor_si512( h5h, m1h ); \
h5l = _mm512_xor_si512( h5l, m1l ); \
h6h = _mm512_xor_si512( h6h, m2h ); \
h6l = _mm512_xor_si512( h6l, m2l ); \
h7h = _mm512_xor_si512( h7h, m3h ); \
h7l = _mm512_xor_si512( h7l, m3l ); \
#endif
#define INPUT_BUF1 \
__m256i m0h = buf[0]; \
__m256i m0l = buf[1]; \
@@ -344,6 +427,7 @@ do { \
h7h = _mm256_xor_si256( h7h, m3h ); \
h7l = _mm256_xor_si256( h7l, m3l ); \
static const sph_u64 IV256[] = {
C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
@@ -370,6 +454,22 @@ static const sph_u64 IV512[] = {
#else
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SL_8W(ro) SLu_8W(r + ro, ro)
#define SLu_8W(r, ro) do { \
S_8W(h0, h2, h4, h6, Ceven_, r); \
S_8W(h1, h3, h5, h7, Codd_, r); \
L_8W(h0, h2, h4, h6, h1, h3, h5, h7); \
W8 ## ro(h1); \
W8 ## ro(h3); \
W8 ## ro(h5); \
W8 ## ro(h7); \
} while (0)
#endif
#define SL(ro) SLu(r + ro, ro)
@@ -393,6 +493,23 @@ static const sph_u64 IV512[] = {
* loop.
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define E8_8W do { \
unsigned r; \
for (r = 0; r < 42; r += 7) { \
SL_8W(0); \
SL_8W(1); \
SL_8W(2); \
SL_8W(3); \
SL_8W(4); \
SL_8W(5); \
SL_8W(6); \
} \
} while (0)
#endif
#define E8 do { \
unsigned r; \
for (r = 0; r < 42; r += 7) { \
@@ -419,51 +536,100 @@ static const sph_u64 IV512[] = {
* On a "true 64-bit" architecture, we can unroll at will.
*/
#define E8 do { \
SLu( 0, 0); \
SLu( 1, 1); \
SLu( 2, 2); \
SLu( 3, 3); \
SLu( 4, 4); \
SLu( 5, 5); \
SLu( 6, 6); \
SLu( 7, 0); \
SLu( 8, 1); \
SLu( 9, 2); \
SLu(10, 3); \
SLu(11, 4); \
SLu(12, 5); \
SLu(13, 6); \
SLu(14, 0); \
SLu(15, 1); \
SLu(16, 2); \
SLu(17, 3); \
SLu(18, 4); \
SLu(19, 5); \
SLu(20, 6); \
SLu(21, 0); \
SLu(22, 1); \
SLu(23, 2); \
SLu(24, 3); \
SLu(25, 4); \
SLu(26, 5); \
SLu(27, 6); \
SLu(28, 0); \
SLu(29, 1); \
SLu(30, 2); \
SLu(31, 3); \
SLu(32, 4); \
SLu(33, 5); \
SLu(34, 6); \
SLu(35, 0); \
SLu(36, 1); \
SLu(37, 2); \
SLu(38, 3); \
SLu(39, 4); \
SLu(40, 5); \
SLu(41, 6); \
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define E8_8W do { \
SLu_8W( 0, 0); \
SLu_8W( 1, 1); \
SLu_8W( 2, 2); \
SLu_8W( 3, 3); \
SLu_8W( 4, 4); \
SLu_8W( 5, 5); \
SLu_8W( 6, 6); \
SLu_8W( 7, 0); \
SLu_8W( 8, 1); \
SLu_8W( 9, 2); \
SLu_8W(10, 3); \
SLu_8W(11, 4); \
SLu_8W(12, 5); \
SLu_8W(13, 6); \
SLu_8W(14, 0); \
SLu_8W(15, 1); \
SLu_8W(16, 2); \
SLu_8W(17, 3); \
SLu_8W(18, 4); \
SLu_8W(19, 5); \
SLu_8W(20, 6); \
SLu_8W(21, 0); \
SLu_8W(22, 1); \
SLu_8W(23, 2); \
SLu_8W(24, 3); \
SLu_8W(25, 4); \
SLu_8W(26, 5); \
SLu_8W(27, 6); \
SLu_8W(28, 0); \
SLu_8W(29, 1); \
SLu_8W(30, 2); \
SLu_8W(31, 3); \
SLu_8W(32, 4); \
SLu_8W(33, 5); \
SLu_8W(34, 6); \
SLu_8W(35, 0); \
SLu_8W(36, 1); \
SLu_8W(37, 2); \
SLu_8W(38, 3); \
SLu_8W(39, 4); \
SLu_8W(40, 5); \
SLu_8W(41, 6); \
} while (0)
#endif // AVX512
#define E8 do { \
SLu( 0, 0); \
SLu( 1, 1); \
SLu( 2, 2); \
SLu( 3, 3); \
SLu( 4, 4); \
SLu( 5, 5); \
SLu( 6, 6); \
SLu( 7, 0); \
SLu( 8, 1); \
SLu( 9, 2); \
SLu(10, 3); \
SLu(11, 4); \
SLu(12, 5); \
SLu(13, 6); \
SLu(14, 0); \
SLu(15, 1); \
SLu(16, 2); \
SLu(17, 3); \
SLu(18, 4); \
SLu(19, 5); \
SLu(20, 6); \
SLu(21, 0); \
SLu(22, 1); \
SLu(23, 2); \
SLu(24, 3); \
SLu(25, 4); \
SLu(26, 5); \
SLu(27, 6); \
SLu(28, 0); \
SLu(29, 1); \
SLu(30, 2); \
SLu(31, 3); \
SLu(32, 4); \
SLu(33, 5); \
SLu(34, 6); \
SLu(35, 0); \
SLu(36, 1); \
SLu(37, 2); \
SLu(38, 3); \
SLu(39, 4); \
SLu(40, 5); \
SLu(41, 6); \
} while (0)
#else
@@ -471,6 +637,158 @@ static const sph_u64 IV512[] = {
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
void jh256_8way_init( jh_8way_context *sc )
{
// bswapped IV256
sc->H[ 0] = m512_const1_64( 0xebd3202c41a398eb );
sc->H[ 1] = m512_const1_64( 0xc145b29c7bbecd92 );
sc->H[ 2] = m512_const1_64( 0xfac7d4609151931c );
sc->H[ 3] = m512_const1_64( 0x038a507ed6820026 );
sc->H[ 4] = m512_const1_64( 0x45b92677269e23a4 );
sc->H[ 5] = m512_const1_64( 0x77941ad4481afbe0 );
sc->H[ 6] = m512_const1_64( 0x7a176b0226abb5cd );
sc->H[ 7] = m512_const1_64( 0xa82fff0f4224f056 );
sc->H[ 8] = m512_const1_64( 0x754d2e7f8996a371 );
sc->H[ 9] = m512_const1_64( 0x62e27df70849141d );
sc->H[10] = m512_const1_64( 0x948f2476f7957627 );
sc->H[11] = m512_const1_64( 0x6c29804757b6d587 );
sc->H[12] = m512_const1_64( 0x6c0d8eac2d275e5c );
sc->H[13] = m512_const1_64( 0x0f7a0557c6508451 );
sc->H[14] = m512_const1_64( 0xea12247067d3e47b );
sc->H[15] = m512_const1_64( 0x69d71cd313abe389 );
sc->ptr = 0;
sc->block_count = 0;
}
void jh512_8way_init( jh_8way_context *sc )
{
// bswapped IV512
sc->H[ 0] = m512_const1_64( 0x17aa003e964bd16f );
sc->H[ 1] = m512_const1_64( 0x43d5157a052e6a63 );
sc->H[ 2] = m512_const1_64( 0x0bef970c8d5e228a );
sc->H[ 3] = m512_const1_64( 0x61c3b3f2591234e9 );
sc->H[ 4] = m512_const1_64( 0x1e806f53c1a01d89 );
sc->H[ 5] = m512_const1_64( 0x806d2bea6b05a92a );
sc->H[ 6] = m512_const1_64( 0xa6ba7520dbcc8e58 );
sc->H[ 7] = m512_const1_64( 0xf73bf8ba763a0fa9 );
sc->H[ 8] = m512_const1_64( 0x694ae34105e66901 );
sc->H[ 9] = m512_const1_64( 0x5ae66f2e8e8ab546 );
sc->H[10] = m512_const1_64( 0x243c84c1d0a74710 );
sc->H[11] = m512_const1_64( 0x99c15a2db1716e3b );
sc->H[12] = m512_const1_64( 0x56f8b19decf657cf );
sc->H[13] = m512_const1_64( 0x56b116577c8806a7 );
sc->H[14] = m512_const1_64( 0xfb1785e6dffcc2e3 );
sc->H[15] = m512_const1_64( 0x4bdd8ccc78465a54 );
sc->ptr = 0;
sc->block_count = 0;
}
static void
jh_8way_core( jh_8way_context *sc, const void *data, size_t len )
{
__m512i *buf;
__m512i *vdata = (__m512i*)data;
const int buf_size = 64;   // bytes per lane; buf is 8 * __m512i
size_t ptr;
DECL_STATE_8W
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata += (clen>>3);
len -= clen;
if ( ptr == buf_size )
{
INPUT_BUF1_8W;
E8_8W;
INPUT_BUF2_8W;
sc->block_count ++;
ptr = 0;
}
}
WRITE_STATE(sc);
sc->ptr = ptr;
}
static void
jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
size_t out_size_w32, const void *iv )
{
__m512i buf[16*4];
__m512i *dst512 = (__m512i*)dst;
size_t numz, u;
sph_u64 l0, l1, l0e, l1e;
buf[0] = m512_const1_64( 0x80ULL );
if ( sc->ptr == 0 )
numz = 48;
else
numz = 112 - sc->ptr;
memset_zero_512( buf+1, (numz>>3) - 1 );
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
l1 = SPH_T64(sc->block_count >> 55);
sph_enc64be( &l0e, l0 );
sph_enc64be( &l1e, l1 );
*(buf + (numz>>3) ) = _mm512_set1_epi64( l1e );
*(buf + (numz>>3) + 1) = _mm512_set1_epi64( l0e );
jh_8way_core( sc, buf, numz + 16 );
for ( u=0; u < 8; u++ )
buf[u] = sc->H[u+8];
memcpy_512( dst512, buf, 8 );
}
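Editorial gloss on the length encoding above, mirroring l0/l1: JH appends a 128-bit big-endian bit count, total bits = 512 * block_count + 8 * ptr. A hypothetical scalar helper:

#include <stdint.h>
#include <stddef.h>

/* Hypothetical: split the 128-bit JH bit count into the two 64-bit words
   that l1/l0 carry in jh_8way_close above (big-endian encoded later). */
static inline void jh_bitlen128( uint64_t block_count, size_t ptr,
                                 uint64_t *hi, uint64_t *lo )
{
   *lo = ( block_count << 9 ) + ( (uint64_t)ptr << 3 );  // low 64 bits
   *hi =   block_count >> 55;                            // overflow bits
}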
void
jh256_8way_update(void *cc, const void *data, size_t len)
{
jh_8way_core(cc, data, len);
}
void
jh256_8way_close(void *cc, void *dst)
{
jh_8way_close(cc, 0, 0, dst, 8, IV256);
}
void
jh512_8way_update(void *cc, const void *data, size_t len)
{
jh_8way_core(cc, data, len);
}
void
jh512_8way_close(void *cc, void *dst)
{
jh_8way_close(cc, 0, 0, dst, 16, IV512);
}
#endif
void jh256_4way_init( jh_4way_context *sc )
{
// bswapped IV256
@@ -595,16 +913,8 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
memcpy_256( dst256, buf, 8 );
}
/*
void
jh256_4way_init(void *cc)
{
jhs_4way_init(cc, IV256);
}
*/
void
jh256_4way(void *cc, const void *data, size_t len)
jh256_4way_update(void *cc, const void *data, size_t len)
{
jh_4way_core(cc, data, len);
}
@@ -615,16 +925,8 @@ jh256_4way_close(void *cc, void *dst)
jh_4way_close(cc, 0, 0, dst, 8, IV256);
}
/*
void
jh512_4way_init(void *cc)
{
jhb_4way_init(cc, IV512);
}
*/
void
jh512_4way(void *cc, const void *data, size_t len)
jh512_4way_update(void *cc, const void *data, size_t len)
{
jh_4way_core(cc, data, len);
}
@@ -635,6 +937,7 @@ jh512_4way_close(void *cc, void *dst)
jh_4way_close(cc, 0, 0, dst, 16, IV512);
}
#ifdef __cplusplus
}
#endif

View File

@@ -60,20 +60,41 @@ extern "C"{
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
__m256i buf[8] __attribute__ ((aligned (64)));
__m512i buf[8];
__m512i H[16];
size_t ptr;
uint64_t block_count;
} jh_8way_context __attribute__ ((aligned (128)));
typedef jh_8way_context jh256_8way_context;
typedef jh_8way_context jh512_8way_context;
void jh256_8way_init( jh_8way_context *sc);
void jh256_8way_update(void *cc, const void *data, size_t len);
void jh256_8way_close(void *cc, void *dst);
void jh512_8way_init( jh_8way_context *sc );
void jh512_8way_update(void *cc, const void *data, size_t len);
void jh512_8way_close(void *cc, void *dst);
#endif
typedef struct {
__m256i buf[8];
__m256i H[16];
size_t ptr;
uint64_t block_count;
/*
unsigned char buf[64];
size_t ptr;
union {
sph_u64 wide[16];
} H;
sph_u64 block_count;
*/
} jh_4way_context;
} jh_4way_context __attribute__ ((aligned (128)));
typedef jh_4way_context jh256_4way_context;
@@ -81,13 +102,15 @@ typedef jh_4way_context jh512_4way_context;
void jh256_4way_init( jh_4way_context *sc);
void jh256_4way(void *cc, const void *data, size_t len);
void jh256_4way_update(void *cc, const void *data, size_t len);
#define jh256_4way jh256_4way_update
void jh256_4way_close(void *cc, void *dst);
void jh512_4way_init( jh_4way_context *sc );
void jh512_4way(void *cc, const void *data, size_t len);
void jh512_4way_update(void *cc, const void *data, size_t len);
#define jh512_4way jh512_4way_update
void jh512_4way_close(void *cc, void *dst);
@@ -95,6 +118,6 @@ void jh512_4way_close(void *cc, void *dst);
}
#endif
#endif
#endif // AVX2
#endif

View File

@@ -6,11 +6,591 @@
#include "simd-utils.h"
/* initial values of chaining variables */
static const uint32 IV[40] __attribute((aligned(64))) = {
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
};
/* Round Constants */
static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
0x00000000,0x00000000,0x00000000,0x5090d577,
0x00000000,0x00000000,0x00000000,0xac11d7fa,
0x00000000,0x00000000,0x00000000,0x2d1925ab,
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
0x00000000,0x00000000,0x00000000,0xb46496ac,
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
0x00000000,0x00000000,0x00000000,0xd1925ab0,
0x00000000,0x00000000,0x00000000,0x78602649,
0x00000000,0x00000000,0x00000000,0x29131ab6,
0x00000000,0x00000000,0x00000000,0x8edae952,
0x00000000,0x00000000,0x00000000,0x0fc053c3,
0x00000000,0x00000000,0x00000000,0x3b6ba548,
0x00000000,0x00000000,0x00000000,0x3f014f0c,
0x00000000,0x00000000,0x00000000,0xedae9520,
0x00000000,0x00000000,0x00000000,0xfc053c31
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define cns4w(i) m512_const1_128( ( (__m128i*)CNS_INIT)[i] )
#define ADD_CONSTANT4W(a,b,c0,c1)\
a = _mm512_xor_si512(a,c0);\
b = _mm512_xor_si512(b,c1);
#define MULT24W( a0, a1, mask ) \
do { \
__m512i b = _mm512_xor_si512( a0, \
_mm512_shuffle_epi32( _mm512_and_si512(a1,mask), 16 ) ); \
a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\
a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\
} while(0)
// confirm pointer arithmetic
// ok but use array indexes
#define STEP_PART4W(x,c0,c1,t)\
SUBCRUMB4W(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB4W(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD4W(*x,*(x+4),*t,*(t+1));\
MIXWORD4W(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD4W(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT4W(*x, *(x+4), c0, c1);
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
t = _mm512_load_si512(&a0);\
a0 = _mm512_or_si512(a0,a1);\
a2 = _mm512_xor_si512(a2,a3);\
a1 = _mm512_andnot_si512(a1, m512_neg1 );\
a0 = _mm512_xor_si512(a0,a3);\
a3 = _mm512_and_si512(a3,t);\
a1 = _mm512_xor_si512(a1,a3);\
a3 = _mm512_xor_si512(a3,a2);\
a2 = _mm512_and_si512(a2,a0);\
a0 = _mm512_andnot_si512(a0, m512_neg1 );\
a2 = _mm512_xor_si512(a2,a1);\
a1 = _mm512_or_si512(a1,a3);\
t = _mm512_xor_si512(t,a1);\
a3 = _mm512_xor_si512(a3,a2);\
a2 = _mm512_and_si512(a2,a1);\
a1 = _mm512_xor_si512(a1,a0);\
a0 = _mm512_load_si512(&t);
#define MIXWORD4W(a,b,t1,t2)\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,2);\
t2 = _mm512_srli_epi32(a,30);\
a = _mm512_or_si512(t1,t2);\
a = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(b,14);\
t2 = _mm512_srli_epi32(b,18);\
b = _mm512_or_si512(t1,t2);\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,10);\
t2 = _mm512_srli_epi32(a,22);\
a = _mm512_or_si512(t1,t2);\
a = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(b,1);\
t2 = _mm512_srli_epi32(b,31);\
b = _mm512_or_si512(t1,t2);
#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm512_shuffle_epi32(a1,147);\
t0 = _mm512_load_si512(&a1);\
a1 = _mm512_unpacklo_epi32(a1,a0);\
t0 = _mm512_unpackhi_epi32(t0,a0);\
t1 = _mm512_shuffle_epi32(t0,78);\
a0 = _mm512_shuffle_epi32(a1,78);\
SUBCRUMB4W(t1,t0,a0,a1,tmp0);\
t0 = _mm512_unpacklo_epi32(t0,t1);\
a1 = _mm512_unpacklo_epi32(a1,a0);\
a0 = _mm512_load_si512(&a1);\
a0 = _mm512_unpackhi_epi64(a0,t0);\
a1 = _mm512_unpacklo_epi64(a1,t0);\
a1 = _mm512_shuffle_epi32(a1,57);\
MIXWORD4W(a0,a1,tmp0,tmp1);\
ADD_CONSTANT4W(a0,a1,c0,c1);
#define NMLTOM7684W(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm512_load_si512(&r1);\
q2 = _mm512_load_si512(&p1);\
r2 = _mm512_shuffle_epi32(r2,216);\
p2 = _mm512_shuffle_epi32(p2,216);\
r1 = _mm512_unpacklo_epi32(r1,r0);\
p1 = _mm512_unpacklo_epi32(p1,p0);\
s2 = _mm512_unpackhi_epi32(s2,r0);\
q2 = _mm512_unpackhi_epi32(q2,p0);\
s0 = _mm512_load_si512(&r2);\
q0 = _mm512_load_si512(&p2);\
r2 = _mm512_unpacklo_epi64(r2,r1);\
p2 = _mm512_unpacklo_epi64(p2,p1);\
s1 = _mm512_load_si512(&s0);\
q1 = _mm512_load_si512(&q0);\
s0 = _mm512_unpackhi_epi64(s0,r1);\
q0 = _mm512_unpackhi_epi64(q0,p1);\
r2 = _mm512_shuffle_epi32(r2,225);\
p2 = _mm512_shuffle_epi32(p2,225);\
r0 = _mm512_load_si512(&s1);\
p0 = _mm512_load_si512(&q1);\
s0 = _mm512_shuffle_epi32(s0,225);\
q0 = _mm512_shuffle_epi32(q0,225);\
s1 = _mm512_unpacklo_epi64(s1,s2);\
q1 = _mm512_unpacklo_epi64(q1,q2);\
r0 = _mm512_unpackhi_epi64(r0,s2);\
p0 = _mm512_unpackhi_epi64(p0,q2);\
s2 = _mm512_load_si512(&r0);\
q2 = _mm512_load_si512(&p0);\
s3 = _mm512_load_si512(&r2);\
q3 = _mm512_load_si512(&p2);
#define MIXTON7684W(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm512_load_si512(&r0);\
q0 = _mm512_load_si512(&p0);\
s1 = _mm512_load_si512(&r2);\
q1 = _mm512_load_si512(&p2);\
r0 = _mm512_unpackhi_epi32(r0,r1);\
p0 = _mm512_unpackhi_epi32(p0,p1);\
r2 = _mm512_unpackhi_epi32(r2,r3);\
p2 = _mm512_unpackhi_epi32(p2,p3);\
s0 = _mm512_unpacklo_epi32(s0,r1);\
q0 = _mm512_unpacklo_epi32(q0,p1);\
s1 = _mm512_unpacklo_epi32(s1,r3);\
q1 = _mm512_unpacklo_epi32(q1,p3);\
r1 = _mm512_load_si512(&r0);\
p1 = _mm512_load_si512(&p0);\
r0 = _mm512_unpackhi_epi64(r0,r2);\
p0 = _mm512_unpackhi_epi64(p0,p2);\
s0 = _mm512_unpackhi_epi64(s0,s1);\
q0 = _mm512_unpackhi_epi64(q0,q1);\
r1 = _mm512_unpacklo_epi64(r1,r2);\
p1 = _mm512_unpacklo_epi64(p1,p2);\
s2 = _mm512_load_si512(&r0);\
q2 = _mm512_load_si512(&p0);\
s1 = _mm512_load_si512(&r1);\
q1 = _mm512_load_si512(&p1);
#define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm512_load_si512(&r3);\
q1 = _mm512_load_si512(&p3);\
s3 = _mm512_load_si512(&r3);\
q3 = _mm512_load_si512(&p3);\
s1 = _mm512_unpackhi_epi32(s1,r2);\
q1 = _mm512_unpackhi_epi32(q1,p2);\
s3 = _mm512_unpacklo_epi32(s3,r2);\
q3 = _mm512_unpacklo_epi32(q3,p2);\
s0 = _mm512_load_si512(&s1);\
q0 = _mm512_load_si512(&q1);\
s2 = _mm512_load_si512(&s3);\
q2 = _mm512_load_si512(&q3);\
r3 = _mm512_load_si512(&r1);\
p3 = _mm512_load_si512(&p1);\
r1 = _mm512_unpacklo_epi32(r1,r0);\
p1 = _mm512_unpacklo_epi32(p1,p0);\
r3 = _mm512_unpackhi_epi32(r3,r0);\
p3 = _mm512_unpackhi_epi32(p3,p0);\
s0 = _mm512_unpackhi_epi64(s0,r3);\
q0 = _mm512_unpackhi_epi64(q0,p3);\
s1 = _mm512_unpacklo_epi64(s1,r3);\
q1 = _mm512_unpacklo_epi64(q1,p3);\
s2 = _mm512_unpackhi_epi64(s2,r1);\
q2 = _mm512_unpackhi_epi64(q2,p1);\
s3 = _mm512_unpacklo_epi64(s3,r1);\
q3 = _mm512_unpacklo_epi64(q3,p1);
#define MIXTON10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
void rnd512_4way( luffa_4way_context *state, __m512i *msg )
{
__m512i t0, t1;
__m512i *chainv = state->chainv;
__m512i msg0, msg1;
__m512i tmp[2];
__m512i x[8];
const __m512i MASK = m512_const2_64( 0, 0x00000000ffffffff );
t0 = chainv[0];
t1 = chainv[1];
t0 = _mm512_xor_si512( t0, chainv[2] );
t1 = _mm512_xor_si512( t1, chainv[3] );
t0 = _mm512_xor_si512( t0, chainv[4] );
t1 = _mm512_xor_si512( t1, chainv[5] );
t0 = _mm512_xor_si512( t0, chainv[6] );
t1 = _mm512_xor_si512( t1, chainv[7] );
t0 = _mm512_xor_si512( t0, chainv[8] );
t1 = _mm512_xor_si512( t1, chainv[9] );
MULT24W( t0, t1, MASK );
msg0 = _mm512_shuffle_epi32( msg[0], 27 );
msg1 = _mm512_shuffle_epi32( msg[1], 27 );
chainv[0] = _mm512_xor_si512( chainv[0], t0 );
chainv[1] = _mm512_xor_si512( chainv[1], t1 );
chainv[2] = _mm512_xor_si512( chainv[2], t0 );
chainv[3] = _mm512_xor_si512( chainv[3], t1 );
chainv[4] = _mm512_xor_si512( chainv[4], t0 );
chainv[5] = _mm512_xor_si512( chainv[5], t1 );
chainv[6] = _mm512_xor_si512( chainv[6], t0 );
chainv[7] = _mm512_xor_si512( chainv[7], t1 );
chainv[8] = _mm512_xor_si512( chainv[8], t0 );
chainv[9] = _mm512_xor_si512( chainv[9], t1 );
t0 = chainv[0];
t1 = chainv[1];
MULT24W( chainv[0], chainv[1], MASK );
chainv[0] = _mm512_xor_si512( chainv[0], chainv[2] );
chainv[1] = _mm512_xor_si512( chainv[1], chainv[3] );
MULT24W( chainv[2], chainv[3], MASK );
chainv[2] = _mm512_xor_si512(chainv[2], chainv[4]);
chainv[3] = _mm512_xor_si512(chainv[3], chainv[5]);
MULT24W( chainv[4], chainv[5], MASK );
chainv[4] = _mm512_xor_si512(chainv[4], chainv[6]);
chainv[5] = _mm512_xor_si512(chainv[5], chainv[7]);
MULT24W( chainv[6], chainv[7], MASK );
chainv[6] = _mm512_xor_si512(chainv[6], chainv[8]);
chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]);
MULT24W( chainv[8], chainv[9], MASK );
chainv[8] = _mm512_xor_si512( chainv[8], t0 );
chainv[9] = _mm512_xor_si512( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
MULT24W( chainv[8], chainv[9], MASK );
chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] );
chainv[9] = _mm512_xor_si512( chainv[9], chainv[7] );
MULT24W( chainv[6], chainv[7], MASK );
chainv[6] = _mm512_xor_si512( chainv[6], chainv[4] );
chainv[7] = _mm512_xor_si512( chainv[7], chainv[5] );
MULT24W( chainv[4], chainv[5], MASK );
chainv[4] = _mm512_xor_si512( chainv[4], chainv[2] );
chainv[5] = _mm512_xor_si512( chainv[5], chainv[3] );
MULT24W( chainv[2], chainv[3], MASK );
chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] );
chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );
MULT24W( chainv[0], chainv[1], MASK );
chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
MULT24W( msg0, msg1, MASK );
chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
MULT24W( msg0, msg1, MASK );
chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
MULT24W( msg0, msg1, MASK );
chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
MULT24W( msg0, msg1, MASK );
chainv[8] = _mm512_xor_si512( chainv[8], msg0 );
chainv[9] = _mm512_xor_si512( chainv[9], msg1 );
MULT24W( msg0, msg1, MASK );
// TODO: replace these shift/or pairs with a native rotate.
chainv[3] = _mm512_or_si512( _mm512_slli_epi32( chainv[3], 1 ),
_mm512_srli_epi32( chainv[3], 31 ) );
chainv[5] = _mm512_or_si512( _mm512_slli_epi32( chainv[5], 2 ),
_mm512_srli_epi32( chainv[5], 30 ) );
chainv[7] = _mm512_or_si512( _mm512_slli_epi32( chainv[7], 3 ),
_mm512_srli_epi32( chainv[7], 29 ) );
chainv[9] = _mm512_or_si512( _mm512_slli_epi32( chainv[9], 4 ),
_mm512_srli_epi32( chainv[9], 28 ) );
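/* Sketch of the TODO above: AVX-512 provides a native 32-bit rotate, so
   each slli/srli/or triple can collapse to a single instruction, e.g.:
   chainv[3] = _mm512_rol_epi32( chainv[3], 1 );  */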
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
x[0], x[1], x[2], x[3],
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
STEP_PART4W( &x[0], cns4w( 0), cns4w( 1), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 2), cns4w( 3), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 4), cns4w( 5), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 6), cns4w( 7), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 8), cns4w( 9), &tmp[0] );
STEP_PART4W( &x[0], cns4w(10), cns4w(11), &tmp[0] );
STEP_PART4W( &x[0], cns4w(12), cns4w(13), &tmp[0] );
STEP_PART4W( &x[0], cns4w(14), cns4w(15), &tmp[0] );
MIXTON10244W( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
x[4], x[5], x[6], x[7],
chainv[1],chainv[3],chainv[5],chainv[7]);
/* Process last 256-bit block */
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31),
tmp[0], tmp[1] );
}
void finalization512_4way( luffa_4way_context *state, uint32 *b )
{
uint32 hash[8*4] __attribute((aligned(128)));
__m512i* chainv = state->chainv;
__m512i t[2];
__m512i zero[2];
zero[0] = zero[1] = m512_zero;
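// Shuffle control to byte-swap each 32-bit word within its 128-bit lane.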
const __m512i shuff_bswap32 = m512_const_64(
0x3c3d3e3f38393a3b, 0x3435363730313233,
0x2c2d2e2f28292a2b, 0x2425262720212223,
0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
/*---- blank round with m=0 ----*/
rnd512_4way( state, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm512_xor_si512( t[0], chainv[2] );
t[1] = _mm512_xor_si512( t[1], chainv[3] );
t[0] = _mm512_xor_si512( t[0], chainv[4] );
t[1] = _mm512_xor_si512( t[1], chainv[5] );
t[0] = _mm512_xor_si512( t[0], chainv[6] );
t[1] = _mm512_xor_si512( t[1], chainv[7] );
t[0] = _mm512_xor_si512( t[0], chainv[8] );
t[1] = _mm512_xor_si512( t[1], chainv[9] );
t[0] = _mm512_shuffle_epi32( t[0], 27 );
t[1] = _mm512_shuffle_epi32( t[1], 27 );
_mm512_store_si512( (__m512i*)&hash[0], t[0] );
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
casti_m512i( b, 0 ) = _mm512_shuffle_epi8(
casti_m512i( hash, 0 ), shuff_bswap32 );
casti_m512i( b, 1 ) = _mm512_shuffle_epi8(
casti_m512i( hash, 1 ), shuff_bswap32 );
rnd512_4way( state, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm512_xor_si512( t[0], chainv[2] );
t[1] = _mm512_xor_si512( t[1], chainv[3] );
t[0] = _mm512_xor_si512( t[0], chainv[4] );
t[1] = _mm512_xor_si512( t[1], chainv[5] );
t[0] = _mm512_xor_si512( t[0], chainv[6] );
t[1] = _mm512_xor_si512( t[1], chainv[7] );
t[0] = _mm512_xor_si512( t[0], chainv[8] );
t[1] = _mm512_xor_si512( t[1], chainv[9] );
t[0] = _mm512_shuffle_epi32( t[0], 27 );
t[1] = _mm512_shuffle_epi32( t[1], 27 );
_mm512_store_si512( (__m512i*)&hash[0], t[0] );
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
casti_m512i( b, 2 ) = _mm512_shuffle_epi8(
casti_m512i( hash, 0 ), shuff_bswap32 );
casti_m512i( b, 3 ) = _mm512_shuffle_epi8(
casti_m512i( hash, 1 ), shuff_bswap32 );
}
int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
{
state->hashbitlen = hashbitlen;
__m128i *iv = (__m128i*)IV;
state->chainv[0] = m512_const1_128( iv[0] );
state->chainv[1] = m512_const1_128( iv[1] );
state->chainv[2] = m512_const1_128( iv[2] );
state->chainv[3] = m512_const1_128( iv[3] );
state->chainv[4] = m512_const1_128( iv[4] );
state->chainv[5] = m512_const1_128( iv[5] );
state->chainv[6] = m512_const1_128( iv[6] );
state->chainv[7] = m512_const1_128( iv[7] );
state->chainv[8] = m512_const1_128( iv[8] );
state->chainv[9] = m512_const1_128( iv[9] );
((__m512i*)state->buffer)[0] = m512_zero;
((__m512i*)state->buffer)[1] = m512_zero;
return 0;
}
// Do not call luffa_update_close after having called luffa_update.
// Once luffa_update has been called, call only luffa_update or luffa_close.
int luffa_4way_update( luffa_4way_context *state, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buffer = (__m512i*)state->buffer;
__m512i msg[2];
int i;
int blocks = (int)len >> 5;
const __m512i shuff_bswap32 = m512_const_64(
0x3c3d3e3f38393a3b, 0x3435363730313233,
0x2c2d2e2f28292a2b, 0x2425262720212223,
0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
state->rembytes = (int)len & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_4way( state, msg );
}
// A 16 byte partial block remains when the input length is 80 bytes.
// Store it in the buffer and transform it in close so midstate works.
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[1] = m512_const2_64( 0, 0x0000000080000000 );
}
return 0;
}
int luffa_4way_close( luffa_4way_context *state, void *hashval )
{
__m512i *buffer = (__m512i*)state->buffer;
__m512i msg[2];
// transform pad block
if ( state->rembytes )
// not empty, data is in buffer
rnd512_4way( state, buffer );
else
{ // empty pad block, constant data
msg[0] = m512_const2_64( 0, 0x0000000080000000 );
msg[1] = m512_zero;
rnd512_4way( state, msg );
}
finalization512_4way( state, (uint32*)hashval );
if ( state->hashbitlen > 512 )
finalization512_4way( state, (uint32*)( hashval+32 ) );
return 0;
}
int luffa_4way_update_close( luffa_4way_context *state,
void *output, const void *data, size_t inlen )
{
// Optimized for input lengths that are multiples of 16 bytes, e.g. 64 and 80 bytes.
const __m512i *vdata = (__m512i*)data;
__m512i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m512i shuff_bswap32 = m512_const_64(
0x3c3d3e3f38393a3b, 0x3435363730313233,
0x2c2d2e2f28292a2b, 0x2425262720212223,
0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_4way( state, msg );
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = m512_const2_64( 0, 0x0000000080000000 );
rnd512_4way( state, msg );
}
else
{
// empty pad block
msg[0] = m512_const2_64( 0, 0x0000000080000000 );
msg[1] = m512_zero;
rnd512_4way( state, msg );
}
finalization512_4way( state, (uint32*)output );
if ( state->hashbitlen > 512 )
finalization512_4way( state, (uint32*)( output+32 ) );
return 0;
}
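/* A minimal usage sketch (hypothetical buffers, assuming four 80-byte
   inputs already interleaved 4x128-bit, as for a block header):

   uint64_t vdata[ 10*4 ] __attribute__ ((aligned (64)));  // 80 bytes x 4
   uint32_t vhash[ 16*4 ] __attribute__ ((aligned (64)));
   luffa_4way_context ctx;
   luffa_4way_init( &ctx, 512 );
   luffa_4way_update_close( &ctx, vhash, vdata, 80 );
*/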
#endif // AVX512
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
#define ADD_CONSTANT(a,b,c0,c1)\
a = _mm256_xor_si256(a,c0);\
b = _mm256_xor_si256(b,c1);\
b = _mm256_xor_si256(b,c1);
#define MULT2( a0, a1, mask ) \
do { \
@@ -115,7 +695,7 @@ do { \
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s3 = _mm256_load_si256(&r2);\
q3 = _mm256_load_si256(&p2);\
q3 = _mm256_load_si256(&p2);
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm256_load_si256(&r0);\
@@ -174,57 +754,6 @@ do { \
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
/* initial values of chaining variables */
static const uint32 IV[40] __attribute((aligned(32))) = {
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
};
/* Round Constants */
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
0x00000000,0x00000000,0x00000000,0x5090d577,
0x00000000,0x00000000,0x00000000,0xac11d7fa,
0x00000000,0x00000000,0x00000000,0x2d1925ab,
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
0x00000000,0x00000000,0x00000000,0xb46496ac,
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
0x00000000,0x00000000,0x00000000,0xd1925ab0,
0x00000000,0x00000000,0x00000000,0x78602649,
0x00000000,0x00000000,0x00000000,0x29131ab6,
0x00000000,0x00000000,0x00000000,0x8edae952,
0x00000000,0x00000000,0x00000000,0x0fc053c3,
0x00000000,0x00000000,0x00000000,0x3b6ba548,
0x00000000,0x00000000,0x00000000,0x3f014f0c,
0x00000000,0x00000000,0x00000000,0xedae9520,
0x00000000,0x00000000,0x00000000,0xfc053c31
};
/***************************************************/
/* Round function */
@@ -385,13 +914,15 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
void finalization512_2way( luffa_2way_context *state, uint32 *b )
{
uint32 hash[8] __attribute((aligned(64)));
uint32 hash[8*2] __attribute((aligned(64)));
__m256i* chainv = state->chainv;
__m256i t[2];
__m256i zero[2];
zero[0] = zero[1] = m256_zero;
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
/*---- blank round with m=0 ----*/
rnd512_2way( state, zero );
@@ -475,8 +1006,10 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
__m256i msg[2];
int i;
int blocks = (int)len >> 5;
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = (int)len & 0x1F;
// full blocks
@@ -528,8 +1061,10 @@ int luffa_2way_update_close( luffa_2way_context *state,
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F;

View File

@@ -0,0 +1,573 @@
#include <string.h>
#include <immintrin.h>
#include "luffa-hash-2way.h"
#if defined(__AVX2__)
#include "simd-utils.h"
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
#define ADD_CONSTANT(a,b,c0,c1)\
a = _mm256_xor_si256(a,c0);\
b = _mm256_xor_si256(b,c1);\
#define MULT2( a0, a1, mask ) \
do { \
__m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
// Pointer arithmetic confirmed OK, but array indexing would be clearer.
#define STEP_PART(x,c0,c1,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD(*x,*(x+4),*t,*(t+1));\
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT(*x, *(x+4), c0, c1);
#define SUBCRUMB(a0,a1,a2,a3,t)\
t = _mm256_load_si256(&a0);\
a0 = _mm256_or_si256(a0,a1);\
a2 = _mm256_xor_si256(a2,a3);\
a1 = _mm256_andnot_si256(a1, m256_neg1 );\
a0 = _mm256_xor_si256(a0,a3);\
a3 = _mm256_and_si256(a3,t);\
a1 = _mm256_xor_si256(a1,a3);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a0);\
a0 = _mm256_andnot_si256(a0, m256_neg1 );\
a2 = _mm256_xor_si256(a2,a1);\
a1 = _mm256_or_si256(a1,a3);\
t = _mm256_xor_si256(t,a1);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a1);\
a1 = _mm256_xor_si256(a1,a0);\
a0 = _mm256_load_si256(&t);\
#define MIXWORD(a,b,t1,t2)\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,2);\
t2 = _mm256_srli_epi32(a,30);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,14);\
t2 = _mm256_srli_epi32(b,18);\
b = _mm256_or_si256(t1,t2);\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,10);\
t2 = _mm256_srli_epi32(a,22);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,1);\
t2 = _mm256_srli_epi32(b,31);\
b = _mm256_or_si256(t1,t2);
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm256_shuffle_epi32(a1,147);\
t0 = _mm256_load_si256(&a1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
t0 = _mm256_unpackhi_epi32(t0,a0);\
t1 = _mm256_shuffle_epi32(t0,78);\
a0 = _mm256_shuffle_epi32(a1,78);\
SUBCRUMB(t1,t0,a0,a1,tmp0);\
t0 = _mm256_unpacklo_epi32(t0,t1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
a0 = _mm256_load_si256(&a1);\
a0 = _mm256_unpackhi_epi64(a0,t0);\
a1 = _mm256_unpacklo_epi64(a1,t0);\
a1 = _mm256_shuffle_epi32(a1,57);\
MIXWORD(a0,a1,tmp0,tmp1);\
ADD_CONSTANT(a0,a1,c0,c1);
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm256_load_si256(&r1);\
q2 = _mm256_load_si256(&p1);\
r2 = _mm256_shuffle_epi32(r2,216);\
p2 = _mm256_shuffle_epi32(p2,216);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
s2 = _mm256_unpackhi_epi32(s2,r0);\
q2 = _mm256_unpackhi_epi32(q2,p0);\
s0 = _mm256_load_si256(&r2);\
q0 = _mm256_load_si256(&p2);\
r2 = _mm256_unpacklo_epi64(r2,r1);\
p2 = _mm256_unpacklo_epi64(p2,p1);\
s1 = _mm256_load_si256(&s0);\
q1 = _mm256_load_si256(&q0);\
s0 = _mm256_unpackhi_epi64(s0,r1);\
q0 = _mm256_unpackhi_epi64(q0,p1);\
r2 = _mm256_shuffle_epi32(r2,225);\
p2 = _mm256_shuffle_epi32(p2,225);\
r0 = _mm256_load_si256(&s1);\
p0 = _mm256_load_si256(&q1);\
s0 = _mm256_shuffle_epi32(s0,225);\
q0 = _mm256_shuffle_epi32(q0,225);\
s1 = _mm256_unpacklo_epi64(s1,s2);\
q1 = _mm256_unpacklo_epi64(q1,q2);\
r0 = _mm256_unpackhi_epi64(r0,s2);\
p0 = _mm256_unpackhi_epi64(p0,q2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s3 = _mm256_load_si256(&r2);\
q3 = _mm256_load_si256(&p2);\
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm256_load_si256(&r0);\
q0 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r2);\
q1 = _mm256_load_si256(&p2);\
r0 = _mm256_unpackhi_epi32(r0,r1);\
p0 = _mm256_unpackhi_epi32(p0,p1);\
r2 = _mm256_unpackhi_epi32(r2,r3);\
p2 = _mm256_unpackhi_epi32(p2,p3);\
s0 = _mm256_unpacklo_epi32(s0,r1);\
q0 = _mm256_unpacklo_epi32(q0,p1);\
s1 = _mm256_unpacklo_epi32(s1,r3);\
q1 = _mm256_unpacklo_epi32(q1,p3);\
r1 = _mm256_load_si256(&r0);\
p1 = _mm256_load_si256(&p0);\
r0 = _mm256_unpackhi_epi64(r0,r2);\
p0 = _mm256_unpackhi_epi64(p0,p2);\
s0 = _mm256_unpackhi_epi64(s0,s1);\
q0 = _mm256_unpackhi_epi64(q0,q1);\
r1 = _mm256_unpacklo_epi64(r1,r2);\
p1 = _mm256_unpacklo_epi64(p1,p2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r1);\
q1 = _mm256_load_si256(&p1);\
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm256_load_si256(&r3);\
q1 = _mm256_load_si256(&p3);\
s3 = _mm256_load_si256(&r3);\
q3 = _mm256_load_si256(&p3);\
s1 = _mm256_unpackhi_epi32(s1,r2);\
q1 = _mm256_unpackhi_epi32(q1,p2);\
s3 = _mm256_unpacklo_epi32(s3,r2);\
q3 = _mm256_unpacklo_epi32(q3,p2);\
s0 = _mm256_load_si256(&s1);\
q0 = _mm256_load_si256(&q1);\
s2 = _mm256_load_si256(&s3);\
q2 = _mm256_load_si256(&q3);\
r3 = _mm256_load_si256(&r1);\
p3 = _mm256_load_si256(&p1);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
r3 = _mm256_unpackhi_epi32(r3,r0);\
p3 = _mm256_unpackhi_epi32(p3,p0);\
s0 = _mm256_unpackhi_epi64(s0,r3);\
q0 = _mm256_unpackhi_epi64(q0,p3);\
s1 = _mm256_unpacklo_epi64(s1,r3);\
q1 = _mm256_unpacklo_epi64(q1,p3);\
s2 = _mm256_unpackhi_epi64(s2,r1);\
q2 = _mm256_unpackhi_epi64(q2,p1);\
s3 = _mm256_unpacklo_epi64(s3,r1);\
q3 = _mm256_unpacklo_epi64(q3,p1);
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
/* initial values of chaining variables */
static const uint32 IV[40] __attribute((aligned(32))) = {
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
};
/* Round Constants */
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
0x00000000,0x00000000,0x00000000,0x5090d577,
0x00000000,0x00000000,0x00000000,0xac11d7fa,
0x00000000,0x00000000,0x00000000,0x2d1925ab,
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
0x00000000,0x00000000,0x00000000,0xb46496ac,
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
0x00000000,0x00000000,0x00000000,0xd1925ab0,
0x00000000,0x00000000,0x00000000,0x78602649,
0x00000000,0x00000000,0x00000000,0x29131ab6,
0x00000000,0x00000000,0x00000000,0x8edae952,
0x00000000,0x00000000,0x00000000,0x0fc053c3,
0x00000000,0x00000000,0x00000000,0x3b6ba548,
0x00000000,0x00000000,0x00000000,0x3f014f0c,
0x00000000,0x00000000,0x00000000,0xedae9520,
0x00000000,0x00000000,0x00000000,0xfc053c31
};
/***************************************************/
/* Round function */
/* state: hash context */
void rnd512_2way( luffa_2way_context *state, __m256i *msg )
{
__m256i t0, t1;
__m256i *chainv = state->chainv;
__m256i msg0, msg1;
__m256i tmp[2];
__m256i x[8];
const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff );
t0 = chainv[0];
t1 = chainv[1];
t0 = _mm256_xor_si256( t0, chainv[2] );
t1 = _mm256_xor_si256( t1, chainv[3] );
t0 = _mm256_xor_si256( t0, chainv[4] );
t1 = _mm256_xor_si256( t1, chainv[5] );
t0 = _mm256_xor_si256( t0, chainv[6] );
t1 = _mm256_xor_si256( t1, chainv[7] );
t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] );
MULT2( t0, t1, MASK );
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
chainv[0] = _mm256_xor_si256( chainv[0], t0 );
chainv[1] = _mm256_xor_si256( chainv[1], t1 );
chainv[2] = _mm256_xor_si256( chainv[2], t0 );
chainv[3] = _mm256_xor_si256( chainv[3], t1 );
chainv[4] = _mm256_xor_si256( chainv[4], t0 );
chainv[5] = _mm256_xor_si256( chainv[5], t1 );
chainv[6] = _mm256_xor_si256( chainv[6], t0 );
chainv[7] = _mm256_xor_si256( chainv[7], t1 );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1], MASK );
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3], MASK );
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5], MASK );
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7], MASK );
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9], MASK );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
MULT2( chainv[8], chainv[9], MASK );
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7], MASK );
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5], MASK );
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3], MASK );
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1], MASK );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1, MASK );
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
MULT2( msg0, msg1, MASK );
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
MULT2( msg0, msg1, MASK );
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
MULT2( msg0, msg1, MASK );
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
MULT2( msg0, msg1, MASK );
chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
_mm256_srli_epi32( chainv[3], 31 ) );
chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
_mm256_srli_epi32( chainv[5], 30 ) );
chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
_mm256_srli_epi32( chainv[7], 29 ) );
chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
_mm256_srli_epi32( chainv[9], 28 ) );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
x[0], x[1], x[2], x[3],
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
MIXTON1024( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
x[4], x[5], x[6], x[7],
chainv[1],chainv[3],chainv[5],chainv[7]);
/* Process last 256-bit block */
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
tmp[0], tmp[1] );
}
/***************************************************/
/* Finalization function */
/* state: hash context */
/* b[8]: hash values */
void finalization512_2way( luffa_2way_context *state, uint32 *b )
{
uint32 hash[8*2] __attribute((aligned(64)));
__m256i* chainv = state->chainv;
__m256i t[2];
__m256i zero[2];
zero[0] = zero[1] = m256_zero;
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
/*---- blank round with m=0 ----*/
rnd512_2way( state, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
rnd512_2way( state, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
}
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
{
state->hashbitlen = hashbitlen;
__m128i *iv = (__m128i*)IV;
state->chainv[0] = m256_const1_128( iv[0] );
state->chainv[1] = m256_const1_128( iv[1] );
state->chainv[2] = m256_const1_128( iv[2] );
state->chainv[3] = m256_const1_128( iv[3] );
state->chainv[4] = m256_const1_128( iv[4] );
state->chainv[5] = m256_const1_128( iv[5] );
state->chainv[6] = m256_const1_128( iv[6] );
state->chainv[7] = m256_const1_128( iv[7] );
state->chainv[8] = m256_const1_128( iv[8] );
state->chainv[9] = m256_const1_128( iv[9] );
((__m256i*)state->buffer)[0] = m256_zero;
((__m256i*)state->buffer)[1] = m256_zero;
return 0;
}
// Do not call luffa_update_close after having called luffa_update.
// Once luffa_update has been called, call only luffa_update or luffa_close.
int luffa_2way_update( luffa_2way_context *state, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buffer = (__m256i*)state->buffer;
__m256i msg[2];
int i;
int blocks = (int)len >> 5;
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = (int)len & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_2way( state, msg );
}
// A 16 byte partial block remains when the input length is 80 bytes.
// Store it in the buffer and transform it in close so midstate works.
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[1] = m256_const2_64( 0, 0x0000000080000000 );
}
return 0;
}
int luffa_2way_close( luffa_2way_context *state, void *hashval )
{
__m256i *buffer = (__m256i*)state->buffer;
__m256i msg[2];
// transform pad block
if ( state->rembytes )
// not empty, data is in buffer
rnd512_2way( state, buffer );
else
{ // empty pad block, constant data
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
msg[1] = m256_zero;
rnd512_2way( state, msg );
}
finalization512_2way( state, (uint32*)hashval );
if ( state->hashbitlen > 512 )
finalization512_2way( state, (uint32*)( hashval+32 ) );
return 0;
}
int luffa_2way_update_close( luffa_2way_context *state,
void *output, const void *data, size_t inlen )
{
// Optimized for input lengths that are multiples of 16 bytes, e.g. 64 and 80 bytes.
const __m256i *vdata = (__m256i*)data;
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_2way( state, msg );
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = m256_const2_64( 0, 0x0000000080000000 );
rnd512_2way( state, msg );
}
else
{
// empty pad block
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
msg[1] = m256_zero;
rnd512_2way( state, msg );
}
finalization512_2way( state, (uint32*)output );
if ( state->hashbitlen > 512 )
finalization512_2way( state, (uint32*)( output+32 ) );
return 0;
}
#endif

View File

@@ -51,12 +51,30 @@
#define LIMIT_512 128
/*********************************/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
uint32 buffer[8*2] __attribute((aligned(64)));
__m256i chainv[10] __attribute((aligned(32))); /* Chaining values */
uint32 buffer[8*4];
__m512i chainv[10]; /* Chaining values */
int hashbitlen;
int rembytes;
} luffa_2way_context;
} luffa_4way_context __attribute((aligned(128)));
int luffa_4way_init( luffa_4way_context *state, int hashbitlen );
int luffa_4way_update( luffa_4way_context *state, const void *data,
size_t len );
int luffa_4way_close( luffa_4way_context *state, void *hashval );
int luffa_4way_update_close( luffa_4way_context *state, void *output,
const void *data, size_t inlen );
#endif
typedef struct {
uint32 buffer[8*2];
__m256i chainv[10]; /* Chaining values */
int hashbitlen;
int rembytes;
} luffa_2way_context __attribute((aligned(128)));
int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
int luffa_2way_update( luffa_2way_context *state, const void *data,

View File

@@ -0,0 +1,69 @@
#if !defined(LUFFA_HASH_2WAY_H__)
#define LUFFA_HASH_2WAY_H__ 1
/*
* luffa_for_sse2.h
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#if defined(__AVX2__)
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
#include "simd-utils.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256
#define DIGEST_BIT_LEN_384 384
#define DIGEST_BIT_LEN_512 512
/*********************************/
/* The parameters of Luffa */
#define MSG_BLOCK_BIT_LEN 256 /*The bit length of a message block*/
#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
* of a message block*/
/* The number of blocks in Luffa */
#define WIDTH_224 3
#define WIDTH_256 3
#define WIDTH_384 4
#define WIDTH_512 5
/* The limit of the length of message */
#define LIMIT_224 64
#define LIMIT_256 64
#define LIMIT_384 128
#define LIMIT_512 128
/*********************************/
typedef struct {
uint32 buffer[8*2] __attribute((aligned(64)));
__m256i chainv[10] __attribute((aligned(32))); /* Chaining values */
int hashbitlen;
int rembytes;
} luffa_2way_context;
int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
int luffa_2way_update( luffa_2way_context *state, const void *data,
size_t len );
int luffa_2way_close( luffa_2way_context *state, void *hashval );
int luffa_2way_update_close( luffa_2way_context *state, void *output,
const void *data, size_t inlen );
#endif
#endif

View File

@@ -542,8 +542,10 @@ static void finalization512( hashState_luffa *state, uint32 *b )
__m256i* chainv = (__m256i*)state->chainv;
__m256i t;
const __m128i zero = m128_zero;
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
rnd512( state, zero, zero );

View File

@@ -3,22 +3,129 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#if defined(NIST5_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
void nist5hash_4way( void *out, const void *input )
#if defined(NIST5_8WAY)
void nist5hash_8way( void *out, const void *input )
{
uint64_t vhash[8*16] __attribute__ ((aligned (128)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
blake512_8way_context ctx_blake;
hashState_groestl ctx_groestl;
jh512_8way_context ctx_jh;
skein512_8way_context ctx_skein;
keccak512_8way_context ctx_keccak;
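// NIST5 chains the five SHA-3 finalists in order:
// Blake -> Groestl -> JH -> Keccak -> Skein.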
blake512_8way_init( &ctx_blake );
blake512_8way_update( &ctx_blake, input, 80 );
blake512_8way_close( &ctx_blake, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
(const char*)hash0, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash1,
(const char*)hash1, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash2,
(const char*)hash2, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(const char*)hash3, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash4,
(const char*)hash4, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash5,
(const char*)hash5, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash6,
(const char*)hash6, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash7,
(const char*)hash7, 512 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 512 );
jh512_8way_init( &ctx_jh );
jh512_8way_update( &ctx_jh, vhash, 64 );
jh512_8way_close( &ctx_jh, vhash );
keccak512_8way_init( &ctx_keccak );
keccak512_8way_update( &ctx_keccak, vhash, 64 );
keccak512_8way_close( &ctx_keccak, vhash );
skein512_8way_init( &ctx_skein );
skein512_8way_update( &ctx_skein, vhash, 64 );
skein512_8way_close( &ctx_skein, out );
}
int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]);
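// hash[49] is the high dword of 64-bit word 3 of lane 0 in the 8x64
// interleave, i.e. h[7] of lane 0; hash7[ lane<<1 ] then steps to h[7]
// of each successive lane.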
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
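// Write big-endian nonces n..n+7 into the nonce dword of each lane,
// leaving the rest of the interleaved header untouched.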
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
nist5hash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(NIST5_4WAY)
void nist5hash_4way( void *out, const void *input )
{
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
blake512_4way_context ctx_blake;
hashState_groestl ctx_groestl;
jh512_4way_context ctx_jh;
@@ -62,62 +169,39 @@ void nist5hash_4way( void *out, const void *input )
int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[4*24] __attribute__ ((aligned (128)));
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ )
{
if (Htarg <= htmax[m])
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
nist5hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
{
uint32_t mask = masks[m];
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
nist5hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
}
*hashes_done = n - first_nonce + 1;
n += 4;
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}

View File

@@ -2,8 +2,11 @@
bool register_nist5_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
#if defined (NIST5_4WAY)
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
#if defined (NIST5_8WAY)
gate->scanhash = (void*)&scanhash_nist5_8way;
gate->hash = (void*)&nist5hash_8way;
#elif defined (NIST5_4WAY)
gate->scanhash = (void*)&scanhash_nist5_4way;
gate->hash = (void*)&nist5hash_4way;
#else

View File

@@ -1,14 +1,23 @@
#ifndef __NIST5_GATE_H__
#define __NIST5_GATE_H__
#define __NIST5_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define NIST5_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define NIST5_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define NIST5_4WAY 1
#endif
#if defined(NIST5_4WAY)
#if defined(NIST5_8WAY)
void nist5hash_8way( void *state, const void *input );
int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(NIST5_4WAY)
void nist5hash_4way( void *state, const void *input );

View File

@@ -1,12 +1,8 @@
#include "cpuminer-config.h"
#include "quark-gate.h"
#if defined (QUARK_4WAY)
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
@@ -14,6 +10,258 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined (QUARK_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
jh512_8way_context jh;
skein512_8way_context skein;
keccak512_8way_context keccak;
} quark_8way_ctx_holder;
quark_8way_ctx_holder quark_8way_ctx __attribute__ ((aligned (128)));
void init_quark_8way_ctx()
{
blake512_8way_init( &quark_8way_ctx.blake );
bmw512_8way_init( &quark_8way_ctx.bmw );
init_groestl( &quark_8way_ctx.groestl, 64 );
skein512_8way_init( &quark_8way_ctx.skein );
jh512_8way_init( &quark_8way_ctx.jh );
keccak512_8way_init( &quark_8way_ctx.keccak );
}
void quark_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
__m512i* vh = (__m512i*)vhash;
__m512i* vhA = (__m512i*)vhashA;
__m512i* vhB = (__m512i*)vhashB;
__mmask8 vh_mask;
quark_8way_ctx_holder ctx;
const uint32_t mask = 8;
const __m512i bit3_mask = m512_const1_64( mask );
const __m512i zero = _mm512_setzero_si512();
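// Quark selects each lane's next function from bit 3 of the first hash
// word: the groestl path is taken when the bit is set, skein when clear,
// with the analogous choice at the later branch points.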
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// AVX-512 cmpeq returns an 8-bit integer mask instead of a vector mask.
// That should simplify the logic, but it wasn't working: the test that
// skips a hash when it isn't needed sometimes failed to produce a hash
// that was needed, even though the blend that selects between hashes,
// driven by the same __mmask8, was correct. The mask works as a blend
// mask but not in a logical comparison; a cast to int or a movm may be
// needed. The workaround is to skip the skip-test and always hash all
// 8 lanes. This is nearly moot anyway: with 8 parallel lanes a hash can
// be skipped only 1 iteration in 256, so the performance impact of the
// workaround should be negligible. A problem for another day.
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
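/* Illustrative sketch (assumed vector v) of the mask semantics discussed
   above: a __mmask8 is a plain 8-bit integer with one bit per 64-bit lane.

   __mmask8 k = _mm512_cmpeq_epi64_mask(
                      _mm512_and_si512( v, _mm512_set1_epi64( 8 ) ),
                      _mm512_setzero_si512() );
   // Bit i of k is set iff lane i has bit 3 clear. Test it with ordinary
   // integer ops: ( k & 0xff ) != 0 for "any lane", == 0xff for "all".
*/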
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 512 );
if ( hash0[0] & mask )
{
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
}
if ( hash1[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
}
if ( hash2[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
}
if ( hash3[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
}
if ( hash4[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(char*)hash4, 512 );
}
if ( hash5[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(char*)hash5, 512 );
}
if ( hash6[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(char*)hash6, 512 );
}
if ( hash7[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(char*)hash7, 512 );
}
intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 512 );
if ( vh_mask & 0xff )
{
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhashB );
}
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
512 );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
if ( ( vh_mask & 0xff ) != 0xff )
{
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, vhash, 64 );
blake512_8way_close( &ctx.blake, vhashA );
}
if ( vh_mask & 0xff )
{
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhashB );
}
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
if ( ( vh_mask & 0xff ) != 0xff )
{
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhashA );
}
if ( vh_mask & 0xff )
{
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhashB );
}
// Final blend, written directly to state; only 32 bytes per lane are needed.
casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] );
casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] );
casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] );
casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] );
}
int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
quark_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
{
extr_lane_8x64( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, lane_hash, mythr, i );
}
}
n += 8;
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
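The hash7 shortcut above follows from the 8x64 interleaved layout: 64-bit word w of lane l sits at 64-bit index w*8 + l, so the 7th 32-bit word of each lane's hash (the high half of 64-bit word 3 on little-endian) lands at 32-bit index 2*(3*8 + l) + 1 = 49 + 2*l, which is exactly &hash[49] plus the i<<1 stride. A scalar sketch of that index calculation (lane_word7 is a hypothetical name):

// Sketch: locate lane l's 7th 32-bit word in an 8x64 interleaved buffer.
// Assumes little-endian, matching the hash7 = &hash[49] shortcut above.
#include <stdint.h>

static inline uint32_t lane_word7( const uint32_t *hash, int lane )
{
   // 64-bit word 3 of lane l is at 64-bit index 3*8 + l; its high
   // 32 bits are at 32-bit index 2*(3*8 + l) + 1 = 49 + 2*lane.
   return hash[ 49 + 2*lane ];
}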
#elif defined (QUARK_4WAY)
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
@@ -91,7 +339,7 @@ void quark_4way_hash( void *state, const void *input )
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
if ( mm256_anybits0( vh_mask ) )
if ( mm256_anybits1( vh_mask ) )
{
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
@@ -117,14 +365,14 @@ void quark_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
if ( mm256_anybits1( vh_mask ) )
if ( mm256_anybits0( vh_mask ) )
{
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
if ( mm256_anybits1( vh_mask ) )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
@@ -142,14 +390,14 @@ void quark_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
if ( mm256_anybits1( vh_mask ) )
if ( mm256_anybits0( vh_mask ) )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
if ( mm256_anybits1( vh_mask ) )
{
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );

View File

@@ -2,7 +2,11 @@
bool register_quark_algo( algo_gate_t* gate )
{
#if defined (QUARK_4WAY)
#if defined (QUARK_8WAY)
init_quark_8way_ctx();
gate->scanhash = (void*)&scanhash_quark_8way;
gate->hash = (void*)&quark_8way_hash;
#elif defined (QUARK_4WAY)
init_quark_4way_ctx();
gate->scanhash = (void*)&scanhash_quark_4way;
gate->hash = (void*)&quark_4way_hash;
@@ -11,7 +15,7 @@ bool register_quark_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,13 +4,22 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define QUARK_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define QUARK_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUARK_4WAY 1
#endif
bool register_quark_algo( algo_gate_t* gate );
#if defined(QUARK_4WAY)
#if defined(QUARK_8WAY)
void quark_8way_hash( void *state, const void *input );
int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_quark_8way_ctx();
#elif defined(QUARK_4WAY)
void quark_4way_hash( void *state, const void *input );
int scanhash_quark_4way( struct work *work, uint32_t max_nonce,

View File

@@ -1,7 +1,4 @@
#include "qubit-gate.h"
#if defined(QUBIT_2WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -12,6 +9,160 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(QUBIT_4WAY)
typedef struct
{
luffa_4way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
} qubit_4way_ctx_holder;
qubit_4way_ctx_holder qubit_4way_ctx;
void init_qubit_4way_ctx()
{
cubehashInit(&qubit_4way_ctx.cube,512,16,32);
sph_shavite512_init(&qubit_4way_ctx.shavite);
simd_4way_init( &qubit_4way_ctx.simd, 512 );
init_echo(&qubit_4way_ctx.echo, 512);
};
void qubit_4way_hash( void *output, const void *input )
{
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
qubit_4way_ctx_holder ctx;
memcpy( &ctx, &qubit_4way_ctx, sizeof(qubit_4way_ctx) );
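// The shared prototype context is cloned once per call; the serial stages
// below re-copy their sub-contexts between lanes instead of re-initing.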
luffa_4way_update( &ctx.luffa, input + (64<<2), 16 );
luffa_4way_close( &ctx.luffa, vhash );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &qubit_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &qubit_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &qubit_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*16] __attribute__ ((aligned (128)));
uint32_t vdata[4*24] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
casti_m512i( endiandata, 0 ) = mm512_bswap_32( casti_m512i( pdata, 0 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_4x128( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
luffa_4way_init( &qubit_4way_ctx.luffa, 512 );
luffa_4way_update( &qubit_4way_ctx.luffa, vdata, 64 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+4, n+1 );
be32enc( noncep+8, n+2 );
be32enc( noncep+12, n+3 );
qubit_4way_hash( hash, vdata );
pdata[19] = n;
if ( !( hash[7] & mask ) )
if ( fulltest( hash, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_lane_solution( work, hash, mythr, 0 );
}
if ( !( (hash+8)[7] & mask ) )
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
{
pdata[19] = n+1;
submit_lane_solution( work, hash+8, mythr, 1 );
}
if ( !( (hash+16)[7] & mask ) )
if ( fulltest( hash+16, ptarget) && !opt_benchmark )
{
pdata[19] = n+2;
submit_lane_solution( work, hash+16, mythr, 2 );
}
if ( !( (hash+24)[7] & mask ) )
if ( fulltest( hash+24, ptarget) && !opt_benchmark )
{
pdata[19] = n+3;
submit_lane_solution( work, hash+24, mythr, 3 );
}
n += 4;
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce;
return 0;
}
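The per-lane checks above are precedence-sensitive: hash+16[7] parses as hash + (16[7]), not (hash+16)[7], so each lane's 8-word slice must be parenthesized before indexing, as corrected above. A loop-form sketch that avoids the pitfall altogether (check_lanes is a hypothetical helper; fulltest, submit_lane_solution and opt_benchmark are the miner's own, used as above):

// Sketch: loop form of the per-lane result check (4 lanes, 8 uint32 each).
static void check_lanes( struct work *work, uint32_t *hash, uint32_t *pdata,
                         const uint32_t *ptarget, uint32_t n, uint32_t mask,
                         struct thr_info *mythr )
{
   for ( int lane = 0; lane < 4; lane++ )
   {
      uint32_t *lane_hash = hash + (lane << 3);   // 32 bytes per lane
      if ( !( lane_hash[7] & mask ) )
      if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
      {
         pdata[19] = n + lane;
         submit_lane_solution( work, lane_hash, mythr, lane );
      }
   }
}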
#elif defined(QUBIT_2WAY)
typedef struct
{
luffa_2way_context luffa;

View File

@@ -2,6 +2,13 @@
bool register_qubit_algo( algo_gate_t* gate )
{
/*
#if defined (QUBIT_4WAY)
init_qubit_2way_ctx();
gate->scanhash = (void*)&scanhash_qubit_4way;
gate->hash = (void*)&qubit_4way_hash;
#elif defined (QUBIT_4WAY)
*/
#if defined (QUBIT_2WAY)
init_qubit_2way_ctx();
gate->scanhash = (void*)&scanhash_qubit_2way;

View File

@@ -4,12 +4,26 @@
#include "algo-gate-api.h"
#include <stdint.h>
/*
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define QUBIT_2WAY 1
#elif defined(__AVX2__) && defined(__AES__)
*/
#if defined(__AVX2__) && defined(__AES__)
#define QUBIT_2WAY
#define QUBIT_2WAY 1
#endif
bool register_qubit_algo( algo_gate_t* gate );
/*
#if defined(QUBIT_4WAY)
void qubit_4way_hash( void *state, const void *input );
int scanhash_qubit_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_qubit_4way_ctx();
#elif defined(QUBIT_2WAY)
*/
#if defined(QUBIT_2WAY)
void qubit_2way_hash( void *state, const void *input );

View File

@@ -285,8 +285,10 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f,
0x0001020304050607 );
const __m256i shuff_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f,
0x1011121314151617,
0x08090a0b0c0d0e0f,
0x0001020304050607 );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
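The widened shuff_bswap64 constant is a byte-shuffle control for _mm256_shuffle_epi8: each 64-bit word lists its own byte indices in reverse, byte-swapping all four words in one instruction. PSHUFB only reads bit 7 and the low 4 bits of each control byte within its 128-bit lane, so the old two-value and new four-value spellings select the same bytes. A self-contained sketch of the trick, assuming AVX2 (bswap64x4 is a hypothetical name):

// Sketch: byte-swap every 64-bit word of a __m256i with PSHUFB.
// Control indices repeat per 128-bit lane.
#include <immintrin.h>

static inline __m256i bswap64x4( __m256i v )
{
   const __m256i ctl = _mm256_set_epi8(
        8, 9,10,11,12,13,14,15,  0, 1, 2, 3, 4, 5, 6, 7,   // high 128 bits
        8, 9,10,11,12,13,14,15,  0, 1, 2, 3, 4, 5, 6, 7 ); // low 128 bits
   return _mm256_shuffle_epi8( v, ctl );
}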

File diff suppressed because it is too large

View File

@@ -7,15 +7,37 @@
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
uint32_t A[ 32*2 ] __attribute__((aligned(64)));
uint8_t buffer[ 128*2 ] __attribute__((aligned(64)));
uint32_t A[ 32*4 ];
uint8_t buffer[ 128*4 ];
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd_4way_context __attribute__((aligned(128)));
int simd_4way_init( simd_4way_context *state, int hashbitlen );
int simd_4way_update( simd_4way_context *state, const void *data,
int databitlen );
int simd_4way_close( simd_4way_context *state, void *hashval );
int simd_4way_update_close( simd_4way_context *state, void *hashval,
const void *data, int databitlen );
#endif
typedef struct {
uint32_t A[ 32*2 ];
uint8_t buffer[ 128*2 ];
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd_2way_context;
} simd_2way_context __attribute__((aligned(128)));
int simd_2way_init( simd_2way_context *state, int hashbitlen );
int simd_2way_update( simd_2way_context *state, const void *data,

View File

@@ -15,7 +15,7 @@
void skeinhash_8way( void *state, const void *input )
{
uint64_t vhash64[16*8] __attribute__ ((aligned (128)));
uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
skein512_8way_context ctx_skein;
//#if defined(__SHA__)
@@ -29,7 +29,7 @@ void skeinhash_8way( void *state, const void *input )
// uint32_t hash7[16] __attribute__ ((aligned (64)));
// SHA256_CTX ctx_sha256;
//#else
uint32_t vhash32[32*8] __attribute__ ((aligned (128)));
uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
sha256_8way_context ctx_sha256;
//#endif
@@ -135,7 +135,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
void skeinhash_4way( void *state, const void *input )
{
uint64_t vhash64[16*4] __attribute__ ((aligned (64)));
uint64_t vhash64[8*4] __attribute__ ((aligned (128)));
skein512_4way_context ctx_skein;
#if defined(__SHA__)
uint32_t hash0[16] __attribute__ ((aligned (64)));

View File

@@ -3,22 +3,121 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#if defined(TRIBUS_4WAY)
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64)));
static __thread jh512_4way_context ctx_mid;
/*
void init_tribus_4way_ctx()
#if defined(TRIBUS_8WAY)
static __thread jh512_8way_context ctx_mid;
void tribus_hash_8way( void *state, const void *input )
{
init_echo( &tribus_4way_ctx, 512 );
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
jh512_8way_context ctx_jh;
keccak512_8way_context ctx_keccak;
hashState_echo ctx_echo;
memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );
jh512_8way_update( &ctx_jh, input + (64<<3), 16 );
jh512_8way_close( &ctx_jh, vhash );
keccak512_8way_init( &ctx_keccak );
keccak512_8way_update( &ctx_keccak, vhash, 64 );
keccak512_8way_close( &ctx_keccak, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 512 );
// hash echo serially
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash3,
(const BitSequence *) hash3, 512 );
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash4,
(const BitSequence *) hash4, 512 );
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash5,
(const BitSequence *) hash5, 512 );
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash6,
(const BitSequence *) hash6, 512 );
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash7,
(const BitSequence *) hash7, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
*/
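The eight init_echo/update_final_echo pairs in tribus_hash_8way run serially because ECHO has no vectorized implementation here; the lanes are deinterleaved first and hashed one at a time. A compact loop-form sketch over the same AES-NI ECHO API (echo_lanes is a hypothetical helper):

// Sketch: serial ECHO-512 pass over 8 deinterleaved lane buffers.
static void echo_lanes( uint64_t *lanes[8] )
{
   hashState_echo ctx_echo;
   for ( int i = 0; i < 8; i++ )
   {
      init_echo( &ctx_echo, 512 );
      update_final_echo( &ctx_echo, (BitSequence*)lanes[i],
                         (const BitSequence*)lanes[i], 512 );
   }
}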
void tribus_hash_4way(void *state, const void *input)
int scanhash_tribus_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
jh512_8way_init( &ctx_mid );
jh512_8way_update( &ctx_mid, vdata, 64 );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
tribus_hash_8way( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] < Htarg )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(TRIBUS_4WAY)
static __thread jh512_4way_context ctx_mid;
void tribus_hash_4way( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
@@ -30,11 +129,11 @@ void tribus_hash_4way(void *state, const void *input)
hashState_echo ctx_echo;
memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );
jh512_4way( &ctx_jh, input + (64<<2), 16 );
jh512_4way_update( &ctx_jh, input + (64<<2), 16 );
jh512_4way_close( &ctx_jh, vhash );
keccak512_4way_init( &ctx_keccak );
keccak512_4way( &ctx_keccak, vhash, 64 );
keccak512_4way_update( &ctx_keccak, vhash, 64 );
keccak512_4way_close( &ctx_keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -60,7 +159,7 @@ void tribus_hash_4way(void *state, const void *input)
}
int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -70,57 +169,32 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
// precalc midstate
// doing it one way first then interleaving would be faster but it's too
// complicated to interleave the context.
jh512_4way_init( &ctx_mid );
jh512_4way( &ctx_mid, vdata, 64 );
jh512_4way_update( &ctx_mid, vdata, 64 );
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
tribus_hash_4way( hash, vdata );
tribus_hash_4way( hash, vdata );
pdata[19] = n;
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( !( (hash+(i<<3))[7] & mask ) )
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart);
break;
}
}
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] < Htarg )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
*hashes_done = n - first_nonce;
return 0;
}
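Both tribus scanhash variants amortize work with a JH-512 midstate: the constant first 64 header bytes are absorbed once into ctx_mid, and each nonce batch copies that context and hashes only the final 16 nonce-bearing bytes. A condensed sketch of the pattern with the same 4-way API (tribus_midstate_demo is a hypothetical name; assumes algo/jh/jh-hash-4way.h and <string.h>):

// Sketch: midstate reuse across nonce batches (input is the 80-byte
// header interleaved 4x64; vhash receives the four JH-512 outputs).
static void tribus_midstate_demo( const void *input, uint64_t *vhash )
{
   jh512_4way_context mid, ctx;
   jh512_4way_init( &mid );
   jh512_4way_update( &mid, input, 64 );            // constant prefix, once
   // per nonce batch: copying the context beats re-hashing 64 bytes
   memcpy( &ctx, &mid, sizeof(mid) );
   jh512_4way_update( &ctx, input + (64<<2), 16 );  // nonce-bearing tail
   jh512_4way_close( &ctx, vhash );
}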

View File

@@ -2,9 +2,11 @@
bool register_tribus_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
#if defined (TRIBUS_4WAY)
// init_tribus_4way_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
#if defined (TRIBUS_8WAY)
gate->scanhash = (void*)&scanhash_tribus_8way;
gate->hash = (void*)&tribus_hash_8way;
#elif defined (TRIBUS_4WAY)
gate->scanhash = (void*)&scanhash_tribus_4way;
gate->hash = (void*)&tribus_hash_4way;
#else

View File

@@ -1,16 +1,23 @@
#ifndef TRIBUS_GATE_H__
#define TRIBUS_GATE_H__
#define TRIBUS_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define TRIBUS_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define TRIBUS_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define TRIBUS_4WAY 1
#endif
#if defined(TRIBUS_4WAY)
#if defined(TRIBUS_8WAY)
//void init_tribus_4way_ctx();
void tribus_hash_8way( void *state, const void *input );
int scanhash_tribus_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(TRIBUS_4WAY)
void tribus_hash_4way( void *state, const void *input );