v3.20.3

2026-07-14 10:56:50 +00:00 · 2022-10-21 23:12:18 -04:00
parent 58030e2788
commit bd84f199fe
35 changed files with 983 additions and 4938 deletions
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -316,7 +316,7 @@ static const sph_u32 CS[16] = {
                                          CSx( r, 5 ) ^ Mx( r, 4 ), \
                                          CSx( r, 3 ) ^ Mx( r, 2 ), \
                                          CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
-   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
+   V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
   V2 = _mm_add_epi32( V2, V3 ); \
   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
@@ -324,7 +324,7 @@ static const sph_u32 CS[16] = {
                                          CSx( r, 4 ) ^ Mx( r, 5 ), \
                                          CSx( r, 2 ) ^ Mx( r, 3 ), \
                                          CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
-   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
+   V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
   V2 = _mm_add_epi32( V2, V3 ); \
   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
   V3 = mm128_shufll_32( V3 ); \
@@ -335,7 +335,7 @@ static const sph_u32 CS[16] = {
                                          CSx( r, D ) ^ Mx( r, C ), \
                                          CSx( r, B ) ^ Mx( r, A ), \
                                          CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); \
-   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
+   V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
   V2 = _mm_add_epi32( V2, V3 ); \
   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
@@ -343,7 +343,7 @@ static const sph_u32 CS[16] = {
                                          CSx( r, C ) ^ Mx( r, D ), \
                                          CSx( r, A ) ^ Mx( r, B ), \
                                          CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
-   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
+   V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
   V2 = _mm_add_epi32( V2, V3 ); \
   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
   V3 = mm128_shuflr_32( V3 ); \
--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -78,7 +78,8 @@
  V[1] = mm256_shufll_64( V[1] ); \
 }

-#elif defined(__SSSE3__)
+#elif defined(__SSE2__)
+// always true, SSE2 is always available

 #define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
 { \
@@ -115,6 +116,7 @@
 }

 #else
+// never used, SSE2 is always available

 #ifndef ROTR64
 #define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -747,38 +747,40 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
   mj[14] = mm256_rol_64( M[14], 15 );
   mj[15] = mm256_rol_64( M[15], 16 );

-   qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
-              (const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
-   qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8],
-              (const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) );
-   qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9],
-              (const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) );
-   qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10],
-              (const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) );
-   qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11],
-              (const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) );
-   qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12],
-              (const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) );
-   qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13],
-              (const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) );
-   qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14],
-              (const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) );
-   qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15],
-              (const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) );
-   qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0],
-              (const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) );
-   qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1],
-              (const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) );
-   qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2],
-              (const __m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) );
-   qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3],
-              (const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) );
-   qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4],
-              (const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) );
-   qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5],
-              (const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) );
-   qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6],
-              (const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) );
+   __m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
+   const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
+
+   qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], K );
+   K = _mm256_add_epi64( K, Kincr );
+   qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], K );

   qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) );
   qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) );
@@ -1180,7 +1182,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
   qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );

   __m512i mj[16];
-   uint64_t K = 16 * 0x0555555555555555ULL;
 
   mj[ 0] = mm512_rol_64( M[ 0],  1 );
   mj[ 1] = mm512_rol_64( M[ 1],  2 );
@@ -1199,54 +1200,40 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
   mj[14] = mm512_rol_64( M[14], 15 );
   mj[15] = mm512_rol_64( M[15], 16 );

-   qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
-                        (const __m512i)_mm512_set1_epi64( K ) );
-   K += 0x0555555555555555ULL;
-   qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
-                        (const __m512i)_mm512_set1_epi64( K ) );
+   __m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
+   const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );

+   qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5], K );
+   K = _mm512_add_epi64( K, Kincr );
+   qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6], K );

   qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
   qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -62,186 +62,66 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {

 #define cns4w(i)  m512_const1_128( ( (__m128i*)CNS_INIT)[i] )

-#define ADD_CONSTANT4W(a,b,c0,c1)\
-    a = _mm512_xor_si512(a,c0);\
-    b = _mm512_xor_si512(b,c1);
+#define ADD_CONSTANT4W( a, b, c0, c1 ) /* xors c0 into a and c1 into b, in place; expands to two bare statements */ \
+    a = _mm512_xor_si512( a, c0 ); \
+    b = _mm512_xor_si512( b, c1 );

 #define MULT24W( a0, a1 ) \
-do { \
+{ \
  __m512i b = _mm512_xor_si512( a0, \
                     _mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \
-  a0 = _mm512_or_si512( _mm512_bsrli_epi128(  b, 4 ), \
-                        _mm512_bslli_epi128( a1,12 ) ); \
-  a1 = _mm512_or_si512( _mm512_bsrli_epi128( a1, 4 ), \
-                        _mm512_bslli_epi128(  b,12 ) ); \
-} while(0)
+  a0 = _mm512_alignr_epi8( a1,  b, 4 ); \
+  a1 = _mm512_alignr_epi8(  b, a1, 4 ); \
+}

-/*
-#define MULT24W( a0, a1, mask ) \
-do { \
-  __m512i b = _mm512_xor_si512( a0, \
-                   _mm512_shuffle_epi32( _mm512_and_si512(a1,mask), 16 ) ); \
-  a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\
-  a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\
-} while(0)
-*/
-
-// confirm pointer arithmetic
-// ok but use array indexes
-#define STEP_PART4W(x,c0,c1,t)\
-    SUBCRUMB4W(*x,*(x+1),*(x+2),*(x+3),*t);\
-    SUBCRUMB4W(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
-    MIXWORD4W(*x,*(x+4),*t,*(t+1));\
-    MIXWORD4W(*(x+1),*(x+5),*t,*(t+1));\
-    MIXWORD4W(*(x+2),*(x+6),*t,*(t+1));\
-    MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
-    ADD_CONSTANT4W(*x, *(x+4), c0, c1);
-
-#define SUBCRUMB4W(a0,a1,a2,a3,t)\
-    t  = a0;\
+#define SUBCRUMB4W( a0, a1, a2, a3 ) \
+{ \
+    __m512i t = a0; \
    a0 = mm512_xoror( a3, a0, a1 ); \
-    a2 = _mm512_xor_si512(a2,a3);\
+    a2 = _mm512_xor_si512( a2, a3 ); \
    a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
    a3 = mm512_xorand( a2, a3, t ); \
-    a2 = mm512_xorand( a1, a2, a0);\
-    a1 = _mm512_or_si512(a1,a3);\
-    a3 = _mm512_xor_si512(a3,a2);\
-    t  = _mm512_xor_si512(t,a1);\
-    a2 = _mm512_and_si512(a2,a1);\
-    a1 = mm512_xnor(a1,a0);\
-    a0 = t;
+    a2 = mm512_xorand( a1, a2, a0); \
+    a1 = _mm512_or_si512( a1, a3 ); \
+    a3 = _mm512_xor_si512( a3, a2 ); \
+    t  = _mm512_xor_si512( t, a1 ); \
+    a2 = _mm512_and_si512( a2, a1 ); \
+    a1 = mm512_xnor( a1, a0 ); \
+    a0 = t; \
+}

-/*
-#define SUBCRUMB4W(a0,a1,a2,a3,t)\
-    t  = _mm512_load_si512(&a0);\
-    a0 = _mm512_or_si512(a0,a1);\
-    a2 = _mm512_xor_si512(a2,a3);\
-    a1 = _mm512_andnot_si512(a1, m512_neg1 );\
-    a0 = _mm512_xor_si512(a0,a3);\
-    a3 = _mm512_and_si512(a3,t);\
-    a1 = _mm512_xor_si512(a1,a3);\
-    a3 = _mm512_xor_si512(a3,a2);\
-    a2 = _mm512_and_si512(a2,a0);\
-    a0 = _mm512_andnot_si512(a0, m512_neg1 );\
-    a2 = _mm512_xor_si512(a2,a1);\
-    a1 = _mm512_or_si512(a1,a3);\
-    t  = _mm512_xor_si512(t,a1);\
-    a3 = _mm512_xor_si512(a3,a2);\
-    a2 = _mm512_and_si512(a2,a1);\
-    a1 = _mm512_xor_si512(a1,a0);\
-    a0 = _mm512_load_si512(&t);
-*/
+#define MIXWORD4W( a, b ) /* mixes a and b in place with xor and 32-bit rotates; expands to bare statements, no enclosing block */ \
+    b = _mm512_xor_si512( a, b );                          /* b ^= a */ \
+    a = _mm512_xor_si512( b, _mm512_rol_epi32( a, 2 ) );   /* a = b ^ rol32( a, 2 ) */ \
+    b = _mm512_xor_si512( a, _mm512_rol_epi32( b, 14 ) );  /* b = a ^ rol32( b, 14 ) */ \
+    a = _mm512_xor_si512( b, _mm512_rol_epi32( a, 10 ) );  /* a = b ^ rol32( a, 10 ) */ \
+    b = _mm512_rol_epi32( b, 1 );                          /* b = rol32( b, 1 ) */

-#define MIXWORD4W(a,b,t1,t2)\
-    b  = _mm512_xor_si512(a,b);\
-    t1 = _mm512_slli_epi32(a,2);\
-    t2 = _mm512_srli_epi32(a,30);\
-    a  = mm512_xoror( b, t1, t2 ); \
-    t1 = _mm512_slli_epi32(b,14);\
-    t2 = _mm512_srli_epi32(b,18);\
-    b  = _mm512_or_si512(t1,t2);\
-    b  = mm512_xoror( a, t1, t2 ); \
-    t1 = _mm512_slli_epi32(a,10);\
-    t2 = _mm512_srli_epi32(a,22);\
-    a  = mm512_xoror( b, t1, t2 ); \
-    t1 = _mm512_slli_epi32(b,1);\
-    t2 = _mm512_srli_epi32(b,31);\
-    b  = _mm512_or_si512(t1,t2);
+#define STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) /* one step over x0..x7, all updated in place; expands to bare statements */ \
+    SUBCRUMB4W( x0, x1, x2, x3 ); \
+    SUBCRUMB4W( x5, x6, x7, x4 ); /* second group is passed in the order x5, x6, x7, x4 */ \
+    MIXWORD4W( x0, x4 ); /* each x[i] is mixed with x[i+4] */ \
+    MIXWORD4W( x1, x5 ); \
+    MIXWORD4W( x2, x6 ); \
+    MIXWORD4W( x3, x7 ); \
+    ADD_CONSTANT4W( x0, x4, c0, c1 ); /* x0 ^= c0, x4 ^= c1 */

-/*
-#define MIXWORD4W(a,b,t1,t2)\
-    b  = _mm512_xor_si512(a,b);\
-    t1 = _mm512_slli_epi32(a,2);\
-    t2 = _mm512_srli_epi32(a,30);\
-     a = _mm512_or_si512(t1,t2);\
-    a  = _mm512_xor_si512(a,b);\
-    t1 = _mm512_slli_epi32(b,14);\
-    t2 = _mm512_srli_epi32(b,18);\
-    b  = _mm512_or_si512(t1,t2);\
-    b  = _mm512_xor_si512(a,b);\
-    t1 = _mm512_slli_epi32(a,10);\
-    t2 = _mm512_srli_epi32(a,22);\
-    a  = _mm512_or_si512(t1,t2);\
-    a  = _mm512_xor_si512(a,b);\
-    t1 = _mm512_slli_epi32(b,1);\
-    t2 = _mm512_srli_epi32(b,31);\
-    b  = _mm512_or_si512(t1,t2);
-*/
-
-#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
-    a1 = _mm512_shuffle_epi32(a1,147);\
-    t0 = _mm512_load_si512(&a1);\
-    a1 = _mm512_unpacklo_epi32(a1,a0);\
-    t0 = _mm512_unpackhi_epi32(t0,a0);\
-    t1 = _mm512_shuffle_epi32(t0,78);\
-    a0 = _mm512_shuffle_epi32(a1,78);\
-    SUBCRUMB4W(t1,t0,a0,a1,tmp0);\
-    t0 = _mm512_unpacklo_epi32(t0,t1);\
-    a1 = _mm512_unpacklo_epi32(a1,a0);\
-    a0 = _mm512_load_si512(&a1);\
-    a0 = _mm512_unpackhi_epi64(a0,t0);\
-    a1 = _mm512_unpacklo_epi64(a1,t0);\
-    a1 = _mm512_shuffle_epi32(a1,57);\
-    MIXWORD4W(a0,a1,tmp0,tmp1);\
-    ADD_CONSTANT4W(a0,a1,c0,c1);
-
-#define NMLTOM7684W(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
-    s2 = _mm512_load_si512(&r1);\
-    q2 = _mm512_load_si512(&p1);\
-    r2 = _mm512_shuffle_epi32(r2,216);\
-    p2 = _mm512_shuffle_epi32(p2,216);\
-    r1 = _mm512_unpacklo_epi32(r1,r0);\
-    p1 = _mm512_unpacklo_epi32(p1,p0);\
-    s2 = _mm512_unpackhi_epi32(s2,r0);\
-    q2 = _mm512_unpackhi_epi32(q2,p0);\
-    s0 = _mm512_load_si512(&r2);\
-    q0 = _mm512_load_si512(&p2);\
-    r2 = _mm512_unpacklo_epi64(r2,r1);\
-    p2 = _mm512_unpacklo_epi64(p2,p1);\
-    s1 = _mm512_load_si512(&s0);\
-    q1 = _mm512_load_si512(&q0);\
-    s0 = _mm512_unpackhi_epi64(s0,r1);\
-    q0 = _mm512_unpackhi_epi64(q0,p1);\
-    r2 = _mm512_shuffle_epi32(r2,225);\
-    p2 = _mm512_shuffle_epi32(p2,225);\
-    r0 = _mm512_load_si512(&s1);\
-    p0 = _mm512_load_si512(&q1);\
-    s0 = _mm512_shuffle_epi32(s0,225);\
-    q0 = _mm512_shuffle_epi32(q0,225);\
-    s1 = _mm512_unpacklo_epi64(s1,s2);\
-    q1 = _mm512_unpacklo_epi64(q1,q2);\
-    r0 = _mm512_unpackhi_epi64(r0,s2);\
-    p0 = _mm512_unpackhi_epi64(p0,q2);\
-    s2 = _mm512_load_si512(&r0);\
-    q2 = _mm512_load_si512(&p0);\
-    s3 = _mm512_load_si512(&r2);\
-    q3 = _mm512_load_si512(&p2);
-
-#define MIXTON7684W(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
-    s0 = _mm512_load_si512(&r0);\
-    q0 = _mm512_load_si512(&p0);\
-    s1 = _mm512_load_si512(&r2);\
-    q1 = _mm512_load_si512(&p2);\
-    r0 = _mm512_unpackhi_epi32(r0,r1);\
-    p0 = _mm512_unpackhi_epi32(p0,p1);\
-    r2 = _mm512_unpackhi_epi32(r2,r3);\
-    p2 = _mm512_unpackhi_epi32(p2,p3);\
-    s0 = _mm512_unpacklo_epi32(s0,r1);\
-    q0 = _mm512_unpacklo_epi32(q0,p1);\
-    s1 = _mm512_unpacklo_epi32(s1,r3);\
-    q1 = _mm512_unpacklo_epi32(q1,p3);\
-    r1 = _mm512_load_si512(&r0);\
-    p1 = _mm512_load_si512(&p0);\
-    r0 = _mm512_unpackhi_epi64(r0,r2);\
-    p0 = _mm512_unpackhi_epi64(p0,p2);\
-    s0 = _mm512_unpackhi_epi64(s0,s1);\
-    q0 = _mm512_unpackhi_epi64(q0,q1);\
-    r1 = _mm512_unpacklo_epi64(r1,r2);\
-    p1 = _mm512_unpacklo_epi64(p1,p2);\
-    s2 = _mm512_load_si512(&r0);\
-    q2 = _mm512_load_si512(&p0);\
-    s1 = _mm512_load_si512(&r1);\
-    q1 = _mm512_load_si512(&p1);
+#define STEP_PART24W( a0, a1, t0, t1, c0, c1 ) /* one step over a0, a1 in place; t0, t1 are caller-supplied scratch and are overwritten */ \
+    a1 = _mm512_shuffle_epi32( a1, 147 ); /* 147 = 0x93: picks 32-bit words 3,0,1,2 of each 128-bit lane */ \
+    t0 = _mm512_load_si512( &a1 ); /* copy of a1; taking &a1 requires a1 to be an addressable lvalue */ \
+    a1 = _mm512_unpacklo_epi32( a1, a0 ); \
+    t0 = _mm512_unpackhi_epi32( t0, a0 ); \
+    t1 = _mm512_shuffle_epi32( t0, 78 ); /* 78 = 0x4E: picks words 2,3,0,1, i.e. swaps the 64-bit halves of each lane */ \
+    a0 = _mm512_shuffle_epi32( a1, 78 ); \
+    SUBCRUMB4W( t1, t0, a0, a1 ); \
+    t0 = _mm512_unpacklo_epi32( t0, t1 ); \
+    a1 = _mm512_unpacklo_epi32( a1, a0 ); \
+    a0 = _mm512_load_si512( &a1 ); /* copy of a1 before it is overwritten two lines below */ \
+    a0 = _mm512_unpackhi_epi64( a0, t0 ); \
+    a1 = _mm512_unpacklo_epi64( a1, t0 ); \
+    a1 = _mm512_shuffle_epi32( a1, 57 ); /* 57 = 0x39: picks words 1,2,3,0 of each 128-bit lane */ \
+    MIXWORD4W( a0, a1 ); \
+    ADD_CONSTANT4W( a0, a1, c0, c1 ); /* a0 ^= c0, a1 ^= c1 */

 #define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    s1 = _mm512_load_si512(&r3);\
@@ -279,8 +159,7 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
    __m512i t0, t1;
    __m512i *chainv = state->chainv;
    __m512i msg0, msg1;
-    __m512i tmp[2];
-    __m512i x[8];
+    __m512i x0, x1, x2, x3, x4, x5, x6, x7;

    t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
    t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] );
@@ -372,42 +251,30 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
    chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
    chainv[9] = _mm512_rol_epi32( chainv[9], 4 );

-    NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
-                x[0], x[1], x[2], x[3],
-                chainv[1],chainv[3],chainv[5],chainv[7],
-                x[4], x[5], x[6], x[7] );
+    NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
+                  chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );

-    STEP_PART4W( &x[0], cns4w( 0), cns4w( 1), &tmp[0] );
-    STEP_PART4W( &x[0], cns4w( 2), cns4w( 3), &tmp[0] );
-    STEP_PART4W( &x[0], cns4w( 4), cns4w( 5), &tmp[0] );
-    STEP_PART4W( &x[0], cns4w( 6), cns4w( 7), &tmp[0] );
-    STEP_PART4W( &x[0], cns4w( 8), cns4w( 9), &tmp[0] );
-    STEP_PART4W( &x[0], cns4w(10), cns4w(11), &tmp[0] );
-    STEP_PART4W( &x[0], cns4w(12), cns4w(13), &tmp[0] );
-    STEP_PART4W( &x[0], cns4w(14), cns4w(15), &tmp[0] );
+    STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 0), cns4w( 1) );
+    STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 2), cns4w( 3) );
+    STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 4), cns4w( 5) );
+    STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 6), cns4w( 7) );
+    STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 8), cns4w( 9) );
+    STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(10), cns4w(11) );
+    STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(12), cns4w(13) );
+    STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(14), cns4w(15) );

-    MIXTON10244W( x[0], x[1], x[2], x[3],
-                chainv[0], chainv[2], chainv[4],chainv[6],
-                x[4], x[5], x[6], x[7],
-                chainv[1],chainv[3],chainv[5],chainv[7]);
+    MIXTON10244W( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
+                  x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7] );

    /* Process last 256-bit block */
-    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17),
-                tmp[0], tmp[1] );
-    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19),
-                tmp[0], tmp[1] );
-    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21),
-                tmp[0], tmp[1] );
-    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23),
-                tmp[0], tmp[1] );
-    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25),
-                tmp[0], tmp[1] );
-    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27),
-                tmp[0], tmp[1] );
-    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29),
-                tmp[0], tmp[1] );
-    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31),
-                tmp[0], tmp[1] );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17) );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19) );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21) );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23) );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25) );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27) );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29) );
+    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31) );
 }

 void finalization512_4way( luffa_4way_context *state, uint32 *b )
@@ -683,10 +550,11 @@ int luffa_4way_update_close( luffa_4way_context *state,

 #define cns(i)  m256_const1_128( ( (__m128i*)CNS_INIT)[i] )

-#define ADD_CONSTANT(a,b,c0,c1)\
-    a = _mm256_xor_si256(a,c0);\
-    b = _mm256_xor_si256(b,c1);
+#define ADD_CONSTANT( a, b, c0, c1 ) /* xors c0 into a and c1 into b, in place; expands to two bare statements */ \
+    a = _mm256_xor_si256( a, c0 ); \
+    b = _mm256_xor_si256( b, c1 );

+/*
 #define MULT2( a0, a1, mask ) \
 do { \
  __m256i b = _mm256_xor_si256( a0, \
@@ -694,127 +562,83 @@ do { \
  a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
  a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) );  \
 } while(0)
+*/

-#define STEP_PART(x,c0,c1,t)\
-    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
-    SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
-    MIXWORD(*x,*(x+4),*t,*(t+1));\
-    MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
-    MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
-    MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
-    ADD_CONSTANT(*x, *(x+4), c0, c1);
+#define MULT2( a0, a1, mask ) /* updates a0 and a1 in place; do/while(0) keeps `MULT2(...);` a single statement, safe in unbraced if/else */ \
+do { \
+  __m256i b = _mm256_xor_si256( a0, \
+                 _mm256_shuffle_epi32( _mm256_and_si256( a1, mask ), 16 ) ); \
+  a0 = _mm256_alignr_epi8( a1,  b, 4 ); /* per 128-bit lane: ( b >> 4 bytes ) | ( a1 << 12 bytes ) */ \
+  a1 = _mm256_alignr_epi8(  b, a1, 4 ); /* per 128-bit lane: ( a1 >> 4 bytes ) | ( b << 12 bytes ) */ \
+} while(0)

-#define SUBCRUMB(a0,a1,a2,a3,t)\
-    t  = a0;\
-    a0 = _mm256_or_si256(a0,a1);\
-    a2 = _mm256_xor_si256(a2,a3);\
-    a1 = mm256_not( a1 );\
-    a0 = _mm256_xor_si256(a0,a3);\
-    a3 = _mm256_and_si256(a3,t);\
-    a1 = _mm256_xor_si256(a1,a3);\
-    a3 = _mm256_xor_si256(a3,a2);\
-    a2 = _mm256_and_si256(a2,a0);\
-    a0 = mm256_not( a0 );\
-    a2 = _mm256_xor_si256(a2,a1);\
-    a1 = _mm256_or_si256(a1,a3);\
-    t  = _mm256_xor_si256(t,a1);\
-    a3 = _mm256_xor_si256(a3,a2);\
-    a2 = _mm256_and_si256(a2,a1);\
-    a1 = _mm256_xor_si256(a1,a0);\
-    a0 = t;\
+#define SUBCRUMB( a0, a1, a2, a3 ) /* bitwise transform of a0..a3 in place; do/while(0) keeps `SUBCRUMB(...);` a single statement */ \
+do { \
+    __m256i t = a0; /* keeps the incoming a0, which is overwritten on the next line */ \
+    a0 = _mm256_or_si256( a0, a1 ); \
+    a2 = _mm256_xor_si256( a2, a3 ); \
+    a1 = mm256_not( a1 ); \
+    a0 = _mm256_xor_si256( a0, a3 ); \
+    a3 = _mm256_and_si256( a3, t ); \
+    a1 = _mm256_xor_si256( a1, a3 ); \
+    a3 = _mm256_xor_si256( a3, a2 ); \
+    a2 = _mm256_and_si256( a2, a0 ); \
+    a0 = mm256_not( a0 ); \
+    a2 = _mm256_xor_si256( a2, a1 ); \
+    a1 = _mm256_or_si256(  a1, a3 ); \
+    t  = _mm256_xor_si256(  t, a1 ); \
+    a3 = _mm256_xor_si256( a3, a2 ); \
+    a2 = _mm256_and_si256( a2, a1 ); \
+    a1 = _mm256_xor_si256( a1, a0 ); \
+    a0 = t; /* new a0 is the saved value xored with the updated a1 */ \
+} while(0)

-#define MIXWORD(a,b,t1,t2)\
-    b  = _mm256_xor_si256(a,b);\
-    t1 = _mm256_slli_epi32(a,2);\
-    t2 = _mm256_srli_epi32(a,30);\
-     a = _mm256_or_si256(t1,t2);\
-    a  = _mm256_xor_si256(a,b);\
-    t1 = _mm256_slli_epi32(b,14);\
-    t2 = _mm256_srli_epi32(b,18);\
-    b  = _mm256_or_si256(t1,t2);\
-    b  = _mm256_xor_si256(a,b);\
-    t1 = _mm256_slli_epi32(a,10);\
-    t2 = _mm256_srli_epi32(a,22);\
-    a  = _mm256_or_si256(t1,t2);\
-    a  = _mm256_xor_si256(a,b);\
-    t1 = _mm256_slli_epi32(b,1);\
-    t2 = _mm256_srli_epi32(b,31);\
-    b  = _mm256_or_si256(t1,t2);
+#define MIXWORD( a, b ) /* mixes a and b in place with xor and 32-bit rotates; do/while(0) keeps `MIXWORD(...);` a single statement */ \
+do { \
+    __m256i t1, t2; /* hold the two shifted halves of each rotate */ \
+    b  = _mm256_xor_si256( a,b ); \
+    t1 = _mm256_slli_epi32( a,  2 ); \
+    t2 = _mm256_srli_epi32( a, 30 ); \
+    a  = _mm256_or_si256( t1, t2 ); /* a = rol32( a, 2 ) */ \
+    a  = _mm256_xor_si256( a, b ); \
+    t1 = _mm256_slli_epi32( b, 14 ); \
+    t2 = _mm256_srli_epi32( b, 18 ); \
+    b  = _mm256_or_si256( t1, t2 ); /* b = rol32( b, 14 ) */ \
+    b  = _mm256_xor_si256( a, b ); \
+    t1 = _mm256_slli_epi32( a, 10 ); \
+    t2 = _mm256_srli_epi32( a, 22 ); \
+    a  = _mm256_or_si256( t1,t2 ); /* a = rol32( a, 10 ) */ \
+    a  = _mm256_xor_si256( a,b ); \
+    t1 = _mm256_slli_epi32( b,1 ); \
+    t2 = _mm256_srli_epi32( b,31 ); \
+    b  = _mm256_or_si256( t1, t2 ); /* b = rol32( b, 1 ) */ \
+} while(0)

-#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
-    a1 = _mm256_shuffle_epi32(a1,147);\
-    t0 = _mm256_load_si256(&a1);\
-    a1 = _mm256_unpacklo_epi32(a1,a0);\
-    t0 = _mm256_unpackhi_epi32(t0,a0);\
-    t1 = _mm256_shuffle_epi32(t0,78);\
-    a0 = _mm256_shuffle_epi32(a1,78);\
-    SUBCRUMB(t1,t0,a0,a1,tmp0);\
-    t0 = _mm256_unpacklo_epi32(t0,t1);\
-    a1 = _mm256_unpacklo_epi32(a1,a0);\
-    a0 = _mm256_load_si256(&a1);\
-    a0 = _mm256_unpackhi_epi64(a0,t0);\
-    a1 = _mm256_unpacklo_epi64(a1,t0);\
-    a1 = _mm256_shuffle_epi32(a1,57);\
-    MIXWORD(a0,a1,tmp0,tmp1);\
-    ADD_CONSTANT(a0,a1,c0,c1);
+#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) /* one step over x0..x7, all updated in place; expands to bare statements */ \
+    SUBCRUMB( x0, x1, x2, x3 ); \
+    SUBCRUMB( x5, x6, x7, x4 ); /* second group is passed in the order x5, x6, x7, x4 */ \
+    MIXWORD( x0, x4 ); /* each x[i] is mixed with x[i+4] */ \
+    MIXWORD( x1, x5 ); \
+    MIXWORD( x2, x6 ); \
+    MIXWORD( x3, x7 ); \
+    ADD_CONSTANT( x0, x4, c0, c1 ); /* x0 ^= c0, x4 ^= c1 */

-#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
-    s2 = _mm256_load_si256(&r1);\
-    q2 = _mm256_load_si256(&p1);\
-    r2 = _mm256_shuffle_epi32(r2,216);\
-    p2 = _mm256_shuffle_epi32(p2,216);\
-    r1 = _mm256_unpacklo_epi32(r1,r0);\
-    p1 = _mm256_unpacklo_epi32(p1,p0);\
-    s2 = _mm256_unpackhi_epi32(s2,r0);\
-    q2 = _mm256_unpackhi_epi32(q2,p0);\
-    s0 = _mm256_load_si256(&r2);\
-    q0 = _mm256_load_si256(&p2);\
-    r2 = _mm256_unpacklo_epi64(r2,r1);\
-    p2 = _mm256_unpacklo_epi64(p2,p1);\
-    s1 = _mm256_load_si256(&s0);\
-    q1 = _mm256_load_si256(&q0);\
-    s0 = _mm256_unpackhi_epi64(s0,r1);\
-    q0 = _mm256_unpackhi_epi64(q0,p1);\
-    r2 = _mm256_shuffle_epi32(r2,225);\
-    p2 = _mm256_shuffle_epi32(p2,225);\
-    r0 = _mm256_load_si256(&s1);\
-    p0 = _mm256_load_si256(&q1);\
-    s0 = _mm256_shuffle_epi32(s0,225);\
-    q0 = _mm256_shuffle_epi32(q0,225);\
-    s1 = _mm256_unpacklo_epi64(s1,s2);\
-    q1 = _mm256_unpacklo_epi64(q1,q2);\
-    r0 = _mm256_unpackhi_epi64(r0,s2);\
-    p0 = _mm256_unpackhi_epi64(p0,q2);\
-    s2 = _mm256_load_si256(&r0);\
-    q2 = _mm256_load_si256(&p0);\
-    s3 = _mm256_load_si256(&r2);\
-    q3 = _mm256_load_si256(&p2);
-
-#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
-    s0 = _mm256_load_si256(&r0);\
-    q0 = _mm256_load_si256(&p0);\
-    s1 = _mm256_load_si256(&r2);\
-    q1 = _mm256_load_si256(&p2);\
-    r0 = _mm256_unpackhi_epi32(r0,r1);\
-    p0 = _mm256_unpackhi_epi32(p0,p1);\
-    r2 = _mm256_unpackhi_epi32(r2,r3);\
-    p2 = _mm256_unpackhi_epi32(p2,p3);\
-    s0 = _mm256_unpacklo_epi32(s0,r1);\
-    q0 = _mm256_unpacklo_epi32(q0,p1);\
-    s1 = _mm256_unpacklo_epi32(s1,r3);\
-    q1 = _mm256_unpacklo_epi32(q1,p3);\
-    r1 = _mm256_load_si256(&r0);\
-    p1 = _mm256_load_si256(&p0);\
-    r0 = _mm256_unpackhi_epi64(r0,r2);\
-    p0 = _mm256_unpackhi_epi64(p0,p2);\
-    s0 = _mm256_unpackhi_epi64(s0,s1);\
-    q0 = _mm256_unpackhi_epi64(q0,q1);\
-    r1 = _mm256_unpacklo_epi64(r1,r2);\
-    p1 = _mm256_unpacklo_epi64(p1,p2);\
-    s2 = _mm256_load_si256(&r0);\
-    q2 = _mm256_load_si256(&p0);\
-    s1 = _mm256_load_si256(&r1);\
-    q1 = _mm256_load_si256(&p1);\
+#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) /* one step over a0, a1 in place; t0, t1 are caller-supplied scratch and are overwritten */ \
+    a1 = _mm256_shuffle_epi32( a1, 147); /* 147 = 0x93: picks 32-bit words 3,0,1,2 of each 128-bit lane */ \
+    t0 = _mm256_load_si256( &a1 ); /* copy of a1; taking &a1 requires a1 to be an addressable lvalue */ \
+    a1 = _mm256_unpacklo_epi32( a1, a0 ); \
+    t0 = _mm256_unpackhi_epi32( t0, a0 ); \
+    t1 = _mm256_shuffle_epi32( t0, 78 ); /* 78 = 0x4E: picks words 2,3,0,1, i.e. swaps the 64-bit halves of each lane */ \
+    a0 = _mm256_shuffle_epi32( a1, 78 ); \
+    SUBCRUMB( t1, t0, a0, a1 );\
+    t0 = _mm256_unpacklo_epi32( t0, t1 ); \
+    a1 = _mm256_unpacklo_epi32( a1, a0 ); \
+    a0 = _mm256_load_si256( &a1 ); /* copy of a1 before it is overwritten two lines below */ \
+    a0 = _mm256_unpackhi_epi64( a0, t0 ); \
+    a1 = _mm256_unpacklo_epi64( a1, t0 ); \
+    a1 = _mm256_shuffle_epi32( a1, 57 ); /* 57 = 0x39: picks words 1,2,3,0 of each 128-bit lane */ \
+    MIXWORD( a0, a1 ); \
+    ADD_CONSTANT( a0, a1, c0, c1 ); /* a0 ^= c0, a1 ^= c1 */

 #define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    s1 = _mm256_load_si256(&r3);\
@@ -857,9 +681,8 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
    __m256i t0, t1;
    __m256i *chainv = state->chainv;
    __m256i msg0, msg1;
-    __m256i tmp[2];
-    __m256i x[8];
-    const __m256i MASK = m256_const1_i128( 0x00000000ffffffff );
+    __m256i x0, x1, x2, x3, x4, x5, x6, x7;
+    const __m256i MASK = m256_const1_i128( 0xffffffff );

    t0 = chainv[0];
    t1 = chainv[1];
@@ -958,42 +781,30 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
    chainv[7] = mm256_rol_32( chainv[7], 3 );
    chainv[9] = mm256_rol_32( chainv[9], 4 );

-    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
-                x[0], x[1], x[2], x[3],
-                chainv[1],chainv[3],chainv[5],chainv[7],
-                x[4], x[5], x[6], x[7] );
+    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
+                chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );

-    STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
-    STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
-    STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
-    STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
-    STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
-    STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
-    STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
-    STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 0), cns( 1) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 2), cns( 3) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 4), cns( 5) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 6), cns( 7) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 8), cns( 9) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(10), cns(11) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(12), cns(13) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(14), cns(15) );

-    MIXTON1024( x[0], x[1], x[2], x[3],
-                chainv[0], chainv[2], chainv[4],chainv[6],
-                x[4], x[5], x[6], x[7],
-                chainv[1],chainv[3],chainv[5],chainv[7]);
+    MIXTON1024( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
+                x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7]);

    /* Process last 256-bit block */
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
-                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31) );
 }

 /***************************************************/
--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -30,19 +30,6 @@
  a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );  \
 } while(0)

-/*
-static inline __m256i mult2_avx2( a )
-{ 
-   __m128 a0, a0, b;
-   a0 = mm128_extractlo_256( a );
-   a1 = mm128_extracthi_256( a );
-   b =  _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) );
-   a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) );
-   a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );
-   return mm256_concat_128( a1, a0 );
-}
-*/
-
 #define STEP_PART(x,c,t)\
    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
    SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -15,7 +15,8 @@

 #if defined (ANIME_8WAY)

-typedef struct {
+union _anime_8way_context_overlay
+{
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
 #if defined(__VAES__)
@@ -26,23 +27,9 @@ typedef struct {
    jh512_8way_context      jh;
    skein512_8way_context   skein;
    keccak512_8way_context  keccak;
-} anime_8way_ctx_holder;
+} __attribute__ ((aligned (64)));

-anime_8way_ctx_holder anime_8way_ctx __attribute__ ((aligned (64)));
-
-void init_anime_8way_ctx()
-{
-     blake512_8way_init( &anime_8way_ctx.blake );
-     bmw512_8way_init( &anime_8way_ctx.bmw );
-#if defined(__VAES__)
-     groestl512_4way_init( &anime_8way_ctx.groestl, 64 );
-#else
-     init_groestl( &anime_8way_ctx.groestl, 64 );
-#endif
-     skein512_8way_init( &anime_8way_ctx.skein );
-     jh512_8way_init( &anime_8way_ctx.jh );
-     keccak512_8way_init( &anime_8way_ctx.keccak );
-}
+typedef union _anime_8way_context_overlay anime_8way_context_overlay;

 void anime_8way_hash( void *state, const void *input )
 {
@@ -65,17 +52,14 @@ void anime_8way_hash( void *state, const void *input )
    __m512i* vhB = (__m512i*)vhashB;
    __m512i* vhC = (__m512i*)vhashC;
    const __m512i bit3_mask = m512_const1_64( 8 );
-    const __m512i zero = _mm512_setzero_si512();
    __mmask8 vh_mask;
-    anime_8way_ctx_holder ctx;
-    memcpy( &ctx, &anime_8way_ctx, sizeof(anime_8way_ctx) );
+    anime_8way_context_overlay ctx __attribute__ ((aligned (64)));

    bmw512_8way_full( &ctx.bmw, vhash, input, 80 );

    blake512_8way_full( &ctx.blake, vhash, vhash, 64 );

-    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
-                                       zero );
+    vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );

 #if defined(__VAES__)

@@ -152,8 +136,7 @@ void anime_8way_hash( void *state, const void *input )
    jh512_8way_update( &ctx.jh, vhash, 64 );
    jh512_8way_close( &ctx.jh, vhash );

-    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
-                                       zero );
+    vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );

    if ( ( vh_mask & 0xff ) != 0xff )
       blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
@@ -168,8 +151,7 @@ void anime_8way_hash( void *state, const void *input )

    skein512_8way_full( &ctx.skein, vhash, vhash, 64 );

-    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), 
-                                       zero );
+    vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );

    if ( ( vh_mask & 0xff ) != 0xff )
    {
@@ -237,14 +219,20 @@ int scanhash_anime_8way( struct work *work, uint32_t max_nonce,

 #elif defined (ANIME_4WAY)

-typedef struct {
+union _anime_4way_context_overlay
+{
    blake512_4way_context  blake;
    bmw512_4way_context    bmw;
    hashState_groestl      groestl;
    jh512_4way_context     jh;
    skein512_4way_context  skein;
    keccak512_4way_context keccak;
-} anime_4way_ctx_holder;
+#if defined(__VAES__)
+    groestl512_2way_context groestl2;
+#endif
+} __attribute__ ((aligned (64)));
+
+typedef union _anime_4way_context_overlay anime_4way_context_overlay;

 void anime_4way_hash( void *state, const void *input )
 {
@@ -262,7 +250,7 @@ void anime_4way_hash( void *state, const void *input )
    int h_mask;
    const __m256i bit3_mask = m256_const1_64( 8 );
    const __m256i zero = _mm256_setzero_si256();
-    anime_4way_ctx_holder ctx;
+    anime_4way_context_overlay ctx __attribute__ ((aligned (64)));

    bmw512_4way_init( &ctx.bmw );
    bmw512_4way_update( &ctx.bmw, input, 80 );
@@ -293,7 +281,18 @@ void anime_4way_hash( void *state, const void *input )

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

-    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+#if defined(__VAES__)
+
+   rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
+
+   groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
+   groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
+
+   rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
+
+#else
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

    groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
    groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
@@ -302,6 +301,8 @@ void anime_4way_hash( void *state, const void *input )

    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

+#endif
+
    jh512_4way_init( &ctx.jh );
    jh512_4way_update( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -13,6 +13,7 @@
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/shavite/sph_shavite.h"
+#include "algo/shavite/shavite-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -98,8 +99,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
                           hash4, hash5, hash6,  hash7 );

-   vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
-                                       m512_zero );
+   vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );

   // A
 #if defined(__VAES__)
@@ -154,8 +154,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   keccak512_8way_update( &ctx.keccak, vhash, 64 );
   keccak512_8way_close( &ctx.keccak, vhash );

-   vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
-                                       m512_zero );
+   vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );

   // A
   if ( ( vh_mask & 0xff ) != 0xff )
@@ -174,8 +173,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );

   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
-   vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
-                                       m512_zero );
+   vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );

   if ( likely( ( vh_mask & 0xff ) != 0xff ) )
   {
@@ -223,8 +221,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );

   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
-   vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
-                                       m512_zero );
+   vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
   dintrlv_8x64_512( hash0, hash1, hash2, hash3,
                     hash4, hash5, hash6, hash7, vhash );
   // 4x32 for haval
@@ -302,8 +299,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)

   blake512_8way_full( &ctx.blake, vhash, vhash, 64 );

-   vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
-                                       m512_zero );
+   vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );

   // A
 #if defined(__VAES__)
@@ -374,8 +370,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)

   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
                           hash4, hash5, hash6, hash7 );
-   vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
-                                       m512_zero );
+   vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );

     // A   
 #if defined(__VAES__)
@@ -455,8 +450,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)

   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
                           hash4, hash5, hash6, hash7 );
-   vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
-                                       m512_zero );
+   vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );

   if ( hash0[0] & mask )
      fugue512_full( &ctx.fugue, hash0, hash0, 64 );
@@ -520,8 +514,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   sha512_8way_update( &ctx.sha512, vhash, 64 );
   sha512_8way_close( &ctx.sha512, vhash );

-   vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
-                                       m512_zero );
+   vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
   dintrlv_8x64_512( hash0, hash1, hash2, hash3,
                     hash4, hash5, hash6, hash7, vhash );

@@ -625,6 +618,7 @@ union _hmq1725_4way_context_overlay
    cube_2way_context       cube2;
    sph_shavite512_context  shavite;
    hashState_sd            sd;
+    shavite512_2way_context shavite2;
    simd_2way_context       simd;
    hashState_echo          echo;
    hamsi512_4way_context   hamsi;
@@ -633,6 +627,10 @@ union _hmq1725_4way_context_overlay
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
    haval256_5_4way_context haval;
+#if defined(__VAES__)
+    groestl512_2way_context groestl2;
+    echo_2way_context       echo2;
+#endif    
 } __attribute__ ((aligned (64)));

 typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
@@ -750,15 +748,10 @@ extern void hmq1725_4way_hash(void *state, const void *input)

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

-    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

-    shavite512_full( &ctx.shavite, hash0, hash0, 64 );
-    shavite512_full( &ctx.shavite, hash1, hash1, 64 );
-    shavite512_full( &ctx.shavite, hash2, hash2, 64 );
-    shavite512_full( &ctx.shavite, hash3, hash3, 64 );
-
-    intrlv_2x128_512( vhashA, hash0, hash1 );
-    intrlv_2x128_512( vhashB, hash2, hash3 );
+    shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
+    shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );

    simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
    simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
@@ -795,6 +788,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)

    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

+#if defined(__VAES__)
+
+   rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
+
+   echo_2way_full( &ctx.echo2, vhashA, 512, vhashA, 64 );
+   echo_2way_full( &ctx.echo2, vhashB, 512, vhashB, 64 );
+
+   rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
+
+#else
+    
    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    
    echo_full( &ctx.echo, (BitSequence *)hash0, 512,
@@ -807,7 +811,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
                    (const BitSequence *)hash3, 64 );

    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
-     
+
+#endif
+
    blake512_4way_full( &ctx.blake, vhash, vhash, 64 );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -939,6 +945,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)

   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

+#if defined(__VAES__)
+
+   rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
+
+   groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
+   groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
+
+   rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
+
+#else
+   
   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

   groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -948,6 +965,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)

   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

+#endif
+
   sha512_4way_init( &ctx.sha512 ); 
   sha512_4way_update( &ctx.sha512, vhash, 64 );
   sha512_4way_close( &ctx.sha512, vhash ); 
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -68,7 +68,6 @@ void quark_8way_hash( void *state, const void *input )
    quark_8way_ctx_holder ctx;
    const uint32_t mask = 8;
    const __m512i bit3_mask = m512_const1_64( mask );
-    const __m512i zero = _mm512_setzero_si512();

    memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );

@@ -76,9 +75,7 @@ void quark_8way_hash( void *state, const void *input )

    bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
    
-    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
-                                       zero );
-
+    vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
    
 #if defined(__VAES__)

@@ -154,8 +151,7 @@ void quark_8way_hash( void *state, const void *input )
    jh512_8way_update( &ctx.jh, vhash, 64 );
    jh512_8way_close( &ctx.jh, vhash );

-    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
-                                       zero );
+    vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );

    if ( ( vh_mask & 0xff ) != 0xff )
       blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
@@ -169,8 +165,7 @@ void quark_8way_hash( void *state, const void *input )

    skein512_8way_full( &ctx.skein, vhash, vhash, 64 );

-    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
-                                       zero );
+    vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );

    if ( ( vh_mask & 0xff ) != 0xff )
    {
--- a/algo/whirlpool/md-helper-4way.c
+++ b/algo/whirlpool/md-helper-4way.c
@@ -1,291 +0,0 @@
-/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
-/*
- * This file contains some functions which implement the external data
- * handling and padding for Merkle-Damgard hash functions which follow
- * the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
- *
- * API: this file is meant to be included, not compiled as a stand-alone
- * file. Some macros must be defined:
- *   RFUN   name for the round function
- *   HASH   "short name" for the hash function
- *   BE32   defined for big-endian, 32-bit based (e.g. SHA-1)
- *   LE32   defined for little-endian, 32-bit based (e.g. MD5)
- *   BE64   defined for big-endian, 64-bit based (e.g. SHA-512)
- *   LE64   defined for little-endian, 64-bit based (no example yet)
- *   PW01   if defined, append 0x01 instead of 0x80 (for Tiger)
- *   BLEN   if defined, length of a message block (in bytes)
- *   PLW1   if defined, length is defined on one 64-bit word only (for Tiger)
- *   PLW4   if defined, length is defined on four 64-bit words (for WHIRLPOOL)
- *   SVAL   if defined, reference to the context state information
- *
- * BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
- * this is used for instance for Tiger, which works on 64-bit words but
- * uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
- * ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
- * set, then only one word (64 bits) will be used to encode the input
- * message length (in bits), otherwise two words will be used (as in
- * SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
- * not PLW1), four 64-bit words will be used to encode the message length
- * (in bits). Note that regardless of those settings, only 64-bit message
- * lengths are supported (in bits): messages longer than 2 Exabytes will be
- * improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
- * 2 millions Terabytes, which is huge).
- *
- * If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
- * function. This is used for Tiger2, which is identical to Tiger except
- * when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
- * of the 0x01 from original Tiger).
- *
- * The RFUN function is invoked with two arguments, the first pointing to
- * aligned data (as a "const void *"), the second being state information
- * from the context structure. By default, this state information is the
- * "val" field from the context, and this field is assumed to be an array
- * of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
- * from the context structure. The "val" field can have any type, except
- * for the output encoding which assumes that it is an array of "sph_u32"
- * values. By defining NO_OUTPUT, this last step is deactivated; the
- * includer code is then responsible for writing out the hash result. When
- * NO_OUTPUT is defined, the third parameter to the "close()" function is
- * ignored.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- * 
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifdef _MSC_VER
-#pragma warning (disable: 4146)
-#endif
-
-#undef SPH_XCAT
-#define SPH_XCAT(a, b)     SPH_XCAT_(a, b)
-#undef SPH_XCAT_
-#define SPH_XCAT_(a, b)    a ## b
-
-#undef SPH_BLEN
-#undef SPH_WLEN
-#if defined BE64 || defined LE64
-#define SPH_BLEN    128U
-#define SPH_WLEN      8U
-#else
-#define SPH_BLEN     64U
-#define SPH_WLEN      4U
-#endif
-
-#ifdef BLEN
-#undef SPH_BLEN
-#define SPH_BLEN    BLEN
-#endif
-
-#undef SPH_MAXPAD
-#if defined PLW1
-#define SPH_MAXPAD   (SPH_BLEN - SPH_WLEN)
-#elif defined PLW4
-#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 2))
-#else
-#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 1))
-#endif
-
-#undef SPH_VAL
-#undef SPH_NO_OUTPUT
-#ifdef SVAL
-#define SPH_VAL         SVAL
-#define SPH_NO_OUTPUT   1
-#else
-#define SPH_VAL   sc->val
-#endif
-
-#ifndef CLOSE_ONLY
-
-#ifdef SPH_UPTR
-static void
-SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
-#else
-void
-HASH ( void *cc, const void *data, size_t len )
-#endif
-{
-   SPH_XCAT( HASH, _context ) *sc;
-   __m256i *vdata = (__m256i*)data;
-   size_t ptr;
-
-   sc = cc;
-   ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
-   while ( len > 0 )
-   {
-      size_t clen;
-      clen = SPH_BLEN - ptr;
-      if ( clen > len )
-         clen = len;
-      memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
-      vdata = vdata + (clen>>3);
-      ptr += clen;
-      len -= clen;
-      if ( ptr == SPH_BLEN )
-      {
-         RFUN( sc->buf, SPH_VAL );
-         ptr = 0;
-      }
-         sc->count += clen;
-   }
-}
-
-#ifdef SPH_UPTR
-void
-HASH (void *cc, const void *data, size_t len)
-{
-   SPH_XCAT(HASH, _context) *sc;
-   __m256i *vdata = (__m256i*)data;
-   unsigned ptr;
-
-   if ( len < (2 * SPH_BLEN) )
-   {
-      SPH_XCAT(HASH, _short)(cc, data, len);
-      return;
-   }
-   sc = cc;
-   ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
-   if ( ptr > 0 )
-   {
-      unsigned t;
-      t = SPH_BLEN - ptr;
-      SPH_XCAT( HASH, _short )( cc, data, t );
-      vdata = vdata + (t>>3);
-      len -= t;
-   }
-   SPH_XCAT( HASH, _short )( cc, data, len );
-}
-#endif
-
-#endif
-
-/*
- * Perform padding and produce result. The context is NOT reinitialized
- * by this function.
- */
-static void
-SPH_XCAT( HASH, _addbits_and_close )(void *cc, 	unsigned ub, unsigned n,
-          void *dst, unsigned rnum )
-{
-    SPH_XCAT(HASH, _context) *sc;
-    unsigned ptr, u;
-    sc = cc;
-    ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
-
-//uint64_t *b= (uint64_t*)sc->buf;
-//uint64_t *s= (uint64_t*)sc->state;
-//printf("Vptr 1= %u\n", ptr);
-//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
-//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
-
-#ifdef PW01
-    sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x100 >> 8 );
-//    sc->buf[ptr++] = 0x100 >> 8;
-#else
-// need to overwrite exactly one byte
-//    sc->buf[ptr>>3] = _mm256_set_epi64x( 0, 0, 0, 0x80 );
-    sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
-//    ptr++;
-#endif
-    ptr += 8;
-
-//printf("Vptr 2= %u\n", ptr);
-//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
-//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
-
-    if ( ptr > SPH_MAXPAD )
-    {
-         memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
-         RFUN( sc->buf, SPH_VAL );
-         memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
-    }
-    else
-    {
-         memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
-    }
-#if defined BE64
-#if defined PLW1
-    sc->buf[ SPH_MAXPAD>>3 ] =
-                 mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
-#elif defined PLW4
-    memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
-    sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
-                mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
-    sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
-                mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
-#else
-    sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
-               mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
-    sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
-               mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
-#endif  // PLW
-#else  // LE64
-#if defined PLW1
-    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
-#elif defined PLW4
-    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
-    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
-                       _mm256_set1_epi64x( c->count >> 61 );
-    memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
-                       2 * SPH_WLEN );
-#else
-    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
-    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
-                          _mm256_set1_epi64x( sc->count >> 61 );
-#endif // PLW
-
-#endif // LE64
-
-//printf("Vptr 3= %u\n", ptr);
-//printf("VBuf   %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
-//printf("VBuf   %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
-    RFUN( sc->buf, SPH_VAL );
-
-//printf("Vptr after= %u\n", ptr);
-//printf("VState %016llx %016llx %016llx %016llx\n", s[0], s[4], s[8], s[12] );
-//printf("VState %016llx %016llx %016llx %016llx\n", s[16], s[20], s[24], s[28] );
-
-#ifdef SPH_NO_OUTPUT
-    (void)dst;
-    (void)rnum;
-    (void)u;
-#else
-    for ( u = 0; u < rnum; u ++ )
-    {
-#if defined BE64
-       ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
-#else  // LE64
-       ((__m256i*)dst)[u] = sc->val[u];
-#endif
-    }
-#endif
-}
-
-static void
-SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
-{
-   SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
-}
--- a/algo/whirlpool/whirlpool-hash-4way.c
+++ b/algo/whirlpool/whirlpool-hash-4way.c
--- a/algo/whirlpool/whirlpool-hash-4way.h
+++ b/algo/whirlpool/whirlpool-hash-4way.h
@@ -1,108 +0,0 @@
-/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
-/**
- * WHIRLPOOL interface.
- *
- * WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
- * version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
- * (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
- * version, 2003, with a new diffusion matrix, also described as "plain
- * WHIRLPOOL"). All three variants are implemented here.
- *
- * The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
- * M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
- * NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
- *
- * The current WHIRLPOOL specification and a reference implementation
- * can be found on the WHIRLPOOL web page:
- * http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     sph_whirlpool.h
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifndef WHIRLPOOL_HASH_4WAY_H__
-#define WHIRLPOOL_HASH_4WAY_H__
-
-#ifdef __AVX2__
-
-#include <stddef.h>
-#include "algo/sha/sph_types.h"
-#include "simd-utils.h"
-
-/**
- * Output size (in bits) for WHIRLPOOL.
- */
-#define SPH_SIZE_whirlpool   512
-
-/**
- * Output size (in bits) for WHIRLPOOL-0.
- */
-#define SPH_SIZE_whirlpool0   512
-
-/**
- * Output size (in bits) for WHIRLPOOL-1.
- */
-#define SPH_SIZE_whirlpool1   512
-
-typedef struct {
-    __m256i buf[8] __attribute__ ((aligned (64)));
-    __m256i state[8];
-    sph_u64 count;
-} whirlpool_4way_context;
-
-void whirlpool_4way_init( void *cc );
-
-void whirlpool_4way( void *cc, const void *data, size_t len );
-
-void whirlpool_4way_close( void *cc, void *dst );
-
-/**
- * WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL.
- */
-typedef whirlpool_4way_context whirlpool0_4way_context;
-
-#define whirlpool0_4way_init whirlpool_4way_init
-
-void whirlpool0_4way( void *cc, const void *data, size_t len );
-
-void whirlpool0_4way_close( void *cc, void *dst );
-
-/**
- * WHIRLPOOL-1 uses the same structure than plain WHIRLPOOL.
- */
-typedef whirlpool_4way_context whirlpool1_4way_context;
-
-#define whirlpool1_4way_init whirlpool_4way_init
-
-void whirlpool1_4way(void *cc, const void *data, size_t len);
-
-void whirlpool1_4way_close(void *cc, void *dst);
-
-#endif
-
-#endif
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -12,6 +12,7 @@
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
+#include "algo/shavite/shavite-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #if defined(__VAES__)
@@ -22,15 +23,15 @@

 #if defined (C11_8WAY)

-typedef struct {
+union _c11_8way_context_overlay
+{
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    luffa_4way_context      luffa;
-    cube_4way_context       cube;
-    simd_4way_context       simd;
+    cube_4way_2buf_context   cube;
 #if defined(__VAES__)
    groestl512_4way_context groestl;
    shavite512_4way_context shavite;
@@ -40,32 +41,14 @@ typedef struct {
    sph_shavite512_context  shavite;
    hashState_echo          echo;
 #endif
-} c11_8way_ctx_holder;
+    simd_4way_context       simd;
+} __attribute__ ((aligned (64)));
+typedef union _c11_8way_context_overlay c11_8way_context_overlay;

-c11_8way_ctx_holder c11_8way_ctx;
+static __thread __m512i c11_8way_midstate[16] __attribute__((aligned(64)));
+static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));

-void init_c11_8way_ctx()
-{
-     blake512_8way_init( &c11_8way_ctx.blake );
-     bmw512_8way_init( &c11_8way_ctx.bmw );
-     skein512_8way_init( &c11_8way_ctx.skein );
-     jh512_8way_init( &c11_8way_ctx.jh );
-     keccak512_8way_init( &c11_8way_ctx.keccak );
-     luffa_4way_init( &c11_8way_ctx.luffa, 512 );
-     cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 );
-     simd_4way_init( &c11_8way_ctx.simd, 512 );
-#if defined(__VAES__)
-     groestl512_4way_init( &c11_8way_ctx.groestl, 64 );
-     shavite512_4way_init( &c11_8way_ctx.shavite );
-     echo_4way_init( &c11_8way_ctx.echo, 512 );
-#else
-     init_groestl( &c11_8way_ctx.groestl, 64 );
-     sph_shavite512_init( &c11_8way_ctx.shavite );
-     init_echo( &c11_8way_ctx.echo, 512 );
-#endif
-}
-
-void c11_8way_hash( void *state, const void *input )
+int c11_8way_hash( void *state, const void *input, int thr_id )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t vhashA[4*8] __attribute__ ((aligned (64)));     
@@ -78,24 +61,19 @@ void c11_8way_hash( void *state, const void *input )
     uint64_t hash5[8] __attribute__ ((aligned (64)));
     uint64_t hash6[8] __attribute__ ((aligned (64)));
     uint64_t hash7[8] __attribute__ ((aligned (64)));
-     c11_8way_ctx_holder ctx;
-     memcpy( &ctx, &c11_8way_ctx, sizeof(c11_8way_ctx) );
+     c11_8way_context_overlay ctx;

-     // 1 Blake 4way
-     blake512_8way_update( &ctx.blake, input, 80 );
-     blake512_8way_close( &ctx.blake, vhash );
-
-     // 2 Bmw
-     bmw512_8way_update( &ctx.bmw, vhash, 64 );
-     bmw512_8way_close( &ctx.bmw, vhash );
+     blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
+                             c11_8way_midstate );

+     bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
+     
 #if defined(__VAES__)

     rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

-     groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
-     groestl512_4way_init( &ctx.groestl, 64 );
-     groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
+     groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
+     groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );

     rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

@@ -104,21 +82,14 @@ void c11_8way_hash( void *state, const void *input )
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );

-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
-     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
-     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
-     memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );

     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7 );
@@ -126,83 +97,56 @@ void c11_8way_hash( void *state, const void *input )
 #endif

     // 4 JH
+     jh512_8way_init( &ctx.jh );
     jh512_8way_update( &ctx.jh, vhash, 64 );
     jh512_8way_close( &ctx.jh, vhash );

     // 5 Keccak
+     keccak512_8way_init( &ctx.keccak );
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );

     // 6 Skein
-     skein512_8way_update( &ctx.skein, vhash, 64 );
-     skein512_8way_close( &ctx.skein, vhash );
+     skein512_8way_full( &ctx.skein, vhash, vhash, 64 );

     rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

-     luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
-
-     cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
+     luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 );
+     luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 );

+     cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
+     
 #if defined(__VAES__)

-     shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
-     shavite512_4way_init( &ctx.shavite );
-     shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
+     shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
+     shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 );

 #else
     
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

-     sph_shavite512( &ctx.shavite, hash0, 64 );
-     sph_shavite512_close( &ctx.shavite, hash0 );
-     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash1, 64 );
-     sph_shavite512_close( &ctx.shavite, hash1 );
-     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash2, 64 );
-     sph_shavite512_close( &ctx.shavite, hash2 );
-     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash3, 64 );
-     sph_shavite512_close( &ctx.shavite, hash3 );
-     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash4, 64 );
-     sph_shavite512_close( &ctx.shavite, hash4 );
-     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash5, 64 );
-     sph_shavite512_close( &ctx.shavite, hash5 );
-     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash6, 64 );
-     sph_shavite512_close( &ctx.shavite, hash6 );
-     memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash7, 64 );
-     sph_shavite512_close( &ctx.shavite, hash7 );
-
+     shavite512_full( &ctx.shavite, hash0, hash0, 64 );
+     shavite512_full( &ctx.shavite, hash1, hash1, 64 );
+     shavite512_full( &ctx.shavite, hash2, hash2, 64 );
+     shavite512_full( &ctx.shavite, hash3, hash3, 64 );
+     shavite512_full( &ctx.shavite, hash4, hash4, 64 );
+     shavite512_full( &ctx.shavite, hash5, hash5, 64 );
+     shavite512_full( &ctx.shavite, hash6, hash6, 64 );
+     shavite512_full( &ctx.shavite, hash7, hash7, 64 );
+     
     intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
     intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );

 #endif

-     simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
+     simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 );
+     simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );

 #if defined(__VAES__)

-     echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
-     echo_4way_init( &ctx.echo, 512 );
-     echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
+     echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
+     echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 );

     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
@@ -212,29 +156,22 @@ void c11_8way_hash( void *state, const void *input )
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
     
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                       (const BitSequence *) hash0, 512 );
-     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                       (const BitSequence *) hash1, 512 );
-     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                       (const BitSequence *) hash2, 512 );
-     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                       (const BitSequence *) hash3, 512 );
-     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash4,
-                       (const BitSequence *) hash4, 512 );
-     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash5,
-                       (const BitSequence *) hash5, 512 );
-     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash6,
-                       (const BitSequence *) hash6, 512 );
-     memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash7,
-                       (const BitSequence *) hash7, 512 );
+     echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                     (const BitSequence *)hash0, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                     (const BitSequence *)hash1, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                     (const BitSequence *)hash2, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                     (const BitSequence *)hash3, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash4, 512,
+                     (const BitSequence *)hash4, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash5, 512,
+                     (const BitSequence *)hash5, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash6, 512,
+                     (const BitSequence *)hash6, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash7, 512,
+                     (const BitSequence *)hash7, 64 );

 #endif

@@ -246,225 +183,223 @@ void c11_8way_hash( void *state, const void *input )
     memcpy( state+160, hash5, 32 );
     memcpy( state+192, hash6, 32 );
     memcpy( state+224, hash7, 32 );
+
+     return 1;
 }

 int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[8*8] __attribute__ ((aligned (128)));
-     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
-     uint32_t *pdata = work->data;
-     uint32_t *ptarget = work->target;
-     uint32_t n = pdata[19];
-     const uint32_t first_nonce = pdata[19];
-     int thr_id = mythr->id;   
-     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-     const uint32_t Htarg = ptarget[7];
+   uint32_t hash[8*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   __m128i edata[5] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   __m512i  *noncev = (__m512i*)vdata + 9;
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const uint32_t targ32_d7 = ptarget[7];
+   const __m512i eight = m512_const1_64( 8 );
+   const bool bench = opt_benchmark;

-     max_nonce -= 8;
+   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
+   edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
+   edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
+   edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
+   edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );

-     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   mm512_intrlv80_8x64( vdata, edata );
+   *noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
+                            0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0 ) );
+   blake512_8way_prehash_le( &blake512_8way_ctx, c11_8way_midstate, vdata );

-     do
-     {
-        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-        _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                          n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
-
-        c11_8way_hash( hash, vdata );
-        pdata[19] = n;
-
-        for ( int i = 0; i < 8; i++ )
-        if ( ( ( hash+(i<<3) )[7] <= Htarg )
-             && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-        {
-           pdata[19] = n+i;
-           submit_solution( work, hash+(i<<3), mythr );
-        }
-        n += 8;
-     } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
-     *hashes_done = n - first_nonce;
-     return 0;
+   do
+   {
+      if ( likely( c11_8way_hash( hash, vdata, thr_id ) ) )
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( ( ( hash + ( lane << 3 ) )[7] <= targ32_d7 )
+           && valid_hash( hash +( lane << 3 ), ptarget ) && !bench )
+      {
+         pdata[19] = n + lane;
+         submit_solution( work, hash + ( lane << 3 ), mythr );
+      }
+      *noncev = _mm512_add_epi32( *noncev, eight );
+      n += 8;
+   } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
 }
     
 #elif defined (C11_4WAY)

-typedef struct {
+union _c11_4way_context_overlay
+{
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
+#if defined(__VAES__)
+    groestl512_2way_context groestl;
+    echo512_2way_context    echo;
+#else
    hashState_groestl       groestl;
-    skein512_4way_context   skein;
-    jh512_4way_context      jh;    
-    keccak512_4way_context  keccak;    
-    luffa_2way_context      luffa;
-    cubehashParam           cube;
-    sph_shavite512_context  shavite;
-    simd_2way_context       simd;
    hashState_echo          echo;
-} c11_4way_ctx_holder;
+#endif
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    luffa_2way_context      luffa;
+    cube_2way_context       cube;
+    shavite512_2way_context shavite;
+    simd_2way_context       simd;
+};
+typedef union _c11_4way_context_overlay c11_4way_context_overlay;

-c11_4way_ctx_holder c11_4way_ctx;
+static __thread __m256i c11_4way_midstate[16] __attribute__((aligned(64)));
+static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));

-void init_c11_4way_ctx()
-{
-     blake512_4way_init( &c11_4way_ctx.blake );
-     bmw512_4way_init( &c11_4way_ctx.bmw );
-     init_groestl( &c11_4way_ctx.groestl, 64 );
-     skein512_4way_init( &c11_4way_ctx.skein );
-     jh512_4way_init( &c11_4way_ctx.jh );
-     keccak512_4way_init( &c11_4way_ctx.keccak );
-     luffa_2way_init( &c11_4way_ctx.luffa, 512 );
-     cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
-     sph_shavite512_init( &c11_4way_ctx.shavite );
-     simd_2way_init( &c11_4way_ctx.simd, 512 );
-     init_echo( &c11_4way_ctx.echo, 512 );
-}
-
-void c11_4way_hash( void *state, const void *input )
+int c11_4way_hash( void *state, const void *input, int thr_id )
 {
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     uint64_t vhashA[8*2] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
-     c11_4way_ctx_holder ctx;
-     memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
+     c11_4way_context_overlay ctx;

-     // 1 Blake 4way
-     blake512_4way_update( &ctx.blake, input, 80 );
-     blake512_4way_close( &ctx.blake, vhash );
+     blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
+                             c11_4way_midstate );

-     // 2 Bmw
+     bmw512_4way_init( &ctx.bmw );
     bmw512_4way_update( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
+     
+#if defined(__VAES__)

-     // Serial
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

-     // 3 Groestl
-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-     memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-     memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-     memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
+     groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );

-     // 4way
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );

-     // 4 JH
+#else
+
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
+
+     groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
+
+#endif
+     
+     jh512_4way_init( &ctx.jh );
     jh512_4way_update( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );

-     // 5 Keccak
+     keccak512_4way_init( &ctx.keccak );
     keccak512_4way_update( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );

-     // 6 Skein
-     skein512_4way_update( &ctx.skein, vhash, 64 );
-     skein512_4way_close( &ctx.skein, vhash );
+     skein512_4way_full( &ctx.skein, vhash, vhash, 64 );

-     // Serial
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

-     // 7 Luffa
-     intrlv_2x128( vhash, hash0, hash1, 512 );
-     intrlv_2x128( vhashB, hash2, hash3, 512 );
-     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     luffa_2way_init( &ctx.luffa, 512 );
-     luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
-     dintrlv_2x128( hash0, hash1, vhash, 512 );
-     dintrlv_2x128( hash2, hash3, vhashB, 512 );
+     luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 );
+     luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 );

-     // 8 Cubehash
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
-     memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
-     memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
-     memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
+     cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
+     cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 );

-     // 9 Shavite
-     sph_shavite512( &ctx.shavite, hash0, 64 );
-     sph_shavite512_close( &ctx.shavite, hash0 );
-     memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash1, 64 );
-     sph_shavite512_close( &ctx.shavite, hash1 );
-     memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash2, 64 );
-     sph_shavite512_close( &ctx.shavite, hash2 );
-     memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash3, 64 );
-     sph_shavite512_close( &ctx.shavite, hash3 );
+     shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 );
+     shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 );

-     // 10 Simd
-     intrlv_2x128( vhash, hash0, hash1, 512 );
-     intrlv_2x128( vhashB, hash2, hash3, 512 );
-     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     simd_2way_init( &ctx.simd, 512 );
-     simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
-     dintrlv_2x128( hash0, hash1, vhash, 512 );
-     dintrlv_2x128( hash2, hash3, vhashB, 512 );
+     simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
+     simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );

-     // 11 Echo
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                       (const BitSequence *) hash0, 512 );
-     memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                       (const BitSequence *) hash1, 512 );
-     memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                       (const BitSequence *) hash2, 512 );
-     memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                       (const BitSequence *) hash3, 512 );
+#if defined(__VAES__)
+
+     echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
+     echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
+
+     dintrlv_2x128_512( hash0, hash1, vhashA );
+     dintrlv_2x128_512( hash2, hash3, vhashB );
+
+#else
+
+     dintrlv_2x128_512( hash0, hash1, vhashA );
+     dintrlv_2x128_512( hash2, hash3, vhashB );
+
+     echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+                     (const BitSequence *)hash0, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+                     (const BitSequence *)hash1, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+                     (const BitSequence *)hash2, 64 );
+     echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+                     (const BitSequence *)hash3, 64 );
+
+#endif

     memcpy( state,    hash0, 32 );
     memcpy( state+32, hash1, 32 );
     memcpy( state+64, hash2, 32 );
     memcpy( state+96, hash3, 32 );
+
+     return 1;
 }

 int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*8] __attribute__ ((aligned (64)));
-     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-     uint32_t *pdata = work->data;
-     uint32_t *ptarget = work->target;
-     uint32_t n = pdata[19];
-     const uint32_t first_nonce = pdata[19];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
-     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-     const uint32_t Htarg = ptarget[7];
+   uint32_t hash[8*4] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   __m128i edata[5] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;   // headroom for one 4-lane batch (n advances by 4)
+   __m256i  *noncev = (__m256i*)vdata + 9;
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const uint32_t targ32_d7 = ptarget[7];
+   const __m256i four = m256_const1_64( 4 );
+   const bool bench = opt_benchmark;

-     mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
+   edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
+   edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
+   edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
+   edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );

-     do
-     {
-        *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-             _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+   mm256_intrlv80_4x64( vdata, edata );

-        c11_4way_hash( hash, vdata );
-        pdata[19] = n;
+   *noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32(
+                                           0, 3, 0, 2, 0, 1, 0, 0 ) );
+   blake512_4way_prehash_le( &blake512_4way_ctx, c11_4way_midstate, vdata );

-        for ( int i = 0; i < 4; i++ )
-        if ( ( ( hash+(i<<3) )[7] <= Htarg )
-            && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-        {
-           pdata[19] = n+i;
-           submit_solution( work, hash+(i<<3), mythr );
-        }
-        n += 4;
-     } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
-     *hashes_done = n - first_nonce + 1;
-     return 0;
+   do
+   {
+      if ( likely( c11_4way_hash( hash, vdata, thr_id ) ) )
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( ( ( hash + ( lane << 3 ) )[7] <= targ32_d7 )
+           && valid_hash( hash +( lane << 3 ), ptarget ) && !bench )
+      {
+         pdata[19] = n + lane;
+         submit_solution( work, hash + ( lane << 3 ), mythr );
+      }
+      *noncev = _mm256_add_epi32( *noncev, four );
+      n += 4;
+   } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
 }

 #endif
--- a/algo/x11/c11-gate.c
+++ b/algo/x11/c11-gate.c
@@ -3,11 +3,9 @@
 bool register_c11_algo( algo_gate_t* gate )
 {
 #if defined (C11_8WAY)
-  init_c11_8way_ctx();
  gate->scanhash  = (void*)&scanhash_c11_8way;
  gate->hash      = (void*)&c11_8way_hash;
 #elif defined (C11_4WAY)
-  init_c11_4way_ctx();
  gate->scanhash  = (void*)&scanhash_c11_4way;
  gate->hash      = (void*)&c11_4way_hash;
 #else
--- a/algo/x11/c11-gate.h
+++ b/algo/x11/c11-gate.h
@@ -14,14 +14,14 @@
 bool register_c11_algo( algo_gate_t* gate );
 #if defined(C11_8WAY)

-void c11_8way_hash( void *state, const void *input );
+int c11_8way_hash( void *state, const void *input, int thr_id );
 int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-void init_c11_8way_ctx();
+// init_c11_8way_ctx() removed: there is no global context left to pre-initialize.

 #elif defined(C11_4WAY)

-void c11_4way_hash( void *state, const void *input );
+int c11_4way_hash( void *state, const void *input, int thr_id );
 int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_c11_4way_ctx();
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -163,7 +163,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
-            bmw512_8way_update( &ctx.bmw, vhash, size );
+               bmw512_8way_update( &ctx.bmw, vhash, size );
            }
            bmw512_8way_close( &ctx.bmw, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -31,7 +31,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
      if ( !thr_id )
-          applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
+          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
   }

@@ -85,7 +85,7 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
      if ( !thr_id )
-          applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
+          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
   }

--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -264,10 +264,8 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
   edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );

   mm512_intrlv80_8x64( vdata, edata );
-
-   *noncev = mm512_intrlv_blend_32( *noncev,
-                           _mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
-                                             0, n+3, 0, n+2, 0, n+1, 0, n ) );
+   *noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
+                                    0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) );
   blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
   
   do
@@ -279,7 +277,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
         extr_lane_8x32( lane_hash, hash32, lane, 256 );
         if ( likely( valid_hash( lane_hash, ptarget ) ) )
         {
-            pdata[19] =  n + lane;
+            pdata[19] = n + lane;
            submit_solution( work, lane_hash, mythr );
         }
      }
@@ -291,8 +289,6 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
   return 0;
 }

-
-
 #elif defined(X17_4WAY)

 union _x17_4way_context_overlay
@@ -322,6 +318,9 @@ union _x17_4way_context_overlay
 };  
 typedef union _x17_4way_context_overlay x17_4way_context_overlay;

+static __thread __m256i x17_4way_midstate[16] __attribute__((aligned(64)));
+static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
+
 int x17_4way_hash( void *state, const void *input, int thr_id )
 {
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
@@ -333,7 +332,10 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
     uint64_t hash3[8] __attribute__ ((aligned (32)));
     x17_4way_context_overlay ctx;

-     blake512_4way_full( &ctx.blake, vhash, input, 80 );
+     blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
+                             x17_4way_midstate );
+     
+//     blake512_4way_full( &ctx.blake, vhash, input, 80 );

     bmw512_4way_init( &ctx.bmw );
     bmw512_4way_update( &ctx.bmw, vhash, 64 );
@@ -449,4 +451,54 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
     return 1;
 }

+int scanhash_x17_4way( struct work *work, uint32_t max_nonce,  // scans nonces 4 at a time starting at work->data[19]; always returns 0
+                   uint64_t *hashes_done, struct thr_info *mythr )  // *hashes_done receives the count of nonces scanned
+{
+   uint32_t hash32[8*4] __attribute__ ((aligned (128)));  // output buffer of x17_4way_hash: 8 words for each of 4 lanes
+   uint32_t vdata[20*4] __attribute__ ((aligned (32)));  // 20 words x 4 lanes, filled by mm256_intrlv80_4x64
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));  // single-lane hash extracted for the full target check
+   __m128i edata[5] __attribute__ ((aligned (32)));  // 5 x 16 bytes staged from pdata before interleaving
+   uint32_t *pdata = work->data;
+   uint32_t *hash32_d7 = &(hash32[7*4]);  // indexed by lane below; word 7 of each lane's hash
+   const uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;  // loop ends once n reaches this bound
+   __m256i  *noncev = (__m256i*)vdata + 9;  // 10th 256-bit vector of vdata; presumably the per-lane nonce words - TODO confirm against interleave layout
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const uint32_t targ32_d7 = ptarget[7];
+   const __m256i four = m256_const1_64( 4 );  // per-iteration nonce step; presumably 4 in every 64-bit element - TODO confirm
+   const bool bench = opt_benchmark;  // when set, no candidate is checked or submitted
+
+   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );  // edata[0..4] are built from pdata chunks 0..4 via mm128_swap64_32
+   edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
+   edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
+   edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
+   edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
+
+   mm256_intrlv80_4x64( vdata, edata );
+   *noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );  // adds 0,1,2,3 to the low 32 bits of 64-bit elements 0..3
+   blake512_4way_prehash_le( &blake512_4way_ctx, x17_4way_midstate, vdata );  // done once before the loop; x17_4way_hash later passes the same ctx/midstate to blake512_4way_final_le
+
+   do
+   {
+      if ( likely( x17_4way_hash( hash32, vdata, thr_id ) ) )  // a zero return skips the lane checks for this batch
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) )  // cheap single-word filter before the full comparison
+      {
+         extr_lane_4x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) ) )
+         {
+            pdata[19] = n + lane;  // store this lane's nonce in the work data before submitting
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      *noncev = _mm256_add_epi32( *noncev, four );  // advance the vector nonces in step with n
+      n += 4;
+   } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );  // stop at the range bound or when a restart is flagged
+   pdata[19] = n;  // leave the next unscanned nonce in the work data
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
 #endif
--- a/algo/x17/x17-gate.c
+++ b/algo/x17/x17-gate.c
@@ -6,7 +6,8 @@ bool register_x17_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x17_8way;
  gate->hash      = (void*)&x17_8way_hash;
 #elif defined (X17_4WAY)
-  gate->scanhash  = (void*)&scanhash_4way_64in_32out;
+  gate->scanhash  = (void*)&scanhash_x17_4way;
+//  gate->scanhash  = (void*)&scanhash_4way_64in_32out;
  gate->hash      = (void*)&x17_4way_hash;
 #else
  gate->hash      = (void*)&x17_hash;
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -581,10 +581,8 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
   edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );   

   mm512_intrlv80_8x64( vdata, edata );
-
-   *noncev = mm512_intrlv_blend_32( *noncev,
-                           _mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
-                                             0, n+3, 0, n+2, 0, n+1, 0, n ) );
+   *noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
+                       0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0 ) );
   blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata ); 

   do
@@ -941,9 +939,8 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
   edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );

   mm256_intrlv80_4x64( vdata, edata );
-
-   *noncev = mm256_intrlv_blend_32( *noncev,
-                           _mm256_set_epi32( 0, n+3, 0, n+2, 0, n+1, 0, n ) );
+   *noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32(
+                                                 0, 3, 0, 2, 0, 1, 0, 0 ) );
   blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata );
   
   do