v3.8.0

2025-09-17 23:44:27 +00:00 · 2018-01-23 21:02:16 -05:00
parent a90d75b8f5
commit ad2275f74a
121 changed files with 4662 additions and 467 deletions
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -1,6 +1,6 @@
 #include "blake-gate.h"

-#if defined (__AVX__)
+#if defined (BLAKE_4WAY)

 #include "blake-hash-4way.h"
 #include <string.h>
--- a/algo/blake/blake-gate.c
+++ b/algo/blake/blake-gate.c
@@ -7,6 +7,7 @@ int64_t blake_get_max64 ()

 bool register_blake_algo( algo_gate_t* gate )
 {
+  gate->optimizations = AVX2_OPT;
  gate->get_max64 = (void*)&blake_get_max64;
 //#if defined (__AVX2__) && defined (FOUR_WAY)
 //   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
@@ -14,7 +15,6 @@ bool register_blake_algo( algo_gate_t* gate )
 //  gate->hash      = (void*)&blakehash_8way;
 #if defined(BLAKE_4WAY)
  four_way_not_tested();
-  gate->optimizations = FOUR_WAY_OPT;
  gate->scanhash  = (void*)&scanhash_blake_4way;
  gate->hash      = (void*)&blakehash_4way;
 #else
--- a/algo/blake/blake-gate.h
+++ b/algo/blake/blake-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(__AVX2__)
  #define BLAKE_4WAY
 #endif

--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
@@ -78,6 +78,8 @@ static const sph_u64 IV512[8] = {

 #if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64

+// Blake-256 4 & 8 way, Blake-512 4way
+
 static const unsigned sigma[16][16] = {
 	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
 	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
@@ -273,6 +275,8 @@ static const unsigned sigma[16][16] = {
 #define Mx_(n)      Mx__(n)
 #define Mx__(n)     M ## n

+// Blake-256 4 & 8 way
+
 #define CSx(r, i)   CSx_(Z ## r ## i)
 #define CSx_(n)     CSx__(n)
 #define CSx__(n)    CS ## n
@@ -311,6 +315,8 @@ static const sph_u32 CS[16] = {

 #if defined(__AVX2__)

+// Blake-512 4 way
+
 #define CBx(r, i)   CBx_(Z ## r ## i)
 #define CBx_(n)     CBx__(n)
 #define CBx__(n)    CB ## n
@@ -401,6 +407,35 @@ do { \

 #if defined (__AVX2__)

+// BLAKE256 8 WAY
+
+#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) /* one G mixing step on __m256i vectors of 32-bit words; a, b, c, d are updated in place */ \
+do { \
+   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
+                 _mm256_set1_epi32( c1 ), m0 ), b ), a ); /* a += b + (m0 ^ c1), with c1 broadcast to every element */ \
+   d = mm256_rotr_32( _mm256_xor_si256( d, a ), 16 ); /* mm256_rotr_32 is defined elsewhere; presumably a per-element right rotate */ \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_rotr_32( _mm256_xor_si256( b, c ), 12 ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
+                 _mm256_set1_epi32( c0 ), m1 ), b ), a ); /* second half: a += b + (m1 ^ c0) */ \
+   d = mm256_rotr_32( _mm256_xor_si256( d, a ), 8 ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_rotr_32( _mm256_xor_si256( b, c ), 7 ); \
+} while (0)
+
+#define ROUND_S_8WAY(r)   do { /* one round r: eight GS_8WAY calls over the V0..VF working variables of the expanding scope */ \
+        GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); /* Mx/CSx pick the message word and constant for (round, index) */ \
+        GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
+        GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
+        GS_8WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
+        GS_8WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); /* last four calls use the shifted groupings V0,V5,VA,VF ... V3,V4,V9,VE */ \
+        GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
+        GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
+        GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
+} while (0)
+
+// Blake-512 4 way
+
 #define GB_4WAY(m0, m1, c0, c1, a, b, c, d)   do { \
   a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
                 _mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
@@ -627,6 +662,125 @@ do { \

 #if defined (__AVX2__)

+// Blake-256 8 way
+
+#define DECL_STATE32_8WAY /* declares the local working copies filled by READ_STATE32_8WAY and stored by WRITE_STATE32_8WAY */ \
+   __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
+   __m256i S0, S1, S2, S3; \
+   sph_u32 T0, T1;
+
+#define READ_STATE32_8WAY(state) /* copy H[0..7], S[0..3], T0 and T1 from *state into the locals declared by DECL_STATE32_8WAY */ \
+do { \
+   H0 = (state)->H[0]; \
+   H1 = (state)->H[1]; \
+   H2 = (state)->H[2]; \
+   H3 = (state)->H[3]; \
+   H4 = (state)->H[4]; \
+   H5 = (state)->H[5]; \
+   H6 = (state)->H[6]; \
+   H7 = (state)->H[7]; \
+   S0 = (state)->S[0]; \
+   S1 = (state)->S[1]; \
+   S2 = (state)->S[2]; \
+   S3 = (state)->S[3]; \
+   T0 = (state)->T0; \
+   T1 = (state)->T1; \
+} while (0)
+
+#define WRITE_STATE32_8WAY(state) /* store the local H0..H7, S0..S3, T0 and T1 back into *state (inverse of READ_STATE32_8WAY) */ \
+do { \
+   (state)->H[0] = H0; \
+   (state)->H[1] = H1; \
+   (state)->H[2] = H2; \
+   (state)->H[3] = H3; \
+   (state)->H[4] = H4; \
+   (state)->H[5] = H5; \
+   (state)->H[6] = H6; \
+   (state)->H[7] = H7; \
+   (state)->S[0] = S0; \
+   (state)->S[1] = S1; \
+   (state)->S[2] = S2; \
+   (state)->S[3] = S3; \
+   (state)->T0 = T0; \
+   (state)->T1 = T1; \
+} while (0)
+
+#define COMPRESS32_8WAY( rounds ) /* compress the 16 vectors at buf into H0..H7; needs H0..H7, S0..S3, T0, T1 and buf in the expanding scope */ \
+do { \
+   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
+   __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
+   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
+   __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
+   V0 = H0; /* V0..V7 start as the chaining value */ \
+   V1 = H1; \
+   V2 = H2; \
+   V3 = H3; \
+   V4 = H4; \
+   V5 = H5; \
+   V6 = H6; \
+   V7 = H7; \
+   V8 = _mm256_xor_si256( S0, _mm256_set1_epi32( CS0 ) ); /* V8..VB = salt ^ constants CS0..CS3 */ \
+   V9 = _mm256_xor_si256( S1, _mm256_set1_epi32( CS1 ) ); \
+   VA = _mm256_xor_si256( S2, _mm256_set1_epi32( CS2 ) ); \
+   VB = _mm256_xor_si256( S3, _mm256_set1_epi32( CS3 ) ); \
+   VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS4 ) ); /* VC, VD use T0; VE, VF use T1 */ \
+   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
+   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
+   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
+   M0 = mm256_byteswap_32( * buf ); /* message words pass through mm256_byteswap_32 (defined elsewhere) before use */ \
+   M1 = mm256_byteswap_32( *(buf+1) ); \
+   M2 = mm256_byteswap_32( *(buf+2) ); \
+   M3 = mm256_byteswap_32( *(buf+3) ); \
+   M4 = mm256_byteswap_32( *(buf+4) ); \
+   M5 = mm256_byteswap_32( *(buf+5) ); \
+   M6 = mm256_byteswap_32( *(buf+6) ); \
+   M7 = mm256_byteswap_32( *(buf+7) ); \
+   M8 = mm256_byteswap_32( *(buf+8) ); \
+   M9 = mm256_byteswap_32( *(buf+9) ); \
+   MA = mm256_byteswap_32( *(buf+10) ); \
+   MB = mm256_byteswap_32( *(buf+11) ); \
+   MC = mm256_byteswap_32( *(buf+12) ); \
+   MD = mm256_byteswap_32( *(buf+13) ); \
+   ME = mm256_byteswap_32( *(buf+14) ); \
+   MF = mm256_byteswap_32( *(buf+15) ); \
+   ROUND_S_8WAY(0); /* rounds 0..7 always run */ \
+   ROUND_S_8WAY(1); \
+   ROUND_S_8WAY(2); \
+   ROUND_S_8WAY(3); \
+   ROUND_S_8WAY(4); \
+   ROUND_S_8WAY(5); \
+   ROUND_S_8WAY(6); \
+   ROUND_S_8WAY(7); \
+   if (rounds == 14) /* only the value 14 adds six more rounds (8, 9, then 0..3 again); any other value stops at 8 */ \
+   { \
+      ROUND_S_8WAY(8); \
+      ROUND_S_8WAY(9); \
+      ROUND_S_8WAY(0); \
+      ROUND_S_8WAY(1); \
+      ROUND_S_8WAY(2); \
+      ROUND_S_8WAY(3); \
+   } \
+   H0 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), \
+                                                              S0 ), H0 ); /* feed-forward: Hi ^= Vi ^ V(i+8) ^ S(i mod 4) */ \
+   H1 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), \
+                                                              S1 ), H1 ); \
+   H2 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), \
+                                                              S2 ), H2 ); \
+   H3 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), \
+                                                              S3 ), H3 ); \
+   H4 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), \
+                                                              S0 ), H4 ); \
+   H5 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), \
+                                                              S1 ), H5 ); \
+   H6 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), \
+                                                              S2 ), H6 ); \
+   H7 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), \
+                                                              S3 ), H7 ); \
+} while (0)
+
+
+// Blake-512 4 way
+
 #define DECL_STATE64_4WAY \
 	__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
        __m256i S0, S1, S2, S3; \
@@ -813,7 +967,7 @@ do { \

 #endif

-static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
+static const sph_u32 salt_zero_4way_small[4] = { 0, 0, 0, 0 };

 static void
 blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
@@ -934,6 +1088,129 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,

 #if defined (__AVX2__)

+// Blake-256 8 way
+
+static const sph_u32 salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };   // all-zero salt; blake32_8way_init reads only entries 0..3
+
+static void
+blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
+                   const sph_u32 *salt, int rounds )   // reset sc from iv[0..7], salt[0..3] and the requested round count
+{
+   int i;
+   for ( i = 0; i < 8; i++ )
+      sc->H[i] = _mm256_set1_epi32( iv[i] );   // same initial value broadcast to every 32-bit element
+   for ( i = 0; i < 4; i++ )
+      sc->S[i] = _mm256_set1_epi32( salt[i] );
+   sc->T0 = sc->T1 = 0;   // counter words, advanced by blake32_8way for each compressed block
+   sc->ptr = 0;           // nothing buffered yet
+   sc->rounds = rounds;   // handed to COMPRESS32_8WAY, which runs 14 rounds only when this equals 14
+}
+
+static void
+blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )   // absorb len bytes (per lane) of vector input into sc
+{
+   __m256i *vdata = (__m256i*)data;   // input is consumed as whole __m256i vectors; presumably one 32-bit word per lane
+   __m256i *buf;
+   size_t ptr;
+   const int buf_size = 64;   // block size in bytes per lane: 16 buffered vectors x 4 bytes
+   DECL_STATE32_8WAY
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+   if ( len < buf_size - ptr )   // input does not complete a block: buffer it and return without compressing
+   {
+        memcpy_256( buf + (ptr>>2), vdata, len>>2 );   // byte counts >> 2 give vector counts (memcpy_256 is defined elsewhere)
+        ptr += len;
+        sc->ptr = ptr;
+        return;
+   }
+
+   READ_STATE32_8WAY(sc);
+   while ( len > 0 )
+   {
+      size_t clen;
+
+      clen = buf_size - ptr;   // room left in the current block
+      if (clen > len)
+           clen = len;
+      memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
+      ptr += clen;
+      vdata += (clen>>2);
+      len -= clen;
+      if ( ptr == buf_size )   // block complete: advance the counter, then compress
+      {
+          if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )   // T0 += 512 (64 bytes x 8); a wrap carries into T1
+                T1 = SPH_T32(T1 + 1);
+          COMPRESS32_8WAY( sc->rounds );
+          ptr = 0;
+      }
+   }
+   WRITE_STATE32_8WAY(sc);
+   sc->ptr = ptr;
+}
+
+static void
+blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
+                    void *dst, size_t out_size_w32 )   // pad, run the final compression(s), write out_size_w32 vectors to dst; ub and n are never read
+{
+   union {
+        __m256i buf[16];
+        sph_u32 dummy;
+   } u;
+   size_t ptr, k;
+   unsigned bit_len;
+   sph_u32 th, tl;
+   __m256i *out;
+
+   ptr = sc->ptr;
+   bit_len = ((unsigned)ptr << 3);   // buffered bytes -> bits
+   u.buf[ptr>>2] = _mm256_set1_epi32( 0x80 );   // padding marker in the first unused word
+   tl = sc->T0 + bit_len;   // tl/th: counter including the buffered bits, written into the last two words below
+   th = sc->T1;
+
+   if ( ptr == 0 )   // the three cases pre-bias T0/T1 so the +512 applied inside blake32_8way lands on the intended counter
+   {
+        sc->T0 = SPH_C32(0xFFFFFE00UL);   // -512: becomes 0 after the +512, and the wrap carries T1 back to 0
+        sc->T1 = SPH_C32(0xFFFFFFFFUL);
+   }
+   else if ( sc->T0 == 0 )
+   {
+        sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;   // becomes bit_len after the +512; the wrap restores T1
+        sc->T1 = SPH_T32(sc->T1 - 1);
+   }
+   else
+        sc->T0 -= 512 - bit_len;   // becomes T0 + bit_len after the +512
+
+   if ( ptr <= 52 )   // padding and the two counter words fit in the current block
+   {
+       memset_zero_256( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
+       if (out_size_w32 == 8)   // marker word is set only for the 8-word output size
+           u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2],
+                                           _mm256_set1_epi32( 0x01000000UL ) );
+       *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) );
+       *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) );
+       blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
+   }
+   else   // not enough room: finish this block with zeros, then process one more block holding the counter
+   {
+        memset_zero_256( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+        blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
+        sc->T0 = SPH_C32(0xFFFFFE00UL);   // same bias as the ptr == 0 case for the extra block
+        sc->T1 = SPH_C32(0xFFFFFFFFUL);
+        memset_zero_256( u.buf, 56>>2 );
+       if (out_size_w32 == 8)
+           u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
+        *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) );
+        *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) );
+        blake32_8way( sc, u.buf, 64 );
+   }
+   out = (__m256i*)dst;
+   for ( k = 0; k < out_size_w32; k++ )
+        out[k] = mm256_byteswap_32( sc->H[k] );   // dst must hold out_size_w32 __m256i values
+}
+
+// Blake-512 4 way
+
 static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };

 static void
@@ -1065,11 +1342,13 @@ blake64_4way_close( blake_4way_big_context *sc,

 #endif

+// Blake-256 4 way & 8 way
+
 // default 14 rounds, backward copatibility
 void
 blake256_4way_init(void *cc)
 {
-   blake32_4way_init( cc, IV256, salt_zero_small, 14 );
+   blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 );
 }

 void
@@ -1084,10 +1363,31 @@ blake256_4way_close(void *cc, void *dst)
        blake32_4way_close(cc, 0, 0, dst, 8);
 }

-// 14 rounds blake, decred
+#if defined(__AVX2__)
+void
+blake256_8way_init(void *cc)   // default variant: IV256, zero salt, 14 rounds
+{
+   blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
+}
+
+void
+blake256_8way(void *cc, const void *data, size_t len)   // thin wrapper over blake32_8way
+{
+        blake32_8way(cc, data, len);
+}
+
+void
+blake256_8way_close(void *cc, void *dst)   // finalize and write 8 output vectors to dst
+{
+        blake32_8way_close(cc, 0, 0, dst, 8);
+}
+
+#endif
+
+// 14 rounds Blake, Decred
 void blake256r14_4way_init(void *cc)
 {
-   blake32_4way_init( cc, IV256, salt_zero_small, 14 );
+   blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 );
 }

 void
@@ -1102,10 +1402,31 @@ blake256r14_4way_close(void *cc, void *dst)
   blake32_4way_close(cc, 0, 0, dst, 8);
 }

-// 8 rounds blakecoin, vanilla
+#if defined(__AVX2__)
+
+void blake256r14_8way_init(void *cc)   // 14 rounds; same arguments as blake256_8way_init
+{
+   blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
+}
+
+void
+blake256r14_8way(void *cc, const void *data, size_t len)   // thin wrapper over blake32_8way
+{
+   blake32_8way(cc, data, len);
+}
+
+void
+blake256r14_8way_close(void *cc, void *dst)   // finalize and write 8 output vectors to dst
+{
+   blake32_8way_close(cc, 0, 0, dst, 8);
+}
+
+#endif
+
+// 8 rounds Blakecoin, Vanilla
 void blake256r8_4way_init(void *cc)
 {
-   blake32_4way_init( cc, IV256, salt_zero_small, 8 );
+   blake32_4way_init( cc, IV256, salt_zero_4way_small, 8 );
 }

 void
@@ -1122,6 +1443,29 @@ blake256r8_4way_close(void *cc, void *dst)

 #if defined (__AVX2__)

+void blake256r8_8way_init(void *cc)   // 8-round variant: IV256, zero salt
+{
+   blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 );
+}
+
+void
+blake256r8_8way(void *cc, const void *data, size_t len)   // thin wrapper over blake32_8way
+{
+   blake32_8way(cc, data, len);
+}
+
+void
+blake256r8_8way_close(void *cc, void *dst)   // finalize and write 8 output vectors to dst
+{
+   blake32_8way_close(cc, 0, 0, dst, 8);
+}
+
+#endif
+
+// Blake-512 4 way
+
+#if defined (__AVX2__)
+
 void
 blake512_4way_init(void *cc)
 {
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -51,6 +51,11 @@ extern "C"{

 #define SPH_SIZE_blake512   512

+// With AVX, only Blake-256 4 way is available.
+// With AVX2, Blake-256 8 way and Blake-512 4 way are also available.
+
+// Blake-256 4 way
+
 typedef struct {
   __m128i buf[16] __attribute__ ((aligned (64)));
   __m128i H[8];
@@ -80,6 +85,37 @@ void blake256r8_4way_close(void *cc, void *dst);

 #ifdef __AVX2__

+// Blake-256 8 way
+
+typedef struct {
+   __m256i buf[16] __attribute__ ((aligned (64)));   // one message block: 16 word vectors
+   __m256i H[8];    // chaining value
+   __m256i S[4];    // salt
+   size_t ptr;      // fill level of buf; the .c code indexes buf with ptr>>2 and treats 64 as full
+   sph_u32 T0, T1;  // counter words, advanced by 512 per compressed block
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
+} blake_8way_small_context;
+
+// Default variant, 14 rounds. cc points to a blake256_8way_context.
+typedef blake_8way_small_context blake256_8way_context;
+void blake256_8way_init(void *cc);
+void blake256_8way(void *cc, const void *data, size_t len);
+void blake256_8way_close(void *cc, void *dst);
+
+// 14 rounds: blake, decred. Same behaviour as the default variant above.
+typedef blake_8way_small_context blake256r14_8way_context;
+void blake256r14_8way_init(void *cc);
+void blake256r14_8way(void *cc, const void *data, size_t len);
+void blake256r14_8way_close(void *cc, void *dst);
+
+// 8 rounds: blakecoin, vanilla.
+typedef blake_8way_small_context blake256r8_8way_context;
+void blake256r8_8way_init(void *cc);
+void blake256r8_8way(void *cc, const void *data, size_t len);
+void blake256r8_8way_close(void *cc, void *dst);
+
+// Blake-512 4 way
+
 typedef struct {
        __m256i buf[16] __attribute__ ((aligned (64)));
        __m256i H[8];
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -3,7 +3,7 @@
 #include <string.h>
 #include <stdint.h>

-#include "crypto/blake2s.h"
+#include "sph-blake2s.h"

 static __thread blake2s_state s_midstate;
 static __thread blake2s_state s_ctx;
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -1,6 +1,6 @@
 #include "blakecoin-gate.h"

-#if defined (__AVX__)
+#if defined (BLAKECOIN_4WAY)

 #include "blake-hash-4way.h"
 #include <string.h>
--- a/algo/blake/blakecoin-gate.c
+++ b/algo/blake/blakecoin-gate.c
@@ -15,13 +15,13 @@ void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
                     uint32_t *end_nonce_ptr, bool clean_job )
 {
   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
-// 
+ 
 //   if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
 //      algo_gate.stratum_gen_work( &stratum, g_work );

   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size ) 
   || ( *nonceptr >= *end_nonce_ptr )
-   || (  work->job_id != g_work->job_id ) && clean_job  )
+   || ( (  work->job_id != g_work->job_id ) && clean_job ) )
 /*
   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
      && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
@@ -47,7 +47,6 @@ bool register_vanilla_algo( algo_gate_t* gate )
 {
 #if defined(BLAKECOIN_4WAY)
 //  four_way_not_tested();
-  gate->optimizations = FOUR_WAY_OPT;
  gate->scanhash  = (void*)&scanhash_blakecoin_4way;
  gate->hash      = (void*)&blakecoin_4way_hash;
 //  gate->get_new_work = (void*)&bc4w_get_new_work;
@@ -57,7 +56,7 @@ bool register_vanilla_algo( algo_gate_t* gate )
  gate->hash     = (void*)&blakecoinhash;
 //  blakecoin_init( &blake_init_ctx );
 #endif
-  gate->optimizations = AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = AVX2_OPT;
  gate->get_max64 = (void*)&blakecoin_get_max64;
  return true;
 }
--- a/algo/blake/blakecoin-gate.h
+++ b/algo/blake/blakecoin-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(__AVX2__)
  #define BLAKECOIN_4WAY
 #endif

--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -145,15 +145,13 @@ bool register_decred_algo( algo_gate_t* gate )
 {
 #if defined(DECRED_4WAY)
  four_way_not_tested();
-  gate->optimizations = FOUR_WAY_OPT;
  gate->scanhash  = (void*)&scanhash_decred_4way;
  gate->hash      = (void*)&decred_hash_4way;
 #else
-  gate->optimizations = SSE2_OPT;
  gate->scanhash  = (void*)&scanhash_decred;
  gate->hash      = (void*)&decred_hash;
 #endif
-
+  gate->optimizations = AVX2_OPT;
  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
  gate->display_extra_data    = (void*)&decred_decode_extradata;
--- a/algo/blake/decred-gate.h
+++ b/algo/blake/decred-gate.h
@@ -18,7 +18,7 @@
 //                         uint64_t *hashes_done );
 #endif

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(__AVX2__)
  #define DECRED_4WAY
 #endif

--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -1,6 +1,6 @@
 #include "pentablake-gate.h"

-#ifdef __AVX2__
+#if defined (__AVX2__)

 #include <stdlib.h>
 #include <stdint.h>
--- a/algo/blake/pentablake-gate.c
+++ b/algo/blake/pentablake-gate.c
@@ -9,7 +9,7 @@ bool register_pentablake_algo( algo_gate_t* gate )
    gate->scanhash  = (void*)&scanhash_pentablake;
    gate->hash      = (void*)&pentablakehash;
 #endif
-    gate->optimizations = FOUR_WAY_OPT;
+    gate->optimizations = AVX2_OPT;
    gate->get_max64 = (void*)&get_max64_0x3ffff;
    return true;
 };
--- a/algo/blake/pentablake-gate.h
+++ b/algo/blake/pentablake-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX2__)
+#if defined(__AVX2__)
  #define PENTABLAKE_4WAY
 #endif

--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -0,0 +1,378 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "algo/sha/sph_types.h"
+#include "sph-blake2s.h"
+
+static const uint32_t blake2s_IV[8] =   // initial h[] values (blake2s_init0); also mixed into v[8..15] by blake2s_compress
+{
+	0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+	0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static const uint8_t blake2s_sigma[10][16] =   // message word order per round; row r is indexed by ROUND(r) in blake2s_compress
+{
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
+};
+
+static inline int blake2s_set_lastnode( blake2s_state *S )   // set flag word f[1] to all ones; always returns 0
+{
+	S->f[1] = ~0U;
+	return 0;
+}
+
+static inline int blake2s_clear_lastnode( blake2s_state *S )   // clear flag word f[1]; always returns 0
+{
+	S->f[1] = 0U;
+	return 0;
+}
+
+/* Flag helpers. Set f[0] to all ones; when S->last_node is set, f[1] is set too. Always returns 0. */
+static inline int blake2s_set_lastblock( blake2s_state *S )
+{
+	if( S->last_node ) blake2s_set_lastnode( S );
+
+	S->f[0] = ~0U;   // f[0] is mixed into v[14] by blake2s_compress
+	return 0;
+}
+
+static inline int blake2s_clear_lastblock( blake2s_state *S )   // undo blake2s_set_lastblock: clear f[0], and f[1] when last_node is set
+{
+	if( S->last_node ) blake2s_clear_lastnode( S );
+
+	S->f[0] = 0U;
+	return 0;
+}
+
+static inline int blake2s_increment_counter( blake2s_state *S, const uint32_t inc )   // add inc to the two-word counter t[0], t[1]
+{
+	S->t[0] += inc;
+	S->t[1] += ( S->t[0] < inc );   // t[0] wrapped around iff it is now smaller than inc: carry into t[1]
+	return 0;
+}
+
+// Parameter-block setters. Each stores one field of blake2s_param and returns 0.
+static inline int blake2s_param_set_digest_length( blake2s_param *P, const uint8_t digest_length )
+{
+	P->digest_length = digest_length;   // stored as-is; no range check here
+	return 0;
+}
+
+static inline int blake2s_param_set_fanout( blake2s_param *P, const uint8_t fanout )   // store the fanout field; returns 0
+{
+	P->fanout = fanout;
+	return 0;
+}
+
+static inline int blake2s_param_set_max_depth( blake2s_param *P, const uint8_t depth )   // store the depth field; returns 0
+{
+	P->depth = depth;
+	return 0;
+}
+
+static inline int blake2s_param_set_leaf_length( blake2s_param *P, const uint32_t leaf_length )   // store leaf_length via store32; returns 0
+{
+	store32( &P->leaf_length, leaf_length );   // store32 writes the value least-significant byte first
+	return 0;
+}
+
+static inline int blake2s_param_set_node_offset( blake2s_param *P, const uint64_t node_offset )   // store the low 48 bits of node_offset; returns 0
+{
+	store48( P->node_offset, node_offset );   // node_offset is a 6-byte field; store48 writes exactly 6 bytes
+	return 0;
+}
+
+static inline int blake2s_param_set_node_depth( blake2s_param *P, const uint8_t node_depth )   // store the node_depth field; returns 0
+{
+	P->node_depth = node_depth;
+	return 0;
+}
+
+static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_t inner_length )   // store the inner_length field; returns 0
+{
+	P->inner_length = inner_length;
+	return 0;
+}
+
+static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] )   // copy BLAKE2S_SALTBYTES bytes into P->salt; returns 0
+{
+	memcpy( P->salt, salt, BLAKE2S_SALTBYTES );
+	return 0;
+}
+
+static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] )   // copy BLAKE2S_PERSONALBYTES bytes into P->personal; returns 0
+{
+	memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES );
+	return 0;
+}
+
+static inline int blake2s_init0( blake2s_state *S )   // zero the whole state, then load blake2s_IV into h[]; returns 0
+{
+	memset( S, 0, sizeof( blake2s_state ) );   // also clears counters, flags, buffer, buflen and last_node
+
+	for( int i = 0; i < 8; ++i ) S->h[i] = blake2s_IV[i];
+
+	return 0;
+}
+
+/* Initialize S from parameter block P: h[] = IV xor the eight 32-bit words of P. Always returns 0. */
+int blake2s_init_param( blake2s_state *S, const blake2s_param *P )
+{
+	blake2s_init0( S );
+	uint32_t *p = ( uint32_t * )( P );   // view the parameter block as 8 words; each is read through load32
+
+	/* IV XOR ParamBlock */
+	for( size_t i = 0; i < 8; ++i )
+		S->h[i] ^= load32( &p[i] );
+
+	return 0;
+}
+
+
+// Unkeyed sequential initialization for an outlen-byte digest. Returns -1 if outlen is 0 or exceeds BLAKE2S_OUTBYTES.
+int blake2s_init( blake2s_state *S, const uint8_t outlen )
+{
+	blake2s_param P[1];
+
+	/* Move interval verification here? */
+	if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
+
+	P->digest_length = outlen;
+	P->key_length    = 0;   // unkeyed
+	P->fanout        = 1;
+	P->depth         = 1;
+	store32( &P->leaf_length, 0 );
+	store48( &P->node_offset, 0 );
+	P->node_depth    = 0;
+	P->inner_length  = 0;
+	// memset(P->reserved, 0, sizeof(P->reserved) );
+	memset( P->salt,     0, sizeof( P->salt ) );
+	memset( P->personal, 0, sizeof( P->personal ) );
+	return blake2s_init_param( S, P );   // every field of P has been written by this point
+}
+
+int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )   // keyed initialization; returns -1 on a bad outlen, key or keylen
+{
+	blake2s_param P[1];
+
+	if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
+
+	if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1;   // key must be non-NULL and 1..BLAKE2S_KEYBYTES long
+
+	P->digest_length = outlen;
+	P->key_length    = keylen;
+	P->fanout        = 1;
+	P->depth         = 1;
+	store32( &P->leaf_length, 0 );
+	store48( &P->node_offset, 0 );
+	P->node_depth    = 0;
+	P->inner_length  = 0;
+	// memset(P->reserved, 0, sizeof(P->reserved) );
+	memset( P->salt,     0, sizeof( P->salt ) );
+	memset( P->personal, 0, sizeof( P->personal ) );
+
+	if( blake2s_init_param( S, P ) < 0 ) return -1;
+
+	{
+		uint8_t block[BLAKE2S_BLOCKBYTES];
+		memset( block, 0, BLAKE2S_BLOCKBYTES );
+		memcpy( block, key, keylen );   // the key is fed in as one zero-padded block
+		blake2s_update( S, block, BLAKE2S_BLOCKBYTES );
+		secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
+	}
+	return 0;
+}
+
+int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )   // mix one 64-byte block into S->h; always returns 0
+{
+	uint32_t m[16];
+	uint32_t v[16];
+
+	for( size_t i = 0; i < 16; ++i )
+		m[i] = load32( block + i * sizeof( m[i] ) );   // 16 message words, each read with load32
+
+	for( size_t i = 0; i < 8; ++i )
+		v[i] = S->h[i];
+
+	v[ 8] = blake2s_IV[0];
+	v[ 9] = blake2s_IV[1];
+	v[10] = blake2s_IV[2];
+	v[11] = blake2s_IV[3];
+	v[12] = S->t[0] ^ blake2s_IV[4];   // counter words
+	v[13] = S->t[1] ^ blake2s_IV[5];
+	v[14] = S->f[0] ^ blake2s_IV[6];   // flag words set by blake2s_set_lastblock / blake2s_set_lastnode
+	v[15] = S->f[1] ^ blake2s_IV[7];
+#define G(r,i,a,b,c,d) /* mixing step: two message words chosen by blake2s_sigma[r], rotations 16, 12, 8, 7 */ \
+	do { \
+		a = a + b + m[blake2s_sigma[r][2*i+0]]; \
+		d = SPH_ROTR32(d ^ a, 16); \
+		c = c + d; \
+		b = SPH_ROTR32(b ^ c, 12); \
+		a = a + b + m[blake2s_sigma[r][2*i+1]]; \
+		d = SPH_ROTR32(d ^ a, 8); \
+		c = c + d; \
+		b = SPH_ROTR32(b ^ c, 7); \
+	} while(0)
+#define ROUND(r) /* eight G calls: v[0,4,8,12] .. v[3,7,11,15], then v[0,5,10,15] .. v[3,4,9,14] */ \
+	do { \
+		G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+		G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+		G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+		G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+		G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+		G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+		G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+		G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+	} while(0)
+	ROUND( 0 );   // ten rounds, one per row of blake2s_sigma
+	ROUND( 1 );
+	ROUND( 2 );
+	ROUND( 3 );
+	ROUND( 4 );
+	ROUND( 5 );
+	ROUND( 6 );
+	ROUND( 7 );
+	ROUND( 8 );
+	ROUND( 9 );
+
+	for( size_t i = 0; i < 8; ++i )
+		S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];   // feed-forward of both halves of v
+
+#undef G
+#undef ROUND
+	return 0;
+}
+
+
+int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen )   // absorb inlen bytes from in; always returns 0
+{
+	while( inlen > 0 )
+	{
+		size_t left = S->buflen;
+		size_t fill = 2 * BLAKE2S_BLOCKBYTES - left;   // free space in the two-block buffer
+
+		if( inlen > fill )   // strictly more than fits: fill up, compress the first block, keep the second
+		{
+			memcpy( S->buf + left, in, fill ); // Fill buffer
+			S->buflen += fill;
+			blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
+			blake2s_compress( S, S->buf ); // Compress
+			memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left
+			S->buflen -= BLAKE2S_BLOCKBYTES;
+			in += fill;
+			inlen -= fill;
+		}
+		else // inlen <= fill
+		{
+			memcpy(S->buf + left, in, (size_t) inlen);
+			S->buflen += (size_t) inlen; // Be lazy, do not compress
+			in += inlen;
+			inlen -= inlen;   // sets inlen to 0, ending the loop
+		}
+	}
+
+	return 0;
+}
+
+int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )   // finish the hash and copy outlen bytes to out; always returns 0
+{
+	uint8_t buffer[BLAKE2S_OUTBYTES];
+
+	if( S->buflen > BLAKE2S_BLOCKBYTES )   // more than one block buffered: compress the first, keep the remainder
+	{
+		blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
+		blake2s_compress( S, S->buf );
+		S->buflen -= BLAKE2S_BLOCKBYTES;
+		memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen );
+	}
+
+	blake2s_increment_counter( S, ( uint32_t )S->buflen );   // the last block counts only its real bytes
+	blake2s_set_lastblock( S );
+	memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
+	blake2s_compress( S, S->buf );
+
+	for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
+		store32( buffer + sizeof( S->h[i] ) * i, S->h[i] );
+
+	memcpy( out, buffer, outlen );   // NOTE(review): outlen is not checked here; a value above BLAKE2S_OUTBYTES would read past buffer
+	return 0;
+}
+
+int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )   // one-shot hash; returns 0, or -1 on NULL in/out or failed init
+{
+	blake2s_state S[1];
+
+	/* Verify parameters */
+	if ( NULL == in ) return -1;
+
+	if ( NULL == out ) return -1;
+
+	if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */
+
+	if( keylen > 0 )   // keyed when a key is supplied, unkeyed otherwise
+	{
+		if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+	}
+	else
+	{
+		if( blake2s_init( S, outlen ) < 0 ) return -1;   // both init paths reject outlen of 0 or above BLAKE2S_OUTBYTES
+	}
+
+	blake2s_update( S, ( uint8_t * )in, inlen );
+	blake2s_final( S, out, outlen );
+	return 0;
+}
+
+#if defined(BLAKE2S_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h" /* test data not included */
+int main( int argc, char **argv )   // self-test, built only with BLAKE2S_SELFTEST: compares keyed hashes against blake2s_keyed_kat
+{
+	uint8_t key[BLAKE2S_KEYBYTES];
+	uint8_t buf[KAT_LENGTH];
+
+	for( size_t i = 0; i < BLAKE2S_KEYBYTES; ++i )
+		key[i] = ( uint8_t )i;   // key bytes 0, 1, 2, ...
+
+	for( size_t i = 0; i < KAT_LENGTH; ++i )
+		buf[i] = ( uint8_t )i;   // message bytes 0, 1, 2, ...
+
+	for( size_t i = 0; i < KAT_LENGTH; ++i )
+	{
+		uint8_t hash[BLAKE2S_OUTBYTES];
+		blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );   // hash the first i bytes of buf
+
+		if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) )
+		{
+			puts( "error" );
+			return -1;
+		}
+	}
+
+	puts( "ok" );
+	return 0;
+}
+#endif
--- a/algo/blake/sph-blake2s.h
+++ b/algo/blake/sph-blake2s.h
@@ -0,0 +1,150 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+#pragma once
+#ifndef __BLAKE2_H__
+#define __BLAKE2_H__
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <inttypes.h>
+#define inline __inline
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+/* blake2-impl.h */
+
+static inline uint32_t load32(const void *src)   // read a 32-bit word stored least-significant byte first
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return *(uint32_t *)(src);   // direct load when the build defines NATIVE_LITTLE_ENDIAN
+#else
+	const uint8_t *p = (uint8_t *)src;   // portable path: assemble the word byte by byte
+	uint32_t w = *p++;
+	w |= (uint32_t)(*p++) << 8;
+	w |= (uint32_t)(*p++) << 16;
+	w |= (uint32_t)(*p++) << 24;
+	return w;
+#endif
+}
+
+static inline void store32(void *dst, uint32_t w)   // write w as 4 bytes, least-significant byte first
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+	*(uint32_t *)(dst) = w;   // direct store when the build defines NATIVE_LITTLE_ENDIAN
+#else
+	uint8_t *p = (uint8_t *)dst;   // portable path: emit one byte at a time
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w;
+#endif
+}
+
+static inline uint64_t load48(const void *src)   // read 6 bytes, least-significant first, into the low 48 bits of the result
+{
+	const uint8_t *p = (const uint8_t *)src;
+	uint64_t w = *p++;
+	w |= (uint64_t)(*p++) << 8;
+	w |= (uint64_t)(*p++) << 16;
+	w |= (uint64_t)(*p++) << 24;
+	w |= (uint64_t)(*p++) << 32;
+	w |= (uint64_t)(*p++) << 40;
+	return w;
+}
+
+static inline void store48(void *dst, uint64_t w)   // write the low 48 bits of w as 6 bytes, least-significant first
+{
+	uint8_t *p = (uint8_t *)dst;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w;   // bits 48 and above of w are discarded
+}
+
+/* Zero n bytes at v through a volatile pointer so the compiler cannot drop the writes the way it may drop a plain memset(). */
+static inline void secure_zero_memory(void *v, size_t n)
+{
+	volatile uint8_t *p = ( volatile uint8_t * )v;
+
+	while( n-- ) *p++ = 0;
+}
+
+/* blake2.h */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+	enum blake2s_constant   // sizes in bytes used throughout the BLAKE2s code
+	{
+		BLAKE2S_BLOCKBYTES = 64,   // compression block; the state buffer holds two of these
+		BLAKE2S_OUTBYTES   = 32,   // maximum digest length accepted by blake2s_init
+		BLAKE2S_KEYBYTES   = 32,   // maximum key length accepted by blake2s_init_key
+		BLAKE2S_SALTBYTES  = 8,
+		BLAKE2S_PERSONALBYTES = 8
+	};
+
+#pragma pack(push, 1)
+	typedef struct __blake2s_param   // parameter block, packed by the enclosing #pragma pack(push, 1); numbers below are cumulative byte sizes
+	{
+		uint8_t  digest_length; // 1
+		uint8_t  key_length;    // 2
+		uint8_t  fanout;        // 3
+		uint8_t  depth;         // 4
+		uint32_t leaf_length;   // 8
+		uint8_t  node_offset[6];// 14
+		uint8_t  node_depth;    // 15
+		uint8_t  inner_length;  // 16
+		// uint8_t  reserved[0];
+		uint8_t  salt[BLAKE2S_SALTBYTES]; // 24
+		uint8_t  personal[BLAKE2S_PERSONALBYTES];  // 32
+	} blake2s_param;
+
+	ALIGN( 64 ) typedef struct __blake2s_state   // streaming hash state; declared inside the #pragma pack(push, 1) region
+	{
+		uint32_t h[8];   // chaining value
+		uint32_t t[2];   // two-word counter, see blake2s_increment_counter
+		uint32_t f[2];   // flag words: f[0] set by blake2s_set_lastblock, f[1] by blake2s_set_lastnode
+		uint8_t  buf[2 * BLAKE2S_BLOCKBYTES];   // two-block input buffer
+		size_t   buflen;   // bytes currently held in buf
+		uint8_t  last_node;   // when nonzero, blake2s_set_lastblock also sets f[1]
+	} blake2s_state ;
+#pragma pack(pop)
+
+	int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] );   // mixes one block into S->h
+
+	// Streaming API: the init functions return -1 on invalid lengths; the others always return 0
+	int blake2s_init( blake2s_state *S, const uint8_t outlen );
+	int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+	int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
+	int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen );
+	int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen );
+
+	// Simple API: one-shot hash; key may be NULL for an unkeyed hash
+	int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+
+	// Direct Hash Mining Helpers: 32-byte digests, with a 32-byte key or with no key
+	#define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */
+	#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/algo/bmw/bmw-hash-4way.c
+++ b/algo/bmw/bmw-hash-4way.c
@@ -49,6 +49,11 @@ extern "C"{

 // BMW256

+// BMW small has a bug not present in big. Lanes 0 & 2 produce valid hash
+// while lanes 1 & 3 produce invalid hash. The cause is not known.
+
+
+
 static const sph_u32 IV256[] = {
 	SPH_C32(0x40414243), SPH_C32(0x44454647),
 	SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
@@ -116,14 +121,16 @@ static const sph_u64 IV512[] = {
   mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \
                ( ( (j) + (off) ) & 0xF ) + 1 )

+// The multiplication in this macro is a possible cause of the lane
+// corruption but a vectorized mullo did not help.
 #define add_elt_s( M, H, j ) \
   _mm_xor_si128( \
      _mm_add_epi32( \
            _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
                                          rol_off_32( M, j, 3 ) ), \
                           rol_off_32( M, j, 10 ) ), \
-            _mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \
-    H[ ( (j)+7 ) & 0xF ] )
+            _mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) \
+                   ), H[ ( (j)+7 ) & 0xF ] )


 #define expand1s( qt, M, H, i ) \
@@ -160,7 +167,7 @@ static const sph_u64 IV512[] = {
             _mm_add_epi32( \
                _mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \
                _mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \
-             _mm_add_epi64( \
+             _mm_add_epi32( \
                _mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \
                _mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \
         _mm_add_epi32( \
@@ -861,7 +868,27 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
 } 

 // BMW256
-
+/*
+static const uint32_t final_s[16][4] =
+{
+   { 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0 },
+   { 0xaaaaaaa1, 0xaaaaaaa1, 0xaaaaaaa1, 0xaaaaaaa1 },
+   { 0xaaaaaaa2, 0xaaaaaaa2, 0xaaaaaaa2, 0xaaaaaaa2 },
+   { 0xaaaaaaa3, 0xaaaaaaa3, 0xaaaaaaa3, 0xaaaaaaa3 },
+   { 0xaaaaaaa4, 0xaaaaaaa4, 0xaaaaaaa4, 0xaaaaaaa4 },
+   { 0xaaaaaaa5, 0xaaaaaaa5, 0xaaaaaaa5, 0xaaaaaaa5 },
+   { 0xaaaaaaa6, 0xaaaaaaa6, 0xaaaaaaa6, 0xaaaaaaa6 },
+   { 0xaaaaaaa7, 0xaaaaaaa7, 0xaaaaaaa7, 0xaaaaaaa7 },
+   { 0xaaaaaaa8, 0xaaaaaaa8, 0xaaaaaaa8, 0xaaaaaaa8 },
+   { 0xaaaaaaa9, 0xaaaaaaa9, 0xaaaaaaa9, 0xaaaaaaa9 },
+   { 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa },
+   { 0xaaaaaaab, 0xaaaaaaab, 0xaaaaaaab, 0xaaaaaaab },
+   { 0xaaaaaaac, 0xaaaaaaac, 0xaaaaaaac, 0xaaaaaaac },
+   { 0xaaaaaaad, 0xaaaaaaad, 0xaaaaaaad, 0xaaaaaaad },
+   { 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
+   { 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
+};
+*/
 static const __m128i final_s[16] =
 {
   { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
@@ -901,11 +928,12 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
   size_t ptr;
   const int buf_size = 64;  // bytes of one lane, compatible with len

-   sc->bit_count += (sph_u64)len << 3;
+   sc->bit_count += (sph_u32)len << 3;
   buf = sc->buf;
   ptr = sc->ptr;
   h1 = sc->H;
   h2 = htmp;
+
   while ( len > 0 )
   {
      size_t clen;
@@ -938,13 +966,11 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
   __m128i *buf;
   __m128i h1[16], h2[16], *h;
   size_t ptr, u, v;
-   unsigned z;
   const int buf_size = 64;  // bytes of one lane, compatible with len

   buf = sc->buf;
   ptr = sc->ptr;
-   z = 0x80 >> n;
-   buf[ ptr>>2 ] = _mm_set1_epi32( z );
+   buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
   ptr += 4;
   h = sc->H;

@@ -956,12 +982,15 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
      ptr = 0;
      h = h1;
   }
-   memset_zero_128( buf + (ptr>>2), (buf_size - 4 - ptr) >> 2 );
-   buf[ (buf_size - 4) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
+   memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
+   buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
+   buf[ (buf_size - 4) >> 2 ] = mm_zero;
   compress_small( buf, h, h2 );
+
   for ( u = 0; u < 16; u ++ )
      buf[u] = h2[u];
-   compress_small( buf, final_s, h1 );
+   compress_small( buf, (__m128i*)final_s, h1 );
+
   for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
      casti_m128i( dst, u ) = h1[v];
 }
--- a/algo/cryptonight/cryptonight-aesni.c
+++ b/algo/cryptonight/cryptonight-aesni.c
@@ -3,7 +3,8 @@
 #include "cryptonight.h"
 #include "miner.h"
 #include "crypto/c_keccak.h"
-#include "avxdefs.h"
+#include <immintrin.h>
+//#include "avxdefs.h"

 void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
 void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);
--- a/algo/cubehash/sse2/cubehash_sse2.c
+++ b/algo/cubehash/sse2/cubehash_sse2.c
@@ -10,6 +10,10 @@
 #endif
 #include "cubehash_sse2.h"
 #include "algo/sha/sha3-defs.h"
+#include <stdbool.h>
+#include <unistd.h>
+#include <memory.h>
+#include "avxdefs.h"

 static void transform( cubehashParam *sp )
 {
@@ -125,6 +129,18 @@ static void transform( cubehashParam *sp )
 #endif
 }  // transform

+// Cubehash context initialization is very expensive.
+// Cache the initial value for faster reinitialization.
+cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));
+
+int cubehashReinit( cubehashParam *sp ) // restore sp from the context cached by cubehashInit
+{
+   memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) ); // NOTE(review): one global cache, so sp gets the parameters of the most recent cubehashInit; that must have run first
+   return SUCCESS; // always succeeds
+
+}
+
+// Initialize the cache then copy to sp.
 int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
 {
    int i;
@@ -135,24 +151,26 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)

    /* Sanity checks */
    if ( rounds <= 0 || rounds > 32 )
-         rounds = CUBEHASH_ROUNDS;
+       rounds = CUBEHASH_ROUNDS;
    if ( blockbytes <= 0 || blockbytes >= 256)
-         blockbytes = CUBEHASH_BLOCKBYTES;
+       blockbytes = CUBEHASH_BLOCKBYTES;

    // all sizes of __m128i
-    sp->hashlen   = hashbitlen/128;
-    sp->blocksize = blockbytes/16;
-    sp->rounds    = rounds;
-    sp->pos       = 0;
+    cube_ctx_cache.hashlen   = hashbitlen/128;
+    cube_ctx_cache.blocksize = blockbytes/16;
+    cube_ctx_cache.rounds    = rounds;
+    cube_ctx_cache.pos       = 0;

    for ( i = 0; i < 8; ++i )
-         sp->x[i] = _mm_set_epi32(0, 0, 0, 0);
+       cube_ctx_cache.x[i] = _mm_setzero_si128();;

-    sp->x[0] = _mm_set_epi32( 0, rounds, blockbytes, hashbitlen / 8 );
+    cube_ctx_cache.x[0] = _mm_set_epi32( 0, rounds, blockbytes,
+                                         hashbitlen / 8 );

    for ( i = 0; i < 10; ++i )
-         transform(sp);
-//    sp->pos = 0;
+       transform( &cube_ctx_cache );
+
+    memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
    return SUCCESS;
 }

--- a/algo/cubehash/sse2/cubehash_sse2.h
+++ b/algo/cubehash/sse2/cubehash_sse2.h
@@ -29,6 +29,8 @@ extern "C" {
 #endif

 int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes);
+// reinitialize context with same parameters, much faster.
+int cubehashReinit( cubehashParam* sp );

 int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size);

--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -0,0 +1,510 @@
+/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
+/*
+ * Hamsi implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "hamsi-hash-4way.h"
+
+#if defined(__AVX__)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
+ * table lookup during message expansion (1 to 8, inclusive). If we note
+ * w the number of bits per message word (w=32 for Hamsi-224/256, w=64
+ * for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
+ * Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
+ * then we will get t tables (where t=ceil(w/n)) of individual size
+ * 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
+ * n=5, there are 7 tables, but the last one uses only two bits on
+ * input, not five).
+ *
+ * Also, we read t rows of r words from RAM. Words in a given row are
+ * concatenated in RAM in that order, so most of the cost is about
+ * reading the first row word; comparatively, cache misses are thus
+ * less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
+ *
+ * When n=1, tables are "special" in that we omit the first entry of
+ * each table (which always contains 0), so that total table size is
+ * halved.
+ *
+ * We thus have the following (size1 is the cumulative table size of
+ * Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
+ * are for Hamsi-224/256 and Hamsi-384/512, respectively).
+ *
+ *   n      size1      size2    t1    t2
+ * ---------------------------------------
+ *   1       1024       4096    32    64
+ *   2       2048       8192    16    32
+ *   3       2688      10880    11    22
+ *   4       4096      16384     8    16
+ *   5       6272      25600     7    13
+ *   6      10368      41984     6    11
+ *   7      16896      73856     5    10
+ *   8      32768     131072     4     8
+ *
+ * So there is a trade-off: a lower n makes the tables fit better in
+ * L1 cache, but increases the number of memory accesses. The optimal
+ * value depends on the amount of available L1 cache and the relative
+ * impact of a cache miss.
+ *
+ * Experimentally, in ideal benchmark conditions (which are not necessarily
+ * realistic with regards to L1 cache contention), it seems that n=8 is
+ * the best value on "big" architectures (those with 32 kB or more of L1
+ * cache), while n=4 is better on "small" architectures. This was tested
+ * on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
+ * (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
+ * (8 kB L1 cache).
+ *
+ * Note: with n=1, the 32 tables (actually implemented as one big table)
+ * are read entirely and sequentially, regardless of the input data,
+ * thus avoiding any data-dependent table access pattern.
+ */
+
+// Hard coded
+//#define SPH_HAMSI_EXPAND_BIG    1
+
+/*
+#if !defined SPH_HAMSI_EXPAND_SMALL
+#if SPH_SMALL_FOOTPRINT_HAMSI
+#define SPH_HAMSI_EXPAND_SMALL  4
+#else
+#define SPH_HAMSI_EXPAND_SMALL  8
+#endif
+#endif
+
+#if !defined SPH_HAMSI_EXPAND_BIG
+#define SPH_HAMSI_EXPAND_BIG    8
+#endif
+*/
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#include "hamsi-helper-4way.c"
+
+static const sph_u32 IV512[] = { /* bytes spell "steelpark Arenberg 10, bus 2446, B-3001 Leuven-Heverlee, Belgium" */
+	SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
+	SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
+	SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
+	SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
+	SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
+	SPH_C32(0x6769756d)
+};
+
+static const sph_u32 alpha_n[] = { /* 32 constants, one per state word s00..s1F; passed to ROUND_BIG by P_BIG */
+	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
+	SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
+	SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
+	SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
+	SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
+};
+
+static const sph_u32 alpha_f[] = { /* 32 constants, one per state word s00..s1F; passed to ROUND_BIG by PF_BIG */
+	SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
+	SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
+	SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
+	SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
+	SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
+	SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
+	SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
+};
+
+/*
+#define s0   m0
+#define s1   m1
+#define s2   c0
+#define s3   c1
+#define s4   c2
+#define s5   c3
+#define s6   m2
+#define s7   m3
+#define s8   m4
+#define s9   m5
+#define sA   c4
+#define sB   c5
+#define sC   c6
+#define sD   c7
+#define sE   m6
+#define sF   m7
+*/
+
+#define SBOX( a, b, c, d ) /* substitution built only from AND/OR/XOR/NOT; a, b, c, d must be lvalues and are rewritten in place */ \
+do { \
+  __m128i t; \
+  t = a; /* keep the incoming a, which is overwritten on the next line */ \
+  a = _mm_xor_si128( d, _mm_and_si128( a, c ) ); \
+  c = _mm_xor_si128( a, _mm_xor_si128( c, b ) ); \
+  d = _mm_xor_si128( b, _mm_or_si128( d, t ) ); \
+  t = _mm_xor_si128( t, c ); \
+  b = d; \
+  d = _mm_xor_si128( a, _mm_or_si128( d, t ) ); \
+  a = _mm_and_si128( a, b ); \
+  t = _mm_xor_si128( t, a ); \
+  b = _mm_xor_si128( t, _mm_xor_si128( b, d ) ); \
+  a = c; /* final permutation of the four outputs */ \
+  c = b; \
+  b = d; \
+  d = mm_not( t ); \
+} while (0)
+
+#define L( a, b, c, d ) /* mixes four state words using mm_rotl_32, 32-bit left shifts and XORs; arguments are rewritten in place */ \
+do { \
+   a = mm_rotl_32( a, 13 ); \
+   c = mm_rotl_32( c,  3 ); \
+   b = _mm_xor_si128( b, _mm_xor_si128( a, c ) ); \
+   d = _mm_xor_si128( d, _mm_xor_si128( c, _mm_slli_epi32( a, 3 ) ) ); \
+   b = mm_rotl_32( b, 1 ); \
+   d = mm_rotl_32( d, 7 ); \
+   a = _mm_xor_si128( a, _mm_xor_si128( b, d ) ); \
+   c = _mm_xor_si128( c, _mm_xor_si128( d, _mm_slli_epi32( b, 7 ) ) ); \
+   a = mm_rotl_32( a,  5 ); \
+   c = mm_rotl_32( c, 22 ); \
+} while (0)
+
+#define DECL_STATE_BIG /* declares locals c0..cF, loaded from and stored to sc->h[0..15] by READ/WRITE_STATE_BIG */ \
+   __m128i c0, c1, c2, c3, c4, c5, c6, c7; \
+   __m128i c8, c9, cA, cB, cC, cD, cE, cF;
+
+#define READ_STATE_BIG(sc)   do { /* load sc->h[0..15] into the locals c0..cF */ \
+		c0 = sc->h[0x0]; \
+		c1 = sc->h[0x1]; \
+		c2 = sc->h[0x2]; \
+		c3 = sc->h[0x3]; \
+		c4 = sc->h[0x4]; \
+		c5 = sc->h[0x5]; \
+		c6 = sc->h[0x6]; \
+		c7 = sc->h[0x7]; \
+		c8 = sc->h[0x8]; \
+		c9 = sc->h[0x9]; \
+		cA = sc->h[0xA]; \
+		cB = sc->h[0xB]; \
+		cC = sc->h[0xC]; \
+		cD = sc->h[0xD]; \
+		cE = sc->h[0xE]; \
+		cF = sc->h[0xF]; \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* store the locals c0..cF back into sc->h[0..15] */ \
+		sc->h[0x0] = c0; \
+		sc->h[0x1] = c1; \
+		sc->h[0x2] = c2; \
+		sc->h[0x3] = c3; \
+		sc->h[0x4] = c4; \
+		sc->h[0x5] = c5; \
+		sc->h[0x6] = c6; \
+		sc->h[0x7] = c7; \
+		sc->h[0x8] = c8; \
+		sc->h[0x9] = c9; \
+		sc->h[0xA] = cA; \
+		sc->h[0xB] = cB; \
+		sc->h[0xC] = cC; \
+		sc->h[0xD] = cD; \
+		sc->h[0xE] = cE; \
+		sc->h[0xF] = cF; \
+	} while (0)
+
+#define s00   m0 /* s00..s1F name the 32 working-state words: aliases of the 16 message words m0..mF and the 16 chain words c0..cF */
+#define s01   m1
+#define s02   c0
+#define s03   c1
+#define s04   m2
+#define s05   m3
+#define s06   c2
+#define s07   c3
+#define s08   c4
+#define s09   c5
+#define s0A   m4
+#define s0B   m5
+#define s0C   c6
+#define s0D   c7
+#define s0E   m6
+#define s0F   m7
+#define s10   m8
+#define s11   m9
+#define s12   c8
+#define s13   c9
+#define s14   mA
+#define s15   mB
+#define s16   cA
+#define s17   cB
+#define s18   cC
+#define s19   cD
+#define s1A   mC
+#define s1B   mD
+#define s1C   cE
+#define s1D   cF
+#define s1E   mE
+#define s1F   mF
+
+#define ROUND_BIG(rc, alpha) /* one round on s00..s1F; rc is the round index, alpha a 32-word constant table */ \
+do { /* step 1: XOR alpha[i] into every state word; rc is additionally XORed into s01 */ \
+   s00 = _mm_xor_si128( s00, _mm_set1_epi32( alpha[ 0x00 ] ) ); \
+   s01 = _mm_xor_si128( s01, _mm_xor_si128( _mm_set1_epi32( alpha[ 0x01 ] ), \
+                                            _mm_set1_epi32( rc ) ) ); \
+   s02 = _mm_xor_si128( s02, _mm_set1_epi32( alpha[ 0x02 ] ) ); \
+   s03 = _mm_xor_si128( s03, _mm_set1_epi32( alpha[ 0x03 ] ) ); \
+   s04 = _mm_xor_si128( s04, _mm_set1_epi32( alpha[ 0x04 ] ) ); \
+   s05 = _mm_xor_si128( s05, _mm_set1_epi32( alpha[ 0x05 ] ) ); \
+   s06 = _mm_xor_si128( s06, _mm_set1_epi32( alpha[ 0x06 ] ) ); \
+   s07 = _mm_xor_si128( s07, _mm_set1_epi32( alpha[ 0x07 ] ) ); \
+   s08 = _mm_xor_si128( s08, _mm_set1_epi32( alpha[ 0x08 ] ) ); \
+   s09 = _mm_xor_si128( s09, _mm_set1_epi32( alpha[ 0x09 ] ) ); \
+   s0A = _mm_xor_si128( s0A, _mm_set1_epi32( alpha[ 0x0A ] ) ); \
+   s0B = _mm_xor_si128( s0B, _mm_set1_epi32( alpha[ 0x0B ] ) ); \
+   s0C = _mm_xor_si128( s0C, _mm_set1_epi32( alpha[ 0x0C ] ) ); \
+   s0D = _mm_xor_si128( s0D, _mm_set1_epi32( alpha[ 0x0D ] ) ); \
+   s0E = _mm_xor_si128( s0E, _mm_set1_epi32( alpha[ 0x0E ] ) ); \
+   s0F = _mm_xor_si128( s0F, _mm_set1_epi32( alpha[ 0x0F ] ) ); \
+   s10 = _mm_xor_si128( s10, _mm_set1_epi32( alpha[ 0x10 ] ) ); \
+   s11 = _mm_xor_si128( s11, _mm_set1_epi32( alpha[ 0x11 ] ) ); \
+   s12 = _mm_xor_si128( s12, _mm_set1_epi32( alpha[ 0x12 ] ) ); \
+   s13 = _mm_xor_si128( s13, _mm_set1_epi32( alpha[ 0x13 ] ) ); \
+   s14 = _mm_xor_si128( s14, _mm_set1_epi32( alpha[ 0x14 ] ) ); \
+   s15 = _mm_xor_si128( s15, _mm_set1_epi32( alpha[ 0x15 ] ) ); \
+   s16 = _mm_xor_si128( s16, _mm_set1_epi32( alpha[ 0x16 ] ) ); \
+   s17 = _mm_xor_si128( s17, _mm_set1_epi32( alpha[ 0x17 ] ) ); \
+   s18 = _mm_xor_si128( s18, _mm_set1_epi32( alpha[ 0x18 ] ) ); \
+   s19 = _mm_xor_si128( s19, _mm_set1_epi32( alpha[ 0x19 ] ) ); \
+   s1A = _mm_xor_si128( s1A, _mm_set1_epi32( alpha[ 0x1A ] ) ); \
+   s1B = _mm_xor_si128( s1B, _mm_set1_epi32( alpha[ 0x1B ] ) ); \
+   s1C = _mm_xor_si128( s1C, _mm_set1_epi32( alpha[ 0x1C ] ) ); \
+   s1D = _mm_xor_si128( s1D, _mm_set1_epi32( alpha[ 0x1D ] ) ); \
+   s1E = _mm_xor_si128( s1E, _mm_set1_epi32( alpha[ 0x1E ] ) ); \
+   s1F = _mm_xor_si128( s1F, _mm_set1_epi32( alpha[ 0x1F ] ) ); \
+   SBOX( s00, s08, s10, s18); /* step 2: SBOX on the eight groups ( s0i, s0(i+8), s1i, s1(i+8) ) */ \
+   SBOX( s01, s09, s11, s19); \
+   SBOX( s02, s0A, s12, s1A); \
+   SBOX( s03, s0B, s13, s1B); \
+   SBOX( s04, s0C, s14, s1C); \
+   SBOX( s05, s0D, s15, s1D); \
+   SBOX( s06, s0E, s16, s1E); \
+   SBOX( s07, s0F, s17, s1F); \
+   L( s00, s09, s12, s1B ); /* step 3: twelve applications of L over different word groupings */ \
+   L( s01, s0A, s13, s1C ); \
+   L( s02, s0B, s14, s1D ); \
+   L( s03, s0C, s15, s1E ); \
+   L( s04, s0D, s16, s1F ); \
+   L( s05, s0E, s17, s18 ); \
+   L( s06, s0F, s10, s19 ); \
+   L( s07, s08, s11, s1A ); \
+   L( s00, s02, s05, s07 ); \
+   L( s10, s13, s15, s16 ); \
+   L( s09, s0B, s0C, s0E ); \
+   L( s19, s1A, s1C, s1F ); \
+} while (0)
+
+#define P_BIG   do { /* six rounds, indices 0..5, with the alpha_n constants; used by hamsi_big */ \
+		ROUND_BIG(0, alpha_n); \
+		ROUND_BIG(1, alpha_n); \
+		ROUND_BIG(2, alpha_n); \
+		ROUND_BIG(3, alpha_n); \
+		ROUND_BIG(4, alpha_n); \
+		ROUND_BIG(5, alpha_n); \
+	} while (0)
+
+#define PF_BIG   do { /* twelve rounds, indices 0..11, with the alpha_f constants; used by hamsi_big_final */ \
+		ROUND_BIG(0, alpha_f); \
+		ROUND_BIG(1, alpha_f); \
+		ROUND_BIG(2, alpha_f); \
+		ROUND_BIG(3, alpha_f); \
+		ROUND_BIG(4, alpha_f); \
+		ROUND_BIG(5, alpha_f); \
+		ROUND_BIG(6, alpha_f); \
+		ROUND_BIG(7, alpha_f); \
+		ROUND_BIG(8, alpha_f); \
+		ROUND_BIG(9, alpha_f); \
+		ROUND_BIG(10, alpha_f); \
+		ROUND_BIG(11, alpha_f); \
+	} while (0)
+
+#define T_BIG /* new chain words c0..cF = sc->h[] XOR selected state words; reads sc->h[], not the old c0..cF */ \
+do { /* order is important: several sXX on the right alias c0..cF, so each c is read before it is overwritten */ \
+   cF = _mm_xor_si128( sc->h[ 0xF ], s17 ); \
+   cE = _mm_xor_si128( sc->h[ 0xE ], s16 ); \
+   cD = _mm_xor_si128( sc->h[ 0xD ], s15 ); \
+   cC = _mm_xor_si128( sc->h[ 0xC ], s14 ); \
+   cB = _mm_xor_si128( sc->h[ 0xB ], s13 ); \
+   cA = _mm_xor_si128( sc->h[ 0xA ], s12 ); \
+   c9 = _mm_xor_si128( sc->h[ 0x9 ], s11 ); \
+   c8 = _mm_xor_si128( sc->h[ 0x8 ], s10 ); \
+   c7 = _mm_xor_si128( sc->h[ 0x7 ], s07 ); \
+   c6 = _mm_xor_si128( sc->h[ 0x6 ], s06 ); \
+   c5 = _mm_xor_si128( sc->h[ 0x5 ], s05 ); \
+   c4 = _mm_xor_si128( sc->h[ 0x4 ], s04 ); \
+   c3 = _mm_xor_si128( sc->h[ 0x3 ], s03 ); \
+   c2 = _mm_xor_si128( sc->h[ 0x2 ], s02 ); \
+   c1 = _mm_xor_si128( sc->h[ 0x1 ], s01 ); \
+   c0 = _mm_xor_si128( sc->h[ 0x0 ], s00 ); \
+} while (0)
+
+void hamsi_big( hamsi_4way_big_context *sc, __m128i *buf, size_t num ) // absorb num blocks; each block is 2 vectors of buf
+{
+   DECL_STATE_BIG
+   sph_u32 tmp;
+
+   tmp = SPH_T32( (sph_u32)num << 6 ); // bits absorbed by this call: 64 per block
+   sc->count_low = SPH_T32( sc->count_low + tmp );
+   sc->count_high += (sph_u32)( (num >> 13) >> 13 ); // the part of num << 6 above bit 31
+   if ( sc->count_low < tmp ) // count_low wrapped: carry into count_high
+      sc->count_high++;
+
+   READ_STATE_BIG( sc );
+
+   while ( num-- > 0 )
+   {
+      __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+      __m128i m8, m9, mA, mB, mC, mD, mE, mF;
+
+      INPUT_BIG; // NOTE(review): defined outside this file section; presumably expands buf into m0..mF
+      P_BIG;
+      T_BIG;
+
+// T_BIG XORs against sc->h[] rather than the local c0..cF, so sc->h[] must be
+// refreshed after every block for the next T_BIG. SPH has no such write here.
+      WRITE_STATE_BIG( sc );
+      buf += 2;
+   }
+   WRITE_STATE_BIG( sc );
+}
+
+void hamsi_big_final( hamsi_4way_big_context *sc, __m128i *buf ) // process one closing block; the bit counters are not touched
+{
+   __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+   __m128i m8, m9, mA, mB, mC, mD, mE, mF;
+   DECL_STATE_BIG
+
+   READ_STATE_BIG( sc );
+   INPUT_BIG; // NOTE(review): defined outside this file section; presumably expands buf into m0..mF
+   PF_BIG; // 12 rounds with alpha_f, where hamsi_big runs P_BIG (6 rounds, alpha_n)
+   T_BIG;
+   WRITE_STATE_BIG( sc );
+}
+
+void hamsi_big_init( hamsi_4way_big_context *sc, const sph_u32 *iv ) // iv must hold at least 16 words
+{
+   sc->partial_len = 0; // nothing buffered
+   sc->count_high = sc->count_low = 0; // no bits absorbed yet
+   for ( int i = 0; i < 16; i ++ )
+      sc->h[i] = _mm_set1_epi32( iv[i] ); // the same IV word in all four 32-bit lanes
+}
+
+void hamsi_big_core( hamsi_4way_big_context *sc, const void *data, size_t len )
+{
+   // Absorb len bytes per lane; one __m128i of data holds 4 bytes of each lane.
+   __m128i *vdata = (__m128i*)data;
+   if ( sc->partial_len != 0 )
+   {
+      // Complete the buffered 8-byte block before taking whole blocks.
+      size_t mlen = 8 - sc->partial_len;
+      if ( len < mlen )
+      {
+         memcpy_128( sc->partial + (sc->partial_len >> 2), data, len>>2 );
+         sc->partial_len += len;
+         return;
+      }
+      else
+      {
+         memcpy_128( sc->partial + (sc->partial_len >> 2), data, mlen>>2 );
+         len -= mlen;
+         vdata += mlen>>2;
+         hamsi_big( sc, sc->partial, 1 );
+         sc->partial_len = 0;
+      }
+   }
+
+   hamsi_big( sc, vdata, len>>3 );
+   vdata += ( (len& ~(size_t)7) >> 2 );
+   len &= (size_t)7;
+   memcpy_128( sc->partial, vdata, len>>2 );
+   sc->partial_len = len;  // record the buffered tail for the next call and for close
+}
+
+void hamsi_big_close( hamsi_4way_big_context *sc, void *dst,
+                      size_t out_size_w32 )   // out_size_w32 is currently unused
+{
+   // Pad and absorb the buffered tail, run the final block over the bit
+   // count, then write all 16 state vectors to dst, byte swapped.
+   __m128i pad[2];
+   __m128i *out = (__m128i*)dst;
+   size_t ptr = sc->partial_len;
+
+   // Length block: total bit count, taken before the padding is absorbed.
+   pad[0] = mm_byteswap_32( _mm_set1_epi32( sc->count_high ) );
+   pad[1] = mm_byteswap_32( _mm_set1_epi32( sc->count_low + (ptr << 3) ) );
+
+   sc->partial[ ptr>>2 ] = _mm_set1_epi32( 0x80UL );
+   // partial[] has only 2 entries, so only partial[1] can follow the pad
+   // word. Zeroing (8-ptr)>>2 vectors from there ran past the array.
+   if ( ptr < 4 )
+      memset_zero_128( sc->partial + 1, 1 );
+   hamsi_big( sc, sc->partial, 1 );
+   hamsi_big_final( sc, pad );
+   for ( size_t u = 0; u < 16; u ++ )
+      out[u] = mm_byteswap_32( sc->h[u] );
+}
+
+void hamsi512_4way_init( void *cc ) // cc is handed to hamsi_big_init as a hamsi_4way_big_context, with IV512
+{
+	hamsi_big_init( cc, IV512 );
+}
+
+void hamsi512_4way( void *cc, const void *data, size_t len ) // forwards all arguments unchanged to hamsi_big_core
+{
+	hamsi_big_core( cc, data, len );
+}
+
+void hamsi512_4way_close( void *cc, void *dst ) // dst receives 16 __m128i values from hamsi_big_close
+{
+	hamsi_big_close( cc, dst, 16 );
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -0,0 +1,72 @@
+/* $Id: sph_hamsi.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * Hamsi interface. This code implements Hamsi with the recommended
+ * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_hamsi.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef HAMSI_4WAY_H__
+#define HAMSI_4WAY_H__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+
+#if defined (__AVX__)
+
+#include "avxdefs.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#define SPH_SIZE_hamsi512   512
+
+typedef struct { // state for four Hamsi-512 hashes computed side by side, one per 32-bit lane
+   __m128i h[16]; // 16 chaining words
+   __m128i partial[2]; // input not yet forming a full block: up to 8 bytes per lane
+   size_t partial_len; // bytes per lane currently held in partial
+   sph_u32 count_high, count_low; // bit count of absorbed input, as two 32-bit halves
+} hamsi_4way_big_context;
+
+typedef hamsi_4way_big_context hamsi512_4way_context;
+
+void hamsi512_4way_init(void *cc);
+
+void hamsi512_4way(void *cc, const void *data, size_t len);
+
+void hamsi512_4way_close(void *cc, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
--- a/algo/hamsi/hamsi-helper-4way.c
+++ b/algo/hamsi/hamsi-helper-4way.c
@@ -0,0 +1,482 @@
+/* $Id: hamsi_helper.c 202 2010-05-31 15:46:48Z tp $ */
+/*
+ * Helper code for Hamsi (input block expansion). The table below is
+ * automatically generated; this 4-way version keeps only the table and
+ * expansion code which handle one bit at a time.
+ *
+ * This file is included from hamsi.c, and is not meant to be compiled
+ * independently.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/* Note: this table lists bits within each byte from least
+   significant to most significant. */
+static const sph_u32 T512[64][16] = {
+	{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
+	  SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
+	  SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
+	  SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
+	  SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
+	  SPH_C32(0x9e69af68) },
+	{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
+	  SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
+	  SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
+	  SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
+	  SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
+	  SPH_C32(0x0c26f262) },
+	{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
+	  SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
+	  SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
+	  SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
+	  SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
+	  SPH_C32(0xdc24e61f) },
+	{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
+	  SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
+	  SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
+	  SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
+	  SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
+	  SPH_C32(0x3daac2da) },
+	{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
+	  SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
+	  SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
+	  SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
+	  SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
+	  SPH_C32(0x78cace29) },
+	{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
+	  SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
+	  SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
+	  SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
+	  SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
+	  SPH_C32(0x2dd1f9ab) },
+	{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
+	  SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
+	  SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
+	  SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
+	  SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
+	  SPH_C32(0xbf2c0be2) },
+	{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
+	  SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
+	  SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
+	  SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
+	  SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
+	  SPH_C32(0x32219526) },
+	{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
+	  SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
+	  SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
+	  SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
+	  SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
+	  SPH_C32(0xac8e6c88) },
+	{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
+	  SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
+	  SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
+	  SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
+	  SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
+	  SPH_C32(0x7b1bd6b9) },
+	{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
+	  SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
+	  SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
+	  SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
+	  SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
+	  SPH_C32(0xf746c320) },
+	{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
+	  SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
+	  SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
+	  SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
+	  SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
+	  SPH_C32(0x69505b3a) },
+	{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
+	  SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
+	  SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
+	  SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
+	  SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
+	  SPH_C32(0x8a341574) },
+	{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
+	  SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
+	  SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
+	  SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
+	  SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
+	  SPH_C32(0x450360bf) },
+	{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
+	  SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
+	  SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
+	  SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
+	  SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
+	  SPH_C32(0xf3d45758) },
+	{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
+	  SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
+	  SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
+	  SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
+	  SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
+	  SPH_C32(0x925c44e9) },
+	{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
+	  SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
+	  SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
+	  SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
+	  SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
+	  SPH_C32(0xa123ff9f) },
+	{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
+	  SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
+	  SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
+	  SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
+	  SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
+	  SPH_C32(0x1568ff0f) },
+	{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
+	  SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
+	  SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
+	  SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
+	  SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
+	  SPH_C32(0xc5c1eb3e) },
+	{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
+	  SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
+	  SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
+	  SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
+	  SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
+	  SPH_C32(0x1af21fe1) },
+	{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
+	  SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
+	  SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
+	  SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
+	  SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
+	  SPH_C32(0x857f3c2b) },
+	{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
+	  SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
+	  SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
+	  SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
+	  SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
+	  SPH_C32(0x2ba05a55) },
+	{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
+	  SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
+	  SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
+	  SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
+	  SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
+	  SPH_C32(0xfeabf254) },
+	{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
+	  SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
+	  SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
+	  SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
+	  SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
+	  SPH_C32(0xfe1cdc7f) },
+	{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
+	  SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
+	  SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
+	  SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
+	  SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
+	  SPH_C32(0xb0a51834) },
+	{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
+	  SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
+	  SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
+	  SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
+	  SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
+	  SPH_C32(0xa6b8c28d) },
+	{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
+	  SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
+	  SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
+	  SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
+	  SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
+	  SPH_C32(0x3a4e99d7) },
+	{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
+	  SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
+	  SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
+	  SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
+	  SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
+	  SPH_C32(0xe1844257) },
+	{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
+	  SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
+	  SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
+	  SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
+	  SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
+	  SPH_C32(0x2c3b504e) },
+	{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
+	  SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
+	  SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
+	  SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
+	  SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
+	  SPH_C32(0x524a0d59) },
+	{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
+	  SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
+	  SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
+	  SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
+	  SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
+	  SPH_C32(0x378dd173) },
+	{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
+	  SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
+	  SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
+	  SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
+	  SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
+	  SPH_C32(0x8b6c72bd) },
+	{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
+	  SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
+	  SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
+	  SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
+	  SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
+	  SPH_C32(0x8e67b7fa) },
+	{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
+	  SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
+	  SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
+	  SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
+	  SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
+	  SPH_C32(0x443d3004) },
+	{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
+	  SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
+	  SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
+	  SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
+	  SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
+	  SPH_C32(0xf4f6ea7b) },
+	{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
+	  SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
+	  SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
+	  SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
+	  SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
+	  SPH_C32(0x979961d0) },
+	{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
+	  SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
+	  SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
+	  SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
+	  SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
+	  SPH_C32(0x98aa496e) },
+	{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
+	  SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
+	  SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
+	  SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
+	  SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
+	  SPH_C32(0x094e3198) },
+	{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
+	  SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
+	  SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
+	  SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
+	  SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
+	  SPH_C32(0xe86cba2e) },
+	{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
+	  SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
+	  SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
+	  SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
+	  SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
+	  SPH_C32(0x4b7eec55) },
+	{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
+	  SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
+	  SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
+	  SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
+	  SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
+	  SPH_C32(0x1e7536a6) },
+	{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
+	  SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
+	  SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
+	  SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
+	  SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
+	  SPH_C32(0x24314f17) },
+	{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
+	  SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
+	  SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
+	  SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
+	  SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
+	  SPH_C32(0x9075b1ce) },
+	{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
+	  SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
+	  SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
+	  SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
+	  SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
+	  SPH_C32(0x9b6ef888) },
+	{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
+	  SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
+	  SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
+	  SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
+	  SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
+	  SPH_C32(0xd8b61463) },
+	{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
+	  SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
+	  SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
+	  SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
+	  SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
+	  SPH_C32(0x3ea660f7) },
+	{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
+	  SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
+	  SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
+	  SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
+	  SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
+	  SPH_C32(0x7f975691) },
+	{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
+	  SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
+	  SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
+	  SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
+	  SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
+	  SPH_C32(0x2c94459e) },
+	{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
+	  SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
+	  SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
+	  SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
+	  SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
+	  SPH_C32(0x56a7b19f) },
+	{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
+	  SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
+	  SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
+	  SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
+	  SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
+	  SPH_C32(0x81fdf908) },
+	{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
+	  SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
+	  SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
+	  SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
+	  SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
+	  SPH_C32(0x5bd61539) },
+	{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
+	  SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
+	  SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
+	  SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
+	  SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
+	  SPH_C32(0x15b961e7) },
+	{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
+	  SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
+	  SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
+	  SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
+	  SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
+	  SPH_C32(0x2a2c18f0) },
+	{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
+	  SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
+	  SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
+	  SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
+	  SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
+	  SPH_C32(0x551e3d6e) },
+	{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
+	  SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
+	  SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
+	  SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
+	  SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
+	  SPH_C32(0x33c5244f) },
+	{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
+	  SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
+	  SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
+	  SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
+	  SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
+	  SPH_C32(0x8a58e6a4) },
+	{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
+	  SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
+	  SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
+	  SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
+	  SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
+	  SPH_C32(0xda878000) },
+	{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
+	  SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
+	  SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
+	  SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
+	  SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
+	  SPH_C32(0x3c5dfffe) },
+	{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
+	  SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
+	  SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
+	  SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
+	  SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
+	  SPH_C32(0x7b1675d7) },
+	{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
+	  SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
+	  SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
+	  SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
+	  SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
+	  SPH_C32(0x2879ebac) },
+	{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
+	  SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
+	  SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
+	  SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
+	  SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
+	  SPH_C32(0xbe0a679e) },
+	{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
+	  SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
+	  SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
+	  SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
+	  SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
+	  SPH_C32(0x30aebcf7) },
+	{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
+	  SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
+	  SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
+	  SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
+	  SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
+	  SPH_C32(0xc7ff60f0) },
+	{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
+	  SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
+	  SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
+	  SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
+	  SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
+	  SPH_C32(0xe7e00a94) }
+};
+
+/* U_BIG( n ): for each bit of buf[n], LSB first, XOR the next 16 T512 words */
+/* (read through tp) into m0..mF in the lanes where that bit is set.  dm is  */
+/* presumably the 0 / all-ones lane mask.  Expands to one statement, no ';'. */
+#define U_BIG( n ) do { \
+  __m128i db = buf[n]; \
+  for ( int u = 0; u < 32; u++, db = _mm_srli_epi32( db, 1 ) ) { \
+     __m128i dm = mm_negate_32( _mm_and_si128( db, mm_one_32 ) ); \
+     m0 = _mm_xor_si128( m0, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     m1 = _mm_xor_si128( m1, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     m2 = _mm_xor_si128( m2, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     m3 = _mm_xor_si128( m3, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     m4 = _mm_xor_si128( m4, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     m5 = _mm_xor_si128( m5, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     m6 = _mm_xor_si128( m6, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     m7 = _mm_xor_si128( m7, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     m8 = _mm_xor_si128( m8, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     m9 = _mm_xor_si128( m9, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     mA = _mm_xor_si128( mA, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     mB = _mm_xor_si128( mB, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     mC = _mm_xor_si128( mC, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     mD = _mm_xor_si128( mD, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     mE = _mm_xor_si128( mE, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+     mF = _mm_xor_si128( mF, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
+  } \
+} while (0)
+
+/*
+ * INPUT_BIG: clears the sixteen accumulators m0..mF, then lets U_BIG fold
+ * in the T512 words picked by the bits of buf[0] and buf[1].  tp is shared
+ * by both U_BIG calls, so the second resumes where the first stopped.
+ * buf and m0..mF must be declared by the code that uses this macro.
+ */
+#define INPUT_BIG \
+do { \
+  const sph_u32 *tp = T512[0]; \
+  /* start every accumulator from zero */ \
+  m0 = mm_zero;  m1 = mm_zero; \
+  m2 = mm_zero;  m3 = mm_zero; \
+  m4 = mm_zero;  m5 = mm_zero; \
+  m6 = mm_zero;  m7 = mm_zero; \
+  m8 = mm_zero;  m9 = mm_zero; \
+  mA = mm_zero;  mB = mm_zero; \
+  mC = mm_zero;  mD = mm_zero; \
+  mE = mm_zero;  mF = mm_zero; \
+  /* one U_BIG per 32-bit message word */ \
+  U_BIG( 0 ); \
+  U_BIG( 1 ); \
+} while (0)
+
+#ifdef __cplusplus
+}
+#endif
--- a/algo/haval/haval-4way-helper.c
+++ b/algo/haval/haval-4way-helper.c
@@ -0,0 +1,115 @@
+/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
+/*
+ * Helper code, included (three times !) by HAVAL implementation.
+ *
+ * TODO: try to merge this with md_helper.c.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#undef SPH_XCAT
+#define SPH_XCAT(a, b)    SPH_XCAT_(a, b)
+#undef SPH_XCAT_
+#define SPH_XCAT_(a, b)   a ## b
+
+// Absorb len bytes of input.  data is read as __m128i vectors, one vector
+// per 4 bytes of len; every filled 128-byte block is run through CORE.
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _4way)
+( haval_4way_context *sc, const void *data, size_t len )
+{
+   __m128i *src = (__m128i*)data;
+   unsigned fill = (unsigned)sc->count_low & 127U;  // bytes already buffered
+
+   while ( len != 0 )
+   {
+      // Never take more than what completes the current 128-byte block.
+      const unsigned room = 128U - fill;
+      const unsigned take = ( room > len ) ? (unsigned)len : room;
+
+      memcpy_128( sc->buf + (fill>>2), src, take>>2 );
+      src  += take>>2;
+      fill += take;
+      len  -= take;
+      if ( fill == 128U )
+      {
+         DSTATE;
+         IN_PREPARE(sc->buf);
+         RSTATE;
+         SPH_XCAT(CORE, PASSES)(INW);
+         WSTATE;
+         fill = 0;
+      }
+      // Advance the two-word byte counter, carrying on 32-bit wrap.
+      const sph_u32 before = sc->count_low;
+      const sph_u32 after = SPH_T32(before + take);
+      sc->count_low = after;
+      if ( after < before )
+         sc->count_high ++;
+   }
+}
+
+// Finish the hash: append the 0x01 pad word, zero fill, the trailer word
+// (version/passes in byte 118, olen<<3 in byte 119) and the bit length,
+// compress the last block(s) and hand the state to haval_4way_out().
+// When the buffered length is 116 bytes the pad byte shares word 29 with
+// the trailer, so no extra block is used (HAVAL pads to 118 mod 128).
+// Assumes the buffered byte count is a multiple of 4 -- TODO confirm.
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
+                                                void *dst)
+{
+   // Index of the first unused 32-bit word in the 32-word block buffer.
+   unsigned widx = ( (unsigned)sc->count_low & 127U ) >> 2;
+   DSTATE;
+
+   sc->buf[ widx++ ] = mm_one_32;
+   RSTATE;
+   if ( widx > 30 )   // word 29 already holds message data: spill over
+   {
+      memset_zero_128( sc->buf + widx, 32 - widx );
+      { IN_PREPARE(sc->buf); SPH_XCAT(CORE, PASSES)(INW); }
+      widx = 0;
+   }
+
+   const uint32_t t1 = 0x01 | (PASSES << 3);
+   const uint32_t t2 = sc->olen << 3;
+   uint32_t tail = ( t1 << 16 ) | ( t2 << 24 );
+   if ( widx == 30 )
+      tail |= 1;     // word 29 is the pad word: keep its 0x01 byte
+   else
+      memset_zero_128( sc->buf + widx, 29 - widx );
+   sc->buf[ 116>>2 ] = _mm_set1_epi32( tail );
+   // 64-bit message length in bits, low word first.
+   sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 );
+   sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3)
+                                     | (sc->count_low >> 29) );
+   { IN_PREPARE(sc->buf); SPH_XCAT(CORE, PASSES)(INW); }
+   WSTATE;
+   haval_4way_out( sc, dst );
+}
--- a/algo/haval/haval-hash-4way.c
+++ b/algo/haval/haval-hash-4way.c
@@ -0,0 +1,522 @@
+/* $Id: haval.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * HAVAL implementation.
+ *
+ * The HAVAL reference paper is of questionable clarity with regards to
+ * some details such as endianness of bits within a byte, bytes within
+ * a 32-bit word, or the actual ordering of words within a stream of
+ * words. This implementation has been made compatible with the reference
+ * implementation available on: http://labs.calyptix.com/haval.php
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include "haval-hash-4way.h"
+
+#if defined (__AVX__)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAVAL
+#define SPH_SMALL_FOOTPRINT_HAVAL   1
+//#endif
+
+/*
+ * HAVAL boolean function F1 on __m128i operands (bitwise):
+ *   F1 = x0 ^ ((x0 ^ x4) & x1) ^ (x2 & x5) ^ (x3 & x6)
+ * x0 is expanded twice, so arguments must be free of side effects.
+ * The definition ends without a trailing line continuation so that it
+ * does not depend on the blank line that follows it.
+ */
+#define F1(x6, x5, x4, x3, x2, x1, x0) \
+   _mm_xor_si128( x0, \
+       _mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \
+                      _mm_xor_si128( _mm_and_si128( x2, x5 ), \
+                                     _mm_and_si128( x3, x6 ) ) ) )
+
+/*
+ * HAVAL boolean function F2 on __m128i operands (bitwise):
+ *   F2 = (x2 & ((x1 & ~x3) ^ (x4 & x5) ^ x6 ^ x0))
+ *        ^ (x4 & (x1 ^ x5)) ^ (x3 & x5) ^ x0
+ * Several arguments are expanded more than once, so they must be free
+ * of side effects. The definition ends without a trailing line
+ * continuation so that it does not depend on the blank line after it.
+ */
+#define F2(x6, x5, x4, x3, x2, x1, x0) \
+   _mm_xor_si128( \
+      _mm_and_si128( x2, \
+         _mm_xor_si128( _mm_andnot_si128( x3, x1 ), \
+                        _mm_xor_si128( _mm_and_si128( x4, x5 ), \
+                                       _mm_xor_si128( x6, x0 ) ) ) ), \
+         _mm_xor_si128( \
+             _mm_and_si128( x4, _mm_xor_si128( x1, x5 ) ), \
+             _mm_xor_si128( _mm_and_si128( x3, x5 ), x0 ) ) )
+
+/*
+ * HAVAL boolean function F3 on __m128i operands (bitwise):
+ *   F3 = (x3 & ((x1 & x2) ^ x6 ^ x0)) ^ (x1 & x4) ^ (x2 & x5) ^ x0
+ * Several arguments are expanded more than once, so they must be free
+ * of side effects.
+ */
+#define F3(x6, x5, x4, x3, x2, x1, x0) \
+  _mm_xor_si128( \
+    _mm_and_si128( x3, \
+      _mm_xor_si128( _mm_and_si128( x1, x2 ), \
+                     _mm_xor_si128( x6, x0 ) ) ), \
+      _mm_xor_si128( _mm_xor_si128(_mm_and_si128( x1, x4 ), \
+                                   _mm_and_si128( x2, x5 ) ), x0 ) )
+
+/*
+ * HAVAL boolean function F4 on __m128i operands (bitwise):
+ *   F4 = (x3 & ((x1 & x2) ^ (x4 | x6) ^ x5))
+ *        ^ (x4 & ((~x2 & x5) ^ x1 ^ x6 ^ x0))
+ *        ^ (x2 & x6) ^ x0
+ * The (~x2 & x5) term uses _mm_andnot_si128, which computes
+ * (~first) & second, the same way F2 forms its complemented term.
+ * Several arguments are expanded more than once, so they must be free
+ * of side effects.
+ */
+#define F4(x6, x5, x4, x3, x2, x1, x0) \
+  _mm_xor_si128( \
+     _mm_xor_si128( \
+        _mm_and_si128( x3, \
+           _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \
+                                         _mm_or_si128( x4, x6 ) ), x5 ) ), \
+        _mm_and_si128( x4, \
+           _mm_xor_si128( _mm_xor_si128( _mm_andnot_si128( x2, x5 ), \
+                          _mm_xor_si128( x1, x6 ) ), x0 ) ) ), \
+     _mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) )
+
+
+/*
+ * HAVAL boolean function F5 on __m128i operands (bitwise):
+ *   F5 = (x0 & ~((x1 & x2 & x3) ^ x5))
+ *        ^ (x1 & x4) ^ (x2 & x5) ^ (x3 & x6)
+ * The complemented term uses _mm_andnot_si128, which computes
+ * (~first) & second, the same way F2 forms its complemented term.
+ * Several arguments are expanded more than once, so they must be free
+ * of side effects.
+ */
+#define F5(x6, x5, x4, x3, x2, x1, x0) \
+   _mm_xor_si128( \
+       _mm_andnot_si128( \
+            _mm_xor_si128( \
+                    _mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ), \
+            x0 ), \
+      _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
+                                    _mm_and_si128( x2, x5 ) ), \
+                                    _mm_and_si128( x3, x6 ) ) )
+
+/*
+ * The macros below integrate the phi() permutations, depending on the
+ * pass and the total number of passes.
+ */
+
+/*
+ * FPn_p(x6..x0) reorders its seven arguments and forwards them to Fp;
+ * "n" is the total number of passes and "p" the current pass. STEP()
+ * selects the macro by pasting n and p into the name. With only CORE5
+ * left active in this file, the FP5_* set is the one reached through
+ * CORE5.
+ */
+
+/* Three passes in total. */
+#define FP3_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x1, x0, x3, x5, x6, x2, x4)
+#define FP3_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x4, x2, x1, x0, x5, x3, x6)
+#define FP3_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x6, x1, x2, x3, x4, x5, x0)
+
+/* Four passes in total. */
+#define FP4_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x2, x6, x1, x4, x5, x3, x0)
+#define FP4_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x3, x5, x2, x0, x1, x6, x4)
+#define FP4_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x1, x4, x3, x6, x0, x2, x5)
+#define FP4_4(x6, x5, x4, x3, x2, x1, x0) \
+	F4(x6, x4, x0, x5, x2, x1, x3)
+
+/* Five passes in total. */
+#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x3, x4, x1, x0, x5, x2, x6)
+#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x6, x2, x1, x0, x3, x4, x5)
+#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x2, x6, x0, x4, x3, x1, x5)
+#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \
+	F4(x1, x5, x3, x2, x0, x4, x6)
+#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \
+	F5(x2, x5, x0, x6, x4, x3, x1)
+
+/*
+ * One step, for "n" passes, pass number "p" (1 <= p <= n), using
+ * input word number "w" and step constant "c".
+ *
+ * The step computes t = FPn_p(x6..x0) and then overwrites x7 with
+ *   rotr32(t, 7) + rotr32(x7, 11) + w + c
+ * using 32-bit element-wise additions. "c" is a scalar that is
+ * broadcast with _mm_set1_epi32; "w" is already an __m128i. x7 is both
+ * read and assigned, so it must name a variable. The temporary "t" is
+ * local to the do/while block.
+ */
+#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
+do { \
+   __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
+   x7 = _mm_add_epi32( _mm_add_epi32( mm_rotr_32( t, 7 ), \
+                                      mm_rotr_32( x7, 11 ) ), \
+                       _mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
+} while (0)
+
+/*
+ * PASSy(n, in) computes pass number "y", for a total of "n", using the
+ * one-argument macro "in" to access input words. Current state is assumed
+ * to be held in variables "s0" to "s7".
+ */
+
+//#if SPH_SMALL_FOOTPRINT_HAVAL
+
+/*
+ * PASS1 runs the 32 steps of the first pass as 4 loop iterations of 8
+ * steps. Within an iteration the roles of s0..s7 rotate by one position
+ * per step, so that after 8 steps every state variable has been updated
+ * once. Step k reads message word in(k) and uses the constant 0.
+ */
+#define PASS1(n, in)   do { \
+		unsigned pass_count; \
+		for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+			STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
+				in(pass_count + 0), SPH_C32(0x00000000)); \
+			STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
+				in(pass_count + 1), SPH_C32(0x00000000)); \
+			STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
+				in(pass_count + 2), SPH_C32(0x00000000)); \
+			STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
+				in(pass_count + 3), SPH_C32(0x00000000)); \
+			STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
+				in(pass_count + 4), SPH_C32(0x00000000)); \
+			STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
+				in(pass_count + 5), SPH_C32(0x00000000)); \
+			STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
+				in(pass_count + 6), SPH_C32(0x00000000)); \
+			STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
+				in(pass_count + 7), SPH_C32(0x00000000)); \
+   		} \
+	} while (0)
+
+/*
+ * PASSG is the generic form used for passes 2 to 5. It has the same
+ * 4 x 8 step layout and state rotation as PASS1, but step k reads
+ * message word in(MPp[k]) and adds the constant RKp[k], where the
+ * tables MPp and RKp are selected by pasting the pass number "p" into
+ * their names.
+ */
+#define PASSG(p, n, in)   do { \
+		unsigned pass_count; \
+		for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+			STEP(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
+				in(MP ## p[pass_count + 0]), \
+				RK ## p[pass_count + 0]); \
+			STEP(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
+				in(MP ## p[pass_count + 1]), \
+				RK ## p[pass_count + 1]); \
+			STEP(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
+				in(MP ## p[pass_count + 2]), \
+				RK ## p[pass_count + 2]); \
+			STEP(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
+				in(MP ## p[pass_count + 3]), \
+				RK ## p[pass_count + 3]); \
+			STEP(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
+				in(MP ## p[pass_count + 4]), \
+				RK ## p[pass_count + 4]); \
+			STEP(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
+				in(MP ## p[pass_count + 5]), \
+				RK ## p[pass_count + 5]); \
+			STEP(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
+				in(MP ## p[pass_count + 6]), \
+				RK ## p[pass_count + 6]); \
+			STEP(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
+				in(MP ## p[pass_count + 7]), \
+				RK ## p[pass_count + 7]); \
+   		} \
+	} while (0)
+
+/* Passes 2 to 5 are PASSG bound to a fixed pass number. */
+#define PASS2(n, in)    PASSG(2, n, in)
+#define PASS3(n, in)    PASSG(3, n, in)
+#define PASS4(n, in)    PASSG(4, n, in)
+#define PASS5(n, in)    PASSG(5, n, in)
+
+/*
+ * Message word order for passes 2 to 5: PASSG reads in(MPp[k]) at
+ * step k of pass p. Each table has one entry per step (32 steps).
+ */
+
+/* Word order for pass 2. */
+static const unsigned MP2[32] = {
+	 5, 14, 26, 18, 11, 28,  7, 16,
+	 0, 23, 20, 22,  1, 10,  4,  8,
+	30,  3, 21,  9, 17, 24, 29,  6,
+	19, 12, 15, 13,  2, 25, 31, 27
+};
+
+/* Word order for pass 3. */
+static const unsigned MP3[32] = {
+	19,  9,  4, 20, 28, 17,  8, 22,
+	29, 14, 25, 12, 24, 30, 16, 26,
+	31, 15,  7,  3,  1,  0, 18, 27,
+	13,  6, 21, 10, 23, 11,  5,  2
+};
+
+/* Word order for pass 4. */
+static const unsigned MP4[32] = {
+	24,  4,  0, 14,  2,  7, 28, 23,
+	26,  6, 30, 20, 18, 25, 19,  3,
+	22, 11, 31, 21,  8, 27, 12,  9,
+	 1, 29,  5, 15, 17, 10, 16, 13
+};
+
+/* Word order for pass 5. */
+static const unsigned MP5[32] = {
+	27,  3, 21, 26, 17, 11, 20, 29,
+	19,  0, 12,  7, 13,  8, 31, 10,
+	 5,  9, 14, 30, 18,  6, 28, 24,
+	 2, 23, 16, 22,  4,  1, 25, 15
+};
+
+/*
+ * Step constants for passes 2 to 5: PASSG passes RKp[k] as the
+ * constant "c" of step k in pass p, where STEP broadcasts it to all
+ * 32-bit elements. Pass 1 uses the constant 0 and has no table.
+ */
+
+/* Step constants for pass 2. */
+static const sph_u32 RK2[32] = {
+	SPH_C32(0x452821E6), SPH_C32(0x38D01377),
+	SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
+	SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
+	SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917),
+	SPH_C32(0x9216D5D9), SPH_C32(0x8979FB1B),
+	SPH_C32(0xD1310BA6), SPH_C32(0x98DFB5AC),
+	SPH_C32(0x2FFD72DB), SPH_C32(0xD01ADFB7),
+	SPH_C32(0xB8E1AFED), SPH_C32(0x6A267E96),
+	SPH_C32(0xBA7C9045), SPH_C32(0xF12C7F99),
+	SPH_C32(0x24A19947), SPH_C32(0xB3916CF7),
+	SPH_C32(0x0801F2E2), SPH_C32(0x858EFC16),
+	SPH_C32(0x636920D8), SPH_C32(0x71574E69),
+	SPH_C32(0xA458FEA3), SPH_C32(0xF4933D7E),
+	SPH_C32(0x0D95748F), SPH_C32(0x728EB658),
+	SPH_C32(0x718BCD58), SPH_C32(0x82154AEE),
+	SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5)
+};
+
+/* Step constants for pass 3. */
+static const sph_u32 RK3[32] = {
+	SPH_C32(0x9C30D539), SPH_C32(0x2AF26013),
+	SPH_C32(0xC5D1B023), SPH_C32(0x286085F0),
+	SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF),
+	SPH_C32(0x8E79DCB0), SPH_C32(0x603A180E),
+	SPH_C32(0x6C9E0E8B), SPH_C32(0xB01E8A3E),
+	SPH_C32(0xD71577C1), SPH_C32(0xBD314B27),
+	SPH_C32(0x78AF2FDA), SPH_C32(0x55605C60),
+	SPH_C32(0xE65525F3), SPH_C32(0xAA55AB94),
+	SPH_C32(0x57489862), SPH_C32(0x63E81440),
+	SPH_C32(0x55CA396A), SPH_C32(0x2AAB10B6),
+	SPH_C32(0xB4CC5C34), SPH_C32(0x1141E8CE),
+	SPH_C32(0xA15486AF), SPH_C32(0x7C72E993),
+	SPH_C32(0xB3EE1411), SPH_C32(0x636FBC2A),
+	SPH_C32(0x2BA9C55D), SPH_C32(0x741831F6),
+	SPH_C32(0xCE5C3E16), SPH_C32(0x9B87931E),
+	SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C)
+};
+
+/* Step constants for pass 4. */
+static const sph_u32 RK4[32] = {
+	SPH_C32(0x7A325381), SPH_C32(0x28958677),
+	SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF),
+	SPH_C32(0xC4BFE81B), SPH_C32(0x66282193),
+	SPH_C32(0x61D809CC), SPH_C32(0xFB21A991),
+	SPH_C32(0x487CAC60), SPH_C32(0x5DEC8032),
+	SPH_C32(0xEF845D5D), SPH_C32(0xE98575B1),
+	SPH_C32(0xDC262302), SPH_C32(0xEB651B88),
+	SPH_C32(0x23893E81), SPH_C32(0xD396ACC5),
+	SPH_C32(0x0F6D6FF3), SPH_C32(0x83F44239),
+	SPH_C32(0x2E0B4482), SPH_C32(0xA4842004),
+	SPH_C32(0x69C8F04A), SPH_C32(0x9E1F9B5E),
+	SPH_C32(0x21C66842), SPH_C32(0xF6E96C9A),
+	SPH_C32(0x670C9C61), SPH_C32(0xABD388F0),
+	SPH_C32(0x6A51A0D2), SPH_C32(0xD8542F68),
+	SPH_C32(0x960FA728), SPH_C32(0xAB5133A3),
+	SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4)
+};
+
+/* Step constants for pass 5. */
+static const sph_u32 RK5[32] = {
+	SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98),
+	SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176),
+	SPH_C32(0x66CA593E), SPH_C32(0x82430E88),
+	SPH_C32(0x8CEE8619), SPH_C32(0x456F9FB4),
+	SPH_C32(0x7D84A5C3), SPH_C32(0x3B8B5EBE),
+	SPH_C32(0xE06F75D8), SPH_C32(0x85C12073),
+	SPH_C32(0x401A449F), SPH_C32(0x56C16AA6),
+	SPH_C32(0x4ED3AA62), SPH_C32(0x363F7706),
+	SPH_C32(0x1BFEDF72), SPH_C32(0x429B023D),
+	SPH_C32(0x37D0D724), SPH_C32(0xD00A1248),
+	SPH_C32(0xDB0FEAD3), SPH_C32(0x49F1C09B),
+	SPH_C32(0x075372C9), SPH_C32(0x80991B7B),
+	SPH_C32(0x25D479D8), SPH_C32(0xF6E8DEF7),
+	SPH_C32(0xE3FE501A), SPH_C32(0xB6794C3B),
+	SPH_C32(0x976CE0BD), SPH_C32(0x04C006BA),
+	SPH_C32(0xC1A94FB6), SPH_C32(0x409F60C4)
+};
+
+/*
+ * SAVE_STATE declares u0..u7 and copies s0..s7 into them. Because the
+ * declaration sits outside the do/while, the macro is a declaration
+ * followed by a statement: it must be expanded where a declaration is
+ * allowed and only once per scope. UPDATE_STATE reads u0..u7 later.
+ */
+#define SAVE_STATE \
+   __m128i u0, u1, u2, u3, u4, u5, u6, u7; \
+   do { \
+      u0 = s0; \
+      u1 = s1; \
+      u2 = s2; \
+      u3 = s3; \
+      u4 = s4; \
+      u5 = s5; \
+      u6 = s6; \
+      u7 = s7; \
+   } while (0)
+
+/*
+ * UPDATE_STATE adds the values saved in u0..u7 (see SAVE_STATE) back
+ * into s0..s7 with 32-bit element-wise additions. It requires that
+ * SAVE_STATE was expanded earlier in the same scope.
+ */
+#define UPDATE_STATE \
+do { \
+   s0 = _mm_add_epi32( s0, u0 ); \
+   s1 = _mm_add_epi32( s1, u1 ); \
+   s2 = _mm_add_epi32( s2, u2 ); \
+   s3 = _mm_add_epi32( s3, u3 ); \
+   s4 = _mm_add_epi32( s4, u4 ); \
+   s5 = _mm_add_epi32( s5, u5 ); \
+   s6 = _mm_add_epi32( s6, u6 ); \
+   s7 = _mm_add_epi32( s7, u7 ); \
+} while (0)
+
+/*
+ * COREn(in) performs the core HAVAL computation for "n" passes, using
+ * the one-argument macro "in" to access the input words. Running state
+ * is held in variable "s0" to "s7".
+ */
+/*
+#define CORE3(in)  do { \
+		SAVE_STATE; \
+		PASS1(3, in); \
+		PASS2(3, in); \
+		PASS3(3, in); \
+		UPDATE_STATE; \
+	} while (0)
+
+#define CORE4(in)  do { \
+		SAVE_STATE; \
+		PASS1(4, in); \
+		PASS2(4, in); \
+		PASS3(4, in); \
+		PASS4(4, in); \
+		UPDATE_STATE; \
+	} while (0)
+*/
+/*
+ * CORE5(in) processes one block with five passes: it snapshots the
+ * running state s0..s7, runs PASS1..PASS5 over the input words accessed
+ * through "in", then adds the snapshot back into the state. This is the
+ * only CORE variant left enabled in this file.
+ */
+#define CORE5(in)  do { \
+		SAVE_STATE; \
+		PASS1(5, in); \
+		PASS2(5, in); \
+		PASS3(5, in); \
+		PASS4(5, in); \
+		PASS5(5, in); \
+		UPDATE_STATE; \
+	} while (0)
+
+/*
+ * DSTATE declares the state variables "s0" to "s7" as __m128i locals.
+ * The macro carries no trailing semicolon and leaves the variables
+ * uninitialized; RSTATE fills them from the context.
+ */
+#define DSTATE   __m128i s0, s1, s2, s3, s4, s5, s6, s7
+
+/*
+ * RSTATE fills the state variables from the context "sc". It expects a
+ * pointer named "sc" and the variables declared by DSTATE to be in
+ * scope at the point of expansion.
+ */
+#define RSTATE \
+do { \
+   s0 = sc->s0; \
+   s1 = sc->s1; \
+   s2 = sc->s2; \
+   s3 = sc->s3; \
+   s4 = sc->s4; \
+   s5 = sc->s5; \
+   s6 = sc->s6; \
+   s7 = sc->s7; \
+} while (0)
+
+/*
+ * WSTATE updates the context "sc" from the state variables. It is the
+ * inverse of RSTATE and has the same scope requirements ("sc" and the
+ * variables "s0" to "s7" must be visible).
+ */
+#define WSTATE \
+do { \
+   sc->s0 = s0; \
+   sc->s1 = s1; \
+   sc->s2 = s2; \
+   sc->s3 = s3; \
+   sc->s4 = s4; \
+   sc->s5 = s5; \
+   sc->s6 = s6; \
+   sc->s7 = s7; \
+} while (0)
+
+/*
+ * Reset a 4-way HAVAL context so that a new message can be absorbed.
+ *
+ *   sc      context to initialize
+ *   olen    output length, in 32-bit words (between 4 and 8, inclusive)
+ *   passes  number of passes (3, 4 or 5)
+ *
+ * Both parameters are only recorded in the context here. The byte
+ * counters are cleared and every state word receives its initial value
+ * in all 32-bit elements.
+ */
+static void
+haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
+{
+   /* Nothing has been absorbed yet. */
+   sc->count_low  = 0;
+   sc->count_high = 0;
+
+   /* Remember the requested variant. */
+   sc->passes = passes;
+   sc->olen   = olen;
+
+   /* Initial chaining value, identical in every 32-bit element. */
+   sc->s7 = _mm_set1_epi32( 0xEC4E6C89UL );
+   sc->s6 = _mm_set1_epi32( 0x082EFA98UL );
+   sc->s5 = _mm_set1_epi32( 0x299F31D0UL );
+   sc->s4 = _mm_set1_epi32( 0xA4093822UL );
+   sc->s3 = _mm_set1_epi32( 0x03707344UL );
+   sc->s2 = _mm_set1_epi32( 0x13198A2EUL );
+   sc->s1 = _mm_set1_epi32( 0x85A308D3UL );
+   sc->s0 = _mm_set1_epi32( 0x243F6A88UL );
+}
+
+/*
+ * IN_PREPARE(indata) declares "load_ptr", a const pointer to the
+ * __m128i input words. It expands to a declaration without a trailing
+ * semicolon, so it must be used where a declaration is allowed and at
+ * most once per scope.
+ */
+#define IN_PREPARE(indata) const __m128i *const load_ptr = (indata)
+
+/* INW(i) reads input word number "i" through the pointer set up above. */
+#define INW(i)   load_ptr[ i ] 
+
+/*
+ * Write out the HAVAL state. All eight state words are always stored,
+ * one __m128i each (128 bytes in total); "olen" is not consulted here.
+ * "dst" is accessed through an __m128i pointer, so it presumably has to
+ * be 16-byte aligned -- TODO confirm against callers.
+ */
+static void
+haval_4way_out( haval_4way_context *sc, void *dst )
+{
+   /* Read the whole state first, then store it, as a snapshot copy. */
+   const __m128i words[8] = { sc->s0, sc->s1, sc->s2, sc->s3,
+                              sc->s4, sc->s5, sc->s6, sc->s7 };
+   __m128i *out = (__m128i*)dst;
+
+   for ( int i = 0; i < 8; i++ )
+      out[i] = words[i];
+}
+
+/*
+ * The main core functions inline the code with the COREx() macros. We
+ * use a helper file, included three times, which avoids code copying.
+ */
+/*
+#undef PASSES
+#define PASSES   3
+#include "haval-helper.c"
+
+#undef PASSES
+#define PASSES   4
+#include "haval-helper.c"
+*/
+
+#undef PASSES
+#define PASSES   5
+#include "haval-4way-helper.c"
+
+/* ====================================================================== */
+
+/*
+ * API(xxx, y) defines the three public entry points for HAVAL with an
+ * xxx-bit output and y passes:
+ *   haval<xxx>_<y>_4way_init   forwards to haval_4way_init with the
+ *                              output length in 32-bit words (xxx >> 5)
+ *   haval<xxx>_<y>_4way        forwards to the y-pass update function
+ *   haval<xxx>_<y>_4way_close  forwards to the y-pass close function
+ * The context is passed as void* and converted implicitly. The
+ * definition ends on the closing brace, without a trailing line
+ * continuation, so it does not depend on the blank line that follows.
+ */
+#define API(xxx, y) \
+void \
+haval ## xxx ## _ ## y ## _4way_init(void *cc) \
+{ \
+   haval_4way_init(cc, xxx >> 5, y); \
+} \
+ \
+void \
+haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \
+{ \
+   haval ## y ## _4way(cc, data, len); \
+} \
+ \
+void \
+haval ## xxx ## _ ## y ## _4way_close(void *cc, void *dst) \
+{ \
+   haval ## y ## _4way_close(cc, dst); \
+}
+
+API(256, 5)
+
+/*
+ * RVAL loads the state variables s0..s7 from an array named "val" that
+ * must be in scope at the point of expansion.
+ * NOTE(review): no use of RVAL, WVAL or INMSG is visible in this file;
+ * they look like leftovers from the scalar implementation -- confirm
+ * before removing.
+ */
+#define RVAL \
+do { \
+   s0 = val[0]; \
+   s1 = val[1]; \
+   s2 = val[2]; \
+   s3 = val[3]; \
+   s4 = val[4]; \
+   s5 = val[5]; \
+   s6 = val[6]; \
+   s7 = val[7]; \
+} while (0)
+
+/* WVAL stores the state variables s0..s7 back into the array "val". */
+#define WVAL \
+do { \
+   val[0] = s0; \
+   val[1] = s1; \
+   val[2] = s2; \
+   val[3] = s3; \
+   val[4] = s4; \
+   val[5] = s5; \
+   val[6] = s6; \
+   val[7] = s7; \
+} while (0)
+
+/* INMSG(i) reads element "i" of an array named "msg". */
+#define INMSG(i)   msg[i]
+
+#ifdef __cplusplus
+}
+#endif	
+#endif
--- a/algo/haval/haval-hash-4way.h
+++ b/algo/haval/haval-hash-4way.h
@@ -0,0 +1,95 @@
+/* $Id: sph_haval.h 218 2010-06-08 17:06:34Z tp $ */
+/**
+ * HAVAL interface.
+ *
+ * HAVAL is actually a family of 15 hash functions, depending on whether
+ * the internal computation uses 3, 4 or 5 passes, and on the output
+ * length, which is 128, 160, 192, 224 or 256 bits. This 4-way variant
+ * declares interface functions for one of them only: HAVAL-256/5,
+ * i.e. five passes and a 256-bit output. Note that HAVAL output
+ * lengths other than 256 bits are not obtained by a simple truncation
+ * of a longer result; the requested length is encoded within the
+ * padding data.
+ *
+ * HAVAL was published in: Yuliang Zheng, Josef Pieprzyk and Jennifer
+ * Seberry: "HAVAL -- a one-way hashing algorithm with variable length
+ * of output", Advances in Cryptology -- AUSCRYPT'92, Lecture Notes in
+ * Computer Science, Vol.718, pp.83-104, Springer-Verlag, 1993.
+ *
+ * This paper, and a reference implementation, are available on the
+ * Calyptix web site: http://labs.calyptix.com/haval.php
+ *
+ * The HAVAL reference paper is quite unclear on the data encoding
+ * details, i.e. endianness (both byte order within a 32-bit word, and
+ * word order within a message block). This implementation has been
+ * made compatible with the reference implementation referenced above.
+ *
+ * @warning   A collision for HAVAL-128/3 (HAVAL with three passes and
+ * 128-bit output) has been published; this function is thus considered
+ * as cryptographically broken. The status for other variants is unclear;
+ * use only with care.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     haval-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef HAVAL_HASH_4WAY_H__
+#define HAVAL_HASH_4WAY_H__
+
+#if defined(__AVX__)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_haval256_5   256
+
+/*
+ * Context for the 4-way HAVAL implementation. Data and state are kept
+ * as __m128i values that are processed with 32-bit element-wise
+ * operations.
+ */
+typedef struct {
+   __m128i buf[32];   /* pending block: 32 message words, one __m128i each */
+   __m128i s0, s1, s2, s3, s4, s5, s6, s7;   /* running state words */
+   unsigned olen, passes;   /* output length in 32-bit words; pass count */
+   sph_u32 count_high, count_low;   /* bytes absorbed, high and low 32 bits */
+} haval_4way_context;
+
+/* Context type for HAVAL-256/5; an alias of the generic 4-way context. */
+typedef haval_4way_context haval256_5_4way_context;
+
+/* Initialize "cc", which must point to a haval256_5_4way_context. */
+void haval256_5_4way_init( void *cc );
+
+/*
+ * Absorb input. "data" is read as a sequence of __m128i words and "len"
+ * is the length in bytes of each 32-bit element stream, i.e. one
+ * __m128i is consumed per 4 bytes of "len".
+ */
+void haval256_5_4way( void *cc, const void *data, size_t len );
+
+/*
+ * Finish the computation and write the result to "dst" as eight
+ * __m128i words (128 bytes).
+ */
+void haval256_5_4way_close( void *cc, void *dst );
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+#endif
--- a/algo/jh/jha-gate.c
+++ b/algo/jh/jha-gate.c
@@ -5,14 +5,13 @@ bool register_jha_algo( algo_gate_t* gate )
 {
 #if defined (JHA_4WAY)
  four_way_not_tested();
-  gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
  gate->scanhash         = (void*)&scanhash_jha_4way;
  gate->hash             = (void*)&jha_hash_4way;
 #else
-  gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
  gate->scanhash         = (void*)&scanhash_jha;
  gate->hash             = (void*)&jha_hash;
 #endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->set_target       = (void*)&scrypt_set_target;
  return true;
 };
--- a/algo/jh/jha-gate.h
+++ b/algo/jh/jha-gate.h
@@ -5,7 +5,7 @@
 #include <stdint.h>


-#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
+#if defined(__AVX2__) && defined(__AES__)
  #define JHA_4WAY
 #endif

--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -9,7 +9,7 @@ int64_t keccak_get_max64() { return 0x7ffffLL; }

 bool register_keccak_algo( algo_gate_t* gate )
 {
-  gate->optimizations = FOUR_WAY_OPT;
+  gate->optimizations = AVX2_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  gate->set_target      = (void*)&keccak_set_target;
  gate->get_max64       = (void*)&keccak_get_max64;
@@ -30,7 +30,7 @@ void keccakc_set_target( struct work* work, double job_diff )

 bool register_keccakc_algo( algo_gate_t* gate )
 {
-  gate->optimizations = FOUR_WAY_OPT;
+  gate->optimizations = AVX2_OPT;
  gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
  gate->set_target      = (void*)&keccakc_set_target;
  gate->get_max64       = (void*)&keccak_get_max64;
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX2__)
+#if defined(__AVX2__)
  #define KECCAK_4WAY
 #endif

--- a/algo/lyra2/lyra2h-gate.c
+++ b/algo/lyra2/lyra2h-gate.c
@@ -17,7 +17,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2h;
  gate->hash       = (void*)&lyra2h_hash;
 #endif
-  gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = AVX_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  gate->set_target = (void*)&lyra2h_set_target;
  return true;
--- a/algo/lyra2/lyra2h-gate.h
+++ b/algo/lyra2/lyra2h-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY)
+#if defined(__AVX2__)
  #define LYRA2H_4WAY
 #endif

--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -1,7 +1,7 @@
 #include "lyra2rev2-gate.h"
 #include <memory.h>

-#ifdef __AVX2__	
+#if defined (__AVX2__)	

 #include "algo/blake/blake-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
@@ -9,7 +9,7 @@
 #include "algo/bmw/bmw-hash-4way.h"

 #include "algo/cubehash/sph_cubehash.h"
-#include "algo/bmw/sph_bmw.h"
+//#include "algo/bmw/sph_bmw.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h" 

 typedef struct {
@@ -17,8 +17,8 @@ typedef struct {
   keccak256_4way_context    keccak;
   cubehashParam             cube;
   skein256_4way_context     skein;
-        sph_bmw256_context       bmw;
-
+   bmw256_4way_context          bmw;
+//        sph_bmw256_context       bmw;
 } lyra2v2_4way_ctx_holder;

 static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
@@ -29,7 +29,8 @@ void init_lyra2rev2_4way_ctx()
   keccak256_4way_init( &l2v2_4way_ctx.keccak );
   cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
   skein256_4way_init( &l2v2_4way_ctx.skein );
-        sph_bmw256_init( &l2v2_4way_ctx.bmw );
+   bmw256_4way_init( &l2v2_4way_ctx.bmw );
+//        sph_bmw256_init( &l2v2_4way_ctx.bmw );
 }

 void lyra2rev2_4way_hash( void *state, const void *input )
@@ -80,23 +81,26 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );


-	sph_bmw256( &ctx.bmw, hash0, 32 );
-	sph_bmw256_close( &ctx.bmw, hash0 );
-        memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
-        sph_bmw256( &ctx.bmw, hash1, 32 );
-        sph_bmw256_close( &ctx.bmw, hash1 );
-        memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
-        sph_bmw256( &ctx.bmw, hash2, 32 );
-        sph_bmw256_close( &ctx.bmw, hash2 );
-        memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
-        sph_bmw256( &ctx.bmw, hash3, 32 );
-        sph_bmw256_close( &ctx.bmw, hash3 );
+   // BMW256 4way has a lane corruption problem: only lanes 0 & 2 produce a
+   // good hash. As an ugly workaround, bmw256-4way is run twice with the
+   // data shuffled so that all 4 hashes are computed in a good lane.
+   // The hashes are then shuffled back into the appropriate lanes for output.
+   // Not as fast, but still faster than using sph serially.

-
-   memcpy( state,    hash0, 32 );
-   memcpy( state+32, hash1, 32 );
-   memcpy( state+64, hash2, 32 );
-   memcpy( state+96, hash3, 32 );
+   // shift lane 1 data to lane 2.
+   mm_interleave_4x32( vhash, hash0, hash0, hash1, hash1, 256 );
+   bmw256_4way( &ctx.bmw, vhash, 32 );
+   bmw256_4way_close( &ctx.bmw, vhash );
+   uint32_t trash[8] __attribute__ ((aligned (32)));
+   // extract lane 0 as usual and lane2 containing lane 1 hash
+   mm_deinterleave_4x32( state, trash, state+32, trash, vhash, 256 );
+   // shift lane2 data to lane 0 and lane 3 data to lane 2
+   mm_interleave_4x32( vhash, hash2, hash2, hash3, hash3, 256 );
+   bmw256_4way_init( &ctx.bmw );
+   bmw256_4way( &ctx.bmw, vhash, 32 );
+   bmw256_4way_close( &ctx.bmw, vhash );
+   // extract lane 2 hash from lane 0 and lane 3 hash from lane 2.
+   mm_deinterleave_4x32( state+64, trash, state+96, trash, vhash, 256 );
 }

 int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -140,6 +144,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,

      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
      {
+//printf("found0\n");
          found[0] = true;
          num_found++;
          nonces[0] = pdata[19] = n;
@@ -147,6 +152,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
      }
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
      {
+//printf("found1\n");
          found[1] = true;
          num_found++;
          nonces[1] = n+1;
@@ -154,6 +160,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
      }
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
      {
+//printf("found2\n");
          found[2] = true;
          num_found++;
          nonces[2] = n+2;
@@ -161,6 +168,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
      }
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
      {
+//printf("found3\n");
          found[3] = true;
          num_found++;
          nonces[3] = n+3;
--- a/algo/lyra2/lyra2rev2-gate.c
+++ b/algo/lyra2/lyra2rev2-gate.c
@@ -29,7 +29,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev2;
  gate->hash      = (void*)&lyra2rev2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  gate->set_target        = (void*)&lyra2rev2_set_target;
  return true;
--- a/algo/lyra2/lyra2rev2-gate.h
+++ b/algo/lyra2/lyra2rev2-gate.h
@@ -5,7 +5,7 @@
 #include <stdint.h>
 #include "lyra2.h"

-#if defined(HASH_4WAY)
+#if defined(__AVX2__)
  #define LYRA2REV2_4WAY
 #endif

--- a/algo/lyra2/lyra2z-gate.c
+++ b/algo/lyra2/lyra2z-gate.c
@@ -17,7 +17,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2z;
  gate->hash       = (void*)&lyra2z_hash;
 #endif
-  gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = AVX_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  gate->set_target = (void*)&lyra2z_set_target;
  return true;
--- a/algo/lyra2/lyra2z-gate.h
+++ b/algo/lyra2/lyra2z-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY)
+#if defined(__AVX2__)
  #define LYRA2Z_4WAY
 #endif

--- a/algo/nist5/nist5-gate.c
+++ b/algo/nist5/nist5-gate.c
@@ -2,7 +2,7 @@

 bool register_nist5_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
+    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
 #if defined (NIST5_4WAY)
    gate->scanhash = (void*)&scanhash_nist5_4way;
    gate->hash     = (void*)&nist5hash_4way;
--- a/algo/nist5/nist5-gate.h
+++ b/algo/nist5/nist5-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define NIST5_4WAY
 #endif

--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -0,0 +1,231 @@
+#include "cpuminer-config.h"
+#include "anime-gate.h"
+
+#if defined (ANIME_4WAY)
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+
+typedef struct {                     // one context per hash primitive used by anime_4way_hash
+    blake512_4way_context  blake;
+    bmw512_4way_context    bmw;
+    hashState_groestl      groestl;  // not a *_4way context: anime_4way_hash runs groestl once per lane
+    jh512_4way_context     jh;
+    skein512_4way_context  skein;
+    keccak512_4way_context keccak;
+} anime_4way_ctx_holder;
+
+anime_4way_ctx_holder anime_4way_ctx __attribute__ ((aligned (64)));
+
+void init_anime_4way_ctx()
+{   // fill the global template context that anime_4way_hash() copies on every call
+    blake512_4way_init( &anime_4way_ctx.blake );
+    bmw512_4way_init( &anime_4way_ctx.bmw );
+    init_groestl( &anime_4way_ctx.groestl, 64 );
+    jh512_4way_init( &anime_4way_ctx.jh );
+    skein512_4way_init( &anime_4way_ctx.skein );
+    keccak512_4way_init( &anime_4way_ctx.keccak );
+}
+
+void anime_4way_hash( void *state, const void *input )
+{   // 4-lane anime hash: input is 80 bytes per lane, 4x64 interleaved; state gets 4 x 32 bytes
+    uint64_t hash0[8] __attribute__ ((aligned (64)));
+    uint64_t hash1[8] __attribute__ ((aligned (64)));
+    uint64_t hash2[8] __attribute__ ((aligned (64)));
+    uint64_t hash3[8] __attribute__ ((aligned (64)));
+    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
+    __m256i* vh  = (__m256i*)vhash;
+    __m256i* vhA = (__m256i*)vhashA;
+    __m256i* vhB = (__m256i*)vhashB;
+    __m256i vh_mask;
+    __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
+    int i;
+    anime_4way_ctx_holder ctx;
+    memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
+    // Stage 1: bmw512 over the 80-byte input, then blake512 over that digest (same order as anime_hash).
+    bmw512_4way( &ctx.bmw, input, 80 );
+    bmw512_4way_close( &ctx.bmw, vhash );
+
+    blake512_4way( &ctx.blake, vhash, 64 );
+    blake512_4way_close( &ctx.blake, vhash );
+    // Per-lane selector: all-ones in the lanes whose first word has bit 3 CLEAR.
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                  mm256_zero );
+    // Both alternatives are computed for every lane; the blend below keeps one per lane.
+       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                               (char*)hash0, 512 );
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                               (char*)hash1, 512 );
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                               (char*)hash2, 512 );
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                               (char*)hash3, 512 );
+       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+       skein512_4way( &ctx.skein, vhash, 64 );
+       skein512_4way_close( &ctx.skein, vhashB );
+    // bit 3 set -> groestl result (vhashA); bit 3 clear -> skein result (vhashB)
+    for ( i = 0; i < 8; i++ )
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
+    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+    mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+    jh512_4way( &ctx.jh, vhash, 64 );
+    jh512_4way_close( &ctx.jh, vhash );
+
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                  mm256_zero );
+    // bit 3 set -> blake (vhashA); bit 3 clear -> bmw (vhashB). Both contexts were used above, so re-init.
+       blake512_4way_init( &ctx.blake );
+       blake512_4way( &ctx.blake, vhash, 64 );
+       blake512_4way_close( &ctx.blake, vhashA );
+
+       bmw512_4way_init( &ctx.bmw );
+       bmw512_4way( &ctx.bmw, vhash, 64 );
+       bmw512_4way_close( &ctx.bmw, vhashB );
+
+    for ( i = 0; i < 8; i++ )
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
+    keccak512_4way( &ctx.keccak, vhash, 64 );
+    keccak512_4way_close( &ctx.keccak, vhash );
+
+    skein512_4way_init( &ctx.skein );
+    skein512_4way( &ctx.skein, vhash, 64 );
+    skein512_4way_close( &ctx.skein, vhash );
+
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                  mm256_zero );
+    // bit 3 set -> keccak (vhashA); bit 3 clear -> jh (vhashB)
+       keccak512_4way_init( &ctx.keccak );
+       keccak512_4way( &ctx.keccak, vhash, 64 );
+       keccak512_4way_close( &ctx.keccak, vhashA );
+
+       jh512_4way_init( &ctx.jh );
+       jh512_4way( &ctx.jh, vhash, 64 );
+       jh512_4way_close( &ctx.jh, vhashB );
+
+    for ( i = 0; i < 8; i++ )
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    // Write the first 256 bits of each lane's digest, 32 bytes apart, into state.
+    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
+}
+
+int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done)
+{   // Tests four consecutive nonces per iteration; returns how many lanes met the target.
+    uint32_t hash[4*8] __attribute__ ((aligned (64)));
+    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+    uint32_t endiandata[20] __attribute__((aligned(64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint32_t n = pdata[19];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t *nonces = work->nonces;
+    bool *found = work->nfound;
+    int num_found = 0;
+    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+    uint32_t *noncep1 = vdata + 75;
+    uint32_t *noncep2 = vdata + 77;
+    uint32_t *noncep3 = vdata + 79;
+    const uint32_t Htarg = ptarget[7];
+    uint64_t htmax[] = {
+                0,
+                0xF,
+                0xFF,
+                0xFFF,
+                0xFFFF,
+                0x10000000
+        };
+    uint32_t masks[] = {
+                0xFFFFFFFF,
+                0xFFFFFFF0,
+                0xFFFFFF00,
+                0xFFFFF000,
+                0xFFFF0000,
+                0
+        };
+
+    swab32_array( endiandata, pdata, 20 );
+    // Replicate the byte-swapped header into all four lanes; only the nonce differs per lane.
+    uint64_t *edata = (uint64_t*)endiandata;
+    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    // Pick the first pre-filter mask whose threshold covers Htarg, then scan with it.
+    for (int m=0; m < 6; m++)
+       if (Htarg <= htmax[m])
+       {
+          uint32_t mask = masks[m];
+
+          do
+          {
+              found[0] = found[1] = found[2] = found[3] = false;
+              be32enc( noncep0, n   );
+              be32enc( noncep1, n+1 );
+              be32enc( noncep2, n+2 );
+              be32enc( noncep3, n+3 );
+
+              anime_4way_hash( hash, vdata );
+              pdata[19] = n;
+             // Lane k's 8-word hash lives at hash + 8*k; each lane is reported with its own hash.
+             if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+             {
+                found[0] = true;
+                num_found++;
+                nonces[0] = n;
+                work_set_target_ratio( work, hash );
+             }
+             if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
+             {
+                found[1] = true;
+                num_found++;
+                nonces[1] = n+1;
+                work_set_target_ratio( work, hash+8 );
+             }
+             if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
+             {
+                found[2] = true;
+                num_found++;
+                nonces[2] = n+2;
+                work_set_target_ratio( work, hash+16 );
+             }
+             if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
+             {
+                found[3] = true;
+                num_found++;
+                nonces[3] = n+3;
+                work_set_target_ratio( work, hash+24 );
+             }
+             n += 4;
+          } while ( ( num_found == 0 ) && ( n < max_nonce )
+              && !work_restart[thr_id].restart );
+          break;
+       }
+
+    *hashes_done = n - first_nonce + 1;
+    return num_found;
+}
+
+#endif
--- a/algo/quark/anime-gate.c
+++ b/algo/quark/anime-gate.c
@@ -0,0 +1,17 @@
+#include "anime-gate.h"
+
+bool register_anime_algo( algo_gate_t* gate )
+{   // Installs the anime scanhash/hash entry points on the gate; always returns true.
+#if defined (ANIME_4WAY)
+  init_anime_4way_ctx();                          // 4-lane build: prime its template context
+  gate->scanhash  = (void*)&scanhash_anime_4way;
+  gate->hash      = (void*)&anime_4way_hash;
+#else
+  init_anime_ctx();                               // scalar build: prime its template context
+  gate->scanhash  = (void*)&scanhash_anime;
+  gate->hash      = (void*)&anime_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  return true;
+}
+
--- a/algo/quark/anime-gate.h
+++ b/algo/quark/anime-gate.h
@@ -0,0 +1,32 @@
+#ifndef ANIME_GATE_H__
+#define ANIME_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define ANIME_4WAY
+#endif
+
+bool register_anime_algo( algo_gate_t* gate );
+
+#if defined(ANIME_4WAY)
+
+void anime_4way_hash( void *state, const void *input );
+
+int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_anime_4way_ctx();
+
+#endif
+
+void anime_hash( void *state, const void *input );
+
+int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_anime_ctx();
+
+#endif
+
--- a/algo/quark/anime.c
+++ b/algo/quark/anime.c
@@ -0,0 +1,189 @@
+#include "cpuminer-config.h"
+#include "anime-gate.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include "algo/blake/sph_blake.h"
+#include "algo/bmw/sph_bmw.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/keccak/sph_keccak.h"
+#ifdef __AES__
+ #include "algo/groestl/aes_ni/hash-groestl.h"
+#else
+ #include "algo/groestl/sph_groestl.h"
+#endif
+
+typedef struct {                     // one context per hash primitive used by anime_hash
+    sph_blake512_context  blake;
+    sph_bmw512_context    bmw;
+#ifdef __AES__
+    hashState_groestl groestl;       // implementation from algo/groestl/aes_ni when __AES__ is defined
+#else
+    sph_groestl512_context groestl;  // sph implementation otherwise
+#endif
+    sph_jh512_context      jh;
+    sph_skein512_context   skein;
+    sph_keccak512_context  keccak;
+} anime_ctx_holder;
+
+anime_ctx_holder anime_ctx __attribute__ ((aligned (64)));
+
+void init_anime_ctx()
+{   // fill the global template context that anime_hash() copies on every call
+    sph_blake512_init( &anime_ctx.blake );
+    sph_bmw512_init( &anime_ctx.bmw );
+#ifdef __AES__
+    init_groestl( &anime_ctx.groestl, 64 );
+#else
+    sph_groestl512_init( &anime_ctx.groestl );
+#endif
+    sph_jh512_init( &anime_ctx.jh );
+    sph_skein512_init( &anime_ctx.skein );
+    sph_keccak512_init( &anime_ctx.keccak );
+}
+
+void anime_hash( void *state, const void *input )
+{
+    unsigned char hash[128] __attribute__ ((aligned (32)));
+/*
+ * Chained hash over an 80-byte input, producing a 32-byte result:
+ *    bmw512 -> blake512
+ *    -> (hash[0] & 8) ? groestl512 : skein512
+ *    -> groestl512 -> jh512
+ *    -> (hash[0] & 8) ? blake512 : bmw512
+ *    -> keccak512 -> skein512
+ *    -> (hash[0] & 8) ? keccak512 : jh512
+ * Every stage after the first consumes the previous 64-byte digest.
+ * The contexts start as a copy of the global anime_ctx template; any
+ * primitive that can run a second time is re-initialised in place,
+ * either right after its conditional use or right before its reuse.
+ * Only the first 32 bytes of the final digest are copied to state.
+*/
+    uint32_t mask = 8;
+    anime_ctx_holder ctx;
+    memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) );
+
+    sph_bmw512( &ctx.bmw, input, 80 );
+    sph_bmw512_close( &ctx.bmw, hash );
+
+    sph_blake512( &ctx.blake, hash, 64 );
+    sph_blake512_close( &ctx.blake, hash );
+
+    if ( ( hash[0] & mask ) != 0 )   // branch on bit 3 of the first digest byte
+    {
+#ifdef __AES__
+       update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 );
+       reinit_groestl( &ctx.groestl );   // groestl runs again unconditionally below
+#else
+       sph_groestl512 ( &ctx.groestl, hash, 64 );
+       sph_groestl512_close( &ctx.groestl, hash );
+       sph_groestl512_init( &ctx.groestl );
+#endif
+    }
+    else
+    {
+       sph_skein512( &ctx.skein, hash, 64 );
+       sph_skein512_close( &ctx.skein, hash );
+       sph_skein512_init( &ctx.skein );   // skein runs again unconditionally below
+    }
+
+#ifdef __AES__
+    update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 );
+#else
+    sph_groestl512 ( &ctx.groestl, hash, 64 );
+    sph_groestl512_close( &ctx.groestl, hash );
+#endif
+
+    sph_jh512( &ctx.jh, hash, 64 );
+    sph_jh512_close( &ctx.jh, hash );
+
+    if ( ( hash[0] & mask ) != 0 )
+    {
+       sph_blake512_init( &ctx.blake );   // blake was already used in the first stage
+       sph_blake512( &ctx.blake, hash, 64 );
+       sph_blake512_close( &ctx.blake, hash );
+    }
+    else
+    {
+       sph_bmw512_init( &ctx.bmw );       // bmw was already used in the first stage
+       sph_bmw512( &ctx.bmw, hash, 64 );
+       sph_bmw512_close( &ctx.bmw, hash );
+    }
+
+    sph_keccak512( &ctx.keccak, hash, 64 );
+    sph_keccak512_close( &ctx.keccak, hash );
+
+    sph_skein512( &ctx.skein, hash, 64 );
+    sph_skein512_close( &ctx.skein, hash );
+
+    if ( ( hash[0] & mask ) != 0 )
+    {
+       sph_keccak512_init( &ctx.keccak ); // keccak was used two stages earlier
+       sph_keccak512( &ctx.keccak, hash, 64 );
+       sph_keccak512_close( &ctx.keccak, hash );
+    }
+    else
+    {
+       sph_jh512_init( &ctx.jh );         // jh was used earlier in the chain
+       sph_jh512( &ctx.jh, hash, 64 );
+       sph_jh512_close( &ctx.jh, hash );
+    }
+
+   memcpy( state, hash, 32 );             // truncate the 64-byte digest to 32 bytes
+}
+
+int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done)
+{   // Scans nonces one at a time; returns nonzero when a hash meets the target, 0 otherwise.
+    uint32_t hash[8] __attribute__ ((aligned (64)));
+    uint32_t endiandata[20] __attribute__((aligned(64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint32_t n = pdata[19];
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t Htarg = ptarget[7];
+    uint64_t htmax[] = {
+                0,
+                0xF,
+                0xFF,
+                0xFFF,
+                0xFFFF,
+                0x10000000
+        };
+    uint32_t masks[] = {
+                0xFFFFFFFF,
+                0xFFFFFFF0,
+                0xFFFFFF00,
+                0xFFFFF000,
+                0xFFFF0000,
+                0
+        };
+
+    swab32_array( endiandata, pdata, 20 );
+    // Pick the first pre-filter mask whose threshold covers Htarg, then scan with it.
+    for (int m=0; m < 6; m++)
+       if (Htarg <= htmax[m])
+       {
+          uint32_t mask = masks[m];
+          do
+          {
+              be32enc( &endiandata[19], n );
+              anime_hash( hash, endiandata );
+              pdata[19] = n;
+
+             if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+             {
+                work_set_target_ratio( work, hash );
+                *hashes_done = n - first_nonce + 1;   // n itself was tested, hence the +1
+                return true;
+             }
+             n++;
+          } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
+          break;
+       }
+    *hashes_done = n - first_nonce;   // also report the work done when nothing was found
+    pdata[19] = n;
+    return 0;
+}
+
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -1,7 +1,7 @@
 #include "cpuminer-config.h"
 #include "quark-gate.h"

-#if defined (__AVX2__) && defined (__AES__)
+#if defined (QUARK_4WAY)

 #include <stdio.h>
 #include <string.h>
--- a/algo/quark/quark-gate.c
+++ b/algo/quark/quark-gate.c
@@ -11,7 +11,7 @@ bool register_quark_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_quark;
  gate->hash      = (void*)&quark_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  return true;
 };

--- a/algo/quark/quark-gate.h
+++ b/algo/quark/quark-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define QUARK_4WAY
 #endif

--- a/algo/sha/md-helper-4way.c
+++ b/algo/sha/md-helper-4way.c
@@ -0,0 +1,270 @@
+/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
+/*
+ * This file contains some functions which implement the external data
+ * handling and padding for Merkle-Damgard hash functions which follow
+ * the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
+ *
+ * API: this file is meant to be included, not compiled as a stand-alone
+ * file. Some macros must be defined:
+ *   RFUN   name for the round function
+ *   HASH   "short name" for the hash function
+ *   BE32   defined for big-endian, 32-bit based (e.g. SHA-1)
+ *   LE32   defined for little-endian, 32-bit based (e.g. MD5)
+ *   BE64   defined for big-endian, 64-bit based (e.g. SHA-512)
+ *   LE64   defined for little-endian, 64-bit based (no example yet)
+ *   PW01   if defined, append 0x01 instead of 0x80 (for Tiger)
+ *   BLEN   if defined, length of a message block (in bytes)
+ *   PLW1   if defined, length is defined on one 64-bit word only (for Tiger)
+ *   PLW4   if defined, length is defined on four 64-bit words (for WHIRLPOOL)
+ *   SVAL   if defined, reference to the context state information
+ *
+ * BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
+ * this is used for instance for Tiger, which works on 64-bit words but
+ * uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
+ * ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
+ * set, then only one word (64 bits) will be used to encode the input
+ * message length (in bits), otherwise two words will be used (as in
+ * SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
+ * not PLW1), four 64-bit words will be used to encode the message length
+ * (in bits). Note that regardless of those settings, only 64-bit message
+ * lengths are supported (in bits): messages longer than 2 Exabytes will be
+ * improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
+ * 2 million Terabytes, which is huge).
+ *
+ * If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
+ * function. This is used for Tiger2, which is identical to Tiger except
+ * when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
+ * of the 0x01 from original Tiger).
+ *
+ * The RFUN function is invoked with two arguments, the first pointing to
+ * aligned data (as a "const void *"), the second being state information
+ * from the context structure. By default, this state information is the
+ * "val" field from the context, and this field is assumed to be an array
+ * of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
+ * The "val" field can have any type, except
+ * for the output encoding which assumes that it is an array of "sph_u32"
+ * values. By defining NO_OUTPUT, this last step is deactivated; the
+ * includer code is then responsible for writing out the hash result. When
+ * NO_OUTPUT is defined, the third parameter to the "close()" function is
+ * ignored.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#undef SPH_XCAT
+#define SPH_XCAT(a, b)     SPH_XCAT_(a, b)
+#undef SPH_XCAT_
+#define SPH_XCAT_(a, b)    a ## b
+
+#undef SPH_BLEN
+#undef SPH_WLEN
+#if defined BE64 || defined LE64
+#define SPH_BLEN    128U
+#define SPH_WLEN      8U
+#else
+#define SPH_BLEN     64U
+#define SPH_WLEN      4U
+#endif
+
+#ifdef BLEN
+#undef SPH_BLEN
+#define SPH_BLEN    BLEN
+#endif
+
+#undef SPH_MAXPAD
+#if defined PLW1
+#define SPH_MAXPAD   (SPH_BLEN - SPH_WLEN)
+#elif defined PLW4
+#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 2))
+#else
+#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 1))
+#endif
+
+#undef SPH_VAL
+#undef SPH_NO_OUTPUT
+#ifdef SVAL
+#define SPH_VAL         SVAL
+#define SPH_NO_OUTPUT   1
+#else
+#define SPH_VAL   sc->val
+#endif
+
+#ifndef CLOSE_ONLY
+
+#ifdef SPH_UPTR
+static void
+SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
+#else
+void
+HASH ( void *cc, const void *data, size_t len )
+#endif
+{  // Appends len bytes of vector data to sc->buf and runs RFUN each time a block fills up.
+   SPH_XCAT( HASH, _context ) *sc;
+   __m256i *vdata = (__m256i*)data;
+   size_t ptr;
+
+   sc = cc;
+   ptr = (unsigned)sc->count & (SPH_BLEN - 1U);   // byte offset inside the current block
+   while ( len > 0 )
+   {
+      size_t clen;
+      clen = SPH_BLEN - ptr;                       // room left in the current block
+      if ( clen > len )
+         clen = len;
+      memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );  // NOTE(review): byte counts are shifted by 3 to index __m256i elements; presumably lengths must be multiples of 8 - confirm
+      vdata = vdata + (clen>>3);
+      ptr += clen;
+      len -= clen;
+      if ( ptr == SPH_BLEN )                       // block complete: compress it and start over
+      {
+         RFUN( sc->buf, SPH_VAL );
+         ptr = 0;
+      }
+         sc->count += clen;                        // runs on every iteration (not part of the if above)
+   }
+}
+
+#ifdef SPH_UPTR
+void
+HASH (void *cc, const void *data, size_t len)
+{  // Public update entry when SPH_UPTR is defined; all buffering is delegated to HASH_short.
+   SPH_XCAT(HASH, _context) *sc;
+   __m256i *vdata = (__m256i*)data;
+   unsigned ptr;
+
+   if ( len < (2 * SPH_BLEN) )
+   {
+      SPH_XCAT(HASH, _short)(cc, data, len);
+      return;
+   }
+   sc = cc;
+   ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
+   if ( ptr > 0 )
+   {
+      unsigned t;
+      t = SPH_BLEN - ptr;      // bytes needed to complete the partially filled block
+      SPH_XCAT( HASH, _short )( cc, data, t );
+      vdata = vdata + (t>>3);  // step past the vectors just consumed
+      len -= t;
+   }
+   SPH_XCAT( HASH, _short )( cc, vdata, len );  // resume after the consumed prefix, not from data
+}
+#endif
+
+#endif
+
+/*
+ * Perform padding and produce result. The context is NOT reinitialized
+ * by this function.
+ */
+static void
+SPH_XCAT( HASH, _addbits_and_close )(void *cc, 	unsigned ub, unsigned n,
+          void *dst, unsigned rnum )  // ub and n are accepted but not used by this body
+{
+    SPH_XCAT(HASH, _context) *sc;
+    unsigned ptr, u;
+    sc = cc;
+    ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
+    // Write the padding marker as one full 64-bit word in every lane.
+#ifdef PW01
+    sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x100 >> 8 );
+#else
+    sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
+#endif
+    ptr += 8;
+    // No room left for the length field: flush this block and pad a fresh one.
+    if ( ptr > SPH_MAXPAD )
+    {
+         memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
+         RFUN( sc->buf, SPH_VAL );
+         memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
+    }
+    else
+    {
+         memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
+    }
+#if defined BE64
+#if defined PLW1
+    sc->buf[ SPH_MAXPAD>>3 ] =
+                 mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+#elif defined PLW4
+    memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
+    sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
+                mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+    sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
+                mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+#else
+    sc->buf[ SPH_MAXPAD >> 3 ] =
+               mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
+               mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+#endif  // PLW
+#else  // LE64
+#if defined PLW1
+    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
+#elif defined PLW4
+    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
+    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
+                       _mm256_set1_epi64x( sc->count >> 61 );
+    memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
+                       ( 2 * SPH_WLEN ) >> 3 );
+#else
+    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
+    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
+                          _mm256_set1_epi64x( sc->count >> 61 );
+#endif // PLW
+
+#endif // LE64
+    // The length field above occupies the last words of the block; compress the final block.
+    RFUN( sc->buf, SPH_VAL );
+
+#ifdef SPH_NO_OUTPUT
+    (void)dst;
+    (void)rnum;
+    (void)u;
+#else
+    for ( u = 0; u < rnum; u ++ )
+    {
+#if defined BE64
+       ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
+#else  // LE64
+       ((__m256i*)dst)[u] = sc->val[u];
+#endif
+    }
+#endif
+}
+
+static void
+SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
+{  // Close helper: forwards to HASH_addbits_and_close with ub = 0 and n = 0.
+   SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
+}
--- a/algo/sha/sha2-big-4way.c
+++ b/algo/sha/sha2-big-4way.c
@@ -0,0 +1,247 @@
+/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
+/*
+ * SHA-384 / SHA-512 implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_sha2.h"
+
+#if SPH_64
+
+#define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
+#define MAJ(X, Y, Z)   (((X) & (Y)) | (((X) | (Y)) & (Z)))
+
+#define ROTR64    SPH_ROTR64
+
+#define BSG5_0(x)      (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
+#define BSG5_1(x)      (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
+#define SSG5_0(x)      (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7))
+#define SSG5_1(x)      (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6))
+
+static const sph_u64 K512[80] = {
+	SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
+	SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
+	SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
+	SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
+	SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
+	SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
+	SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
+	SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
+	SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
+	SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
+	SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
+	SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
+	SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
+	SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
+	SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
+	SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
+	SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
+	SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
+	SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
+	SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
+	SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
+	SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
+	SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
+	SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
+	SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
+	SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
+	SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
+	SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
+	SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
+	SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
+	SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
+	SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
+	SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
+	SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
+	SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
+	SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
+	SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
+	SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
+	SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
+	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
+};
+
+static const sph_u64 H384[8] = {
+	SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
+	SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
+	SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
+	SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
+};
+
+static const sph_u64 H512[8] = {
+	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+
+/*
+ * This macro defines the body for a SHA-384 / SHA-512 compression function
+ * implementation. The "in" parameter should evaluate, when applied to a
+ * numerical input parameter from 0 to 15, to an expression which yields
+ * the corresponding input block. The "r" parameter should evaluate to
+ * an array or pointer expression designating the array of 8 words which
+ * contains the input and output of the compression function.
+ *
+ * SHA-512 is hard for the compiler. If the loop is completely unrolled,
+ * then the code will be quite huge (possibly more than 100 kB), and the
+ * performance will be degraded due to cache misses on the code. We
+ * unroll only eight steps, which avoids all needless copies when
+ * 64-bit registers are swapped.
+ */
+
+#define SHA3_STEP(A, B, C, D, E, F, G, H, i)   do { \
+		sph_u64 T1, T2; \
+		T1 = SPH_T64(H + BSG5_1(E) + CH(E, F, G) + K512[i] + W[i]); \
+		T2 = SPH_T64(BSG5_0(A) + MAJ(A, B, C)); \
+		D = SPH_T64(D + T1); \
+		H = SPH_T64(T1 + T2); \
+	} while (0)
+
+#define SHA3_ROUND_BODY(in, r)   do { \
+		int i; \
+		sph_u64 A, B, C, D, E, F, G, H; \
+		sph_u64 W[80]; \
+ \
+ 		for (i = 0; i < 16; i ++) \
+			W[i] = in(i); \
+		for (i = 16; i < 80; i ++) \
+ 			W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7] \
+				+ SSG5_0(W[i - 15]) + W[i - 16]); \
+		A = (r)[0]; \
+		B = (r)[1]; \
+		C = (r)[2]; \
+		D = (r)[3]; \
+		E = (r)[4]; \
+		F = (r)[5]; \
+		G = (r)[6]; \
+		H = (r)[7]; \
+		for (i = 0; i < 80; i += 8) { \
+			SHA3_STEP(A, B, C, D, E, F, G, H, i + 0); \
+			SHA3_STEP(H, A, B, C, D, E, F, G, i + 1); \
+			SHA3_STEP(G, H, A, B, C, D, E, F, i + 2); \
+			SHA3_STEP(F, G, H, A, B, C, D, E, i + 3); \
+			SHA3_STEP(E, F, G, H, A, B, C, D, i + 4); \
+			SHA3_STEP(D, E, F, G, H, A, B, C, i + 5); \
+			SHA3_STEP(C, D, E, F, G, H, A, B, i + 6); \
+			SHA3_STEP(B, C, D, E, F, G, H, A, i + 7); \
+		} \
+		(r)[0] = SPH_T64((r)[0] + A); \
+		(r)[1] = SPH_T64((r)[1] + B); \
+		(r)[2] = SPH_T64((r)[2] + C); \
+		(r)[3] = SPH_T64((r)[3] + D); \
+		(r)[4] = SPH_T64((r)[4] + E); \
+		(r)[5] = SPH_T64((r)[5] + F); \
+		(r)[6] = SPH_T64((r)[6] + G); \
+		(r)[7] = SPH_T64((r)[7] + H); \
+	} while (0)
+
+/*
+ * One round of SHA-384 / SHA-512: reads 16 big-endian 64-bit words from data and
+ * updates the 8-word state r in place. The data must be aligned for 64-bit access.
+ */
+static void
+sha3_round(const unsigned char *data, sph_u64 r[8])
+{
+#define SHA3_IN(x)   sph_dec64be_aligned(data + (8 * (x)))
+	SHA3_ROUND_BODY(SHA3_IN, r);
+#undef SHA3_IN
+}
+
+/* see sph_sha2.h */
+void
+sph_sha384_init(void *cc)
+{
+	/* Clear the byte counter and load the SHA-384 initial state. */
+	sph_sha384_context *ctx = cc;
+
+	ctx->count = 0;
+	memcpy(ctx->val, H384, sizeof H384);
+}
+
+/* see sph_sha2.h */
+void
+sph_sha512_init(void *cc)
+{
+	/* Clear the byte counter and load the SHA-512 initial state. */
+	sph_sha512_context *ctx = cc;
+
+	ctx->count = 0;
+	memcpy(ctx->val, H512, sizeof H512);
+}
+
+#define RFUN   sha3_round
+#define HASH   sha384
+#define BE64   1
+#include "md_helper.c"
+
+/* see sph_sha2.h */
+void
+sph_sha384_close(void *cc, void *dst)
+{
+	sha384_close(cc, dst, 6);   /* 6 state words are written out */
+//	No sph_sha384_init(cc) here: the context is not re-initialised after closing.
+}
+
+/* see sph_sha2.h */
+void
+sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sha384_addbits_and_close(cc, ub, n, dst, 6);   /* 6 state words are written out */
+//	No sph_sha384_init(cc) here: the context is not re-initialised after closing.
+}
+
+/* see sph_sha2.h */
+void
+sph_sha512_close(void *cc, void *dst)
+{
+	sha384_close(cc, dst, 8);   /* reuses the sha384_* helper (HASH is sha384); 8 words out */
+//	No sph_sha512_init(cc) here: the context is not re-initialised after closing.
+}
+
+/* see sph_sha2.h */
+void
+sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sha384_addbits_and_close(cc, ub, n, dst, 8);   /* reuses the sha384_* helper; 8 words out */
+//	No sph_sha512_init(cc) here: the context is not re-initialised after closing.
+}
+
+/* see sph_sha2.h */
+void
+sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8])
+{  /* Same round body as sha3_round, but msg is used as 16 already-decoded words. */
+#define SHA3_IN(x)   msg[x]
+	SHA3_ROUND_BODY(SHA3_IN, val);
+#undef SHA3_IN
+}
+
+#endif
--- a/algo/sha/sha2-hash-4way.c
+++ b/algo/sha/sha2-hash-4way.c
@@ -0,0 +1,236 @@
+/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
+/*
+ * 4-way AVX2 SHA-512 implementation, derived from the SHA-384 / SHA-512 code in sha2big.c.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sha2-hash-4way.h"
+
+#if defined(__AVX2__)
+
+static const sph_u64 K512[80] = {   /* per-step constants: K512[i] is added in step i */
+	SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
+	SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
+	SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
+	SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
+	SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
+	SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
+	SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
+	SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
+	SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
+	SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
+	SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
+	SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
+	SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
+	SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
+	SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
+	SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
+	SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
+	SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
+	SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
+	SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
+	SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
+	SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
+	SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
+	SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
+	SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
+	SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
+	SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
+	SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
+	SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
+	SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
+	SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
+	SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
+	SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
+	SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
+	SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
+	SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
+	SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
+	SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
+	SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
+	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
+};
+
+static const sph_u64 H512[8] = {   /* initial state words, broadcast to all lanes by sha512_4way_init */
+	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+
+#define CH(X, Y, Z) /* per bit: Y where X is set, Z elsewhere */ \
+   _mm256_xor_si256( Z, _mm256_and_si256( X, _mm256_xor_si256( Y, Z ) ) )
+
+#define MAJ(X, Y, Z) /* per bit: value held by at least two inputs */ \
+   _mm256_or_si256( _mm256_and_si256( Z, _mm256_or_si256( X, Y ) ), \
+                    _mm256_and_si256( X, Y ) )
+
+#define BSG5_0(x) /* rotr 28 ^ rotr 34 ^ rotr 39 */ \
+   _mm256_xor_si256( mm256_rotr_64(x, 28), \
+        _mm256_xor_si256( mm256_rotr_64(x, 34), mm256_rotr_64(x, 39) ) )
+
+#define BSG5_1(x) /* rotr 14 ^ rotr 18 ^ rotr 41 */ \
+   _mm256_xor_si256( mm256_rotr_64(x, 14), \
+        _mm256_xor_si256( mm256_rotr_64(x, 18), mm256_rotr_64(x, 41) ) )
+
+#define SSG5_0(x) /* rotr 1 ^ rotr 8 ^ (x >> 7) */ \
+   _mm256_xor_si256( _mm256_srli_epi64(x, 7), \
+        _mm256_xor_si256( mm256_rotr_64(x, 1), mm256_rotr_64(x, 8) ) )
+
+#define SSG5_1(x) /* rotr 19 ^ rotr 61 ^ (x >> 6) */ \
+   _mm256_xor_si256( _mm256_srli_epi64(x, 6), \
+        _mm256_xor_si256( mm256_rotr_64(x, 19), mm256_rotr_64(x, 61) ) )
+
+#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
+do { \
+  /* t1 = H + BSG5_1(E) + CH(E,F,G) + K512[i] + W[i];  t2 = BSG5_0(A) + MAJ(A,B,C) */ \
+  __m256i t1 = _mm256_add_epi64( W[i], _mm256_set1_epi64x( K512[i] ) ); \
+  __m256i t2 = _mm256_add_epi64( MAJ(A, B, C), BSG5_0(A) ); \
+  t1 = _mm256_add_epi64( t1, _mm256_add_epi64( CH(E, F, G), BSG5_1(E) ) ); \
+  t1 = _mm256_add_epi64( t1, H ); \
+  D  = _mm256_add_epi64( D, t1 ); \
+  H  = _mm256_add_epi64( t1, t2 ); \
+} while (0)
+
+// One SHA-512 compression per 64-bit lane. in[0..15]: message block as
+// stored (byte-swapped here before use); r[0..7]: state, updated in place.
+static void
+sha512_4way_round( __m256i *in, __m256i r[8] )
+{
+   __m256i W[80];   // message schedule; SHA3_4WAY_STEP refers to it by name
+   __m256i A = r[0], B = r[1], C = r[2], D = r[3];
+   __m256i E = r[4], F = r[5], G = r[6], H = r[7];
+   int t;
+
+   for ( t = 0; t < 16; t++ )
+      W[t] = mm256_byteswap_64( in[t] );
+   for ( ; t < 80; t++ )
+   {
+      const __m256i s1 = SSG5_1( W[ t-2 ] );
+      const __m256i s0 = SSG5_0( W[ t-15 ] );
+      W[t] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64(
+                               s1, W[ t-7 ] ), s0 ), W[ t-16 ] );
+   }
+
+   // 80 steps, unrolled by 8 with the working variables rotated each step.
+   for ( t = 0; t < 80; t += 8 )
+   {
+      SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, t + 0 );
+      SHA3_4WAY_STEP( H, A, B, C, D, E, F, G, t + 1 );
+      SHA3_4WAY_STEP( G, H, A, B, C, D, E, F, t + 2 );
+      SHA3_4WAY_STEP( F, G, H, A, B, C, D, E, t + 3 );
+      SHA3_4WAY_STEP( E, F, G, H, A, B, C, D, t + 4 );
+      SHA3_4WAY_STEP( D, E, F, G, H, A, B, C, t + 5 );
+      SHA3_4WAY_STEP( C, D, E, F, G, H, A, B, t + 6 );
+      SHA3_4WAY_STEP( B, C, D, E, F, G, H, A, t + 7 );
+   }
+
+   // Feed the working variables back into the state.
+   r[0] = _mm256_add_epi64( r[0], A );
+   r[1] = _mm256_add_epi64( r[1], B );
+   r[2] = _mm256_add_epi64( r[2], C );
+   r[3] = _mm256_add_epi64( r[3], D );
+   r[4] = _mm256_add_epi64( r[4], E );
+   r[5] = _mm256_add_epi64( r[5], F );
+   r[6] = _mm256_add_epi64( r[6], G );
+   r[7] = _mm256_add_epi64( r[7], H );
+}
+
+void sha512_4way_init( sha512_4way_context *sc )
+{
+   // Start a fresh hash: every 64-bit lane of the state gets the same
+   // H512 initial word, and the byte counter is cleared.
+   int i;
+
+   for ( i = 0; i < 8; i++ )
+   {
+      sc->val[i] = _mm256_set1_epi64x( H512[i] );
+   }
+   sc->count = 0;
+}
+
+void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
+{
+   // len and sc->count are per-lane byte counts; one __m256i holds 8 such
+   // bytes for each 64-bit lane, hence the >>3 when indexing vectors.
+   // NOTE(review): a len that is not a multiple of 8 appears to lose bytes.
+   const int block_bytes = 128;
+   __m256i *src = (__m256i*)data;
+   size_t fill = (unsigned)sc->count & (block_bytes - 1U);
+
+   while ( len > 0 )
+   {
+      size_t room = block_bytes - fill;
+      size_t take = ( room > len ) ? len : room;
+      memcpy_256( sc->buf + (fill>>3), src, take>>3 );
+      src += take>>3;
+      sc->count += take;
+      len -= take;
+      fill += take;
+      if ( fill == block_bytes )
+      {
+         sha512_4way_round( sc->buf, sc->val );
+         fill = 0;
+      }
+   }
+}
+
+void sha512_4way_close( sha512_4way_context *sc, void *dst )
+{
+    // Pad with a 0x80 marker word, zero fill and the 128-bit message bit
+    // length, run the last round(s), then emit the state byte-swapped.
+    const int block_bytes = 128;
+    const int len_offset = block_bytes - 16;
+    __m256i *out = (__m256i*)dst;
+    unsigned w, used = (unsigned)sc->count & (block_bytes - 1U);
+
+    sc->buf[ used>>3 ] = _mm256_set1_epi64x( 0x80 );
+    used += 8;
+    if ( used <= len_offset )
+         memset_zero_256( sc->buf + (used>>3), (len_offset - used) >> 3 );
+    else
+    {   // no room left for the length: flush, then start a zeroed block
+         memset_zero_256( sc->buf + (used>>3), (block_bytes - used) >> 3 );
+         sha512_4way_round( sc->buf, sc->val );
+         memset_zero_256( sc->buf, len_offset >> 3 );
+    }
+    sc->buf[ len_offset >> 3 ] =
+                 mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+    sc->buf[ ( len_offset+8 ) >> 3 ] =
+                 mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+    sha512_4way_round( sc->buf, sc->val );
+
+    for ( w = 0; w < 8; w++ )
+       out[w] = mm256_byteswap_64( sc->val[w] );
+}
+
+#endif
--- a/algo/sha/sha2-hash-4way.h
+++ b/algo/sha/sha2-hash-4way.h
@@ -0,0 +1,104 @@
+/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * SHA-224, SHA-256, SHA-384 and SHA-512 interface.
+ *
+ * SHA-256 has been published in FIPS 180-2, now amended with a change
+ * notice to include SHA-224 as well (which is a simple variation on
+ * SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS
+ * standards can be found at:
+ *    http://csrc.nist.gov/publications/fips/
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sha2-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SHA2_HASH_4WAY_H__
+#define SHA2_HASH_4WAY_H__ 1
+
+#include <stddef.h>
+#include "sph_types.h"
+#include "avxdefs.h"
+
+#if 0
+
+#define SPH_SIZE_sha224   224
+
+#define SPH_SIZE_sha256   256
+
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	sph_u32 val[8];
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+#endif
+} sph_sha224_context;
+
+typedef sph_sha224_context sph_sha256_context;
+
+void sph_sha224_init(void *cc);
+
+void sph_sha224(void *cc, const void *data, size_t len);
+
+void sph_sha224_close(void *cc, void *dst);
+
+void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
+
+void sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]);
+
+void sph_sha256_init(void *cc);
+
+void sph_sha256(void *cc, const void *data, size_t len);
+
+void sph_sha256_close(void *cc, void *dst);
+
+void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
+
+void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
+
+#endif
+
+#if defined (__AVX2__)
+
+#define SPH_SIZE_sha512   512
+
+typedef struct {          // 4-way SHA-512 context: one 64-bit lane per hash
+   __m256i buf[128>>3];   // pending 128-byte block per lane, 8 bytes per vector
+   __m256i val[8];        // hash state words
+   uint64_t count;        // bytes absorbed so far (per lane)
+} sha512_4way_context;
+// init resets sc; sha512_4way absorbs len bytes per lane; close pads and writes 8 vectors to dst
+void sha512_4way_init( sha512_4way_context *sc);
+void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
+void sha512_4way_close( sha512_4way_context *sc, void *dst );
+
+#endif
+#endif
--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -6,7 +6,7 @@ int64_t skein_get_max64() { return 0x7ffffLL; }

 bool register_skein_algo( algo_gate_t* gate )
 {
-    gate->optimizations = FOUR_WAY_OPT | SHA_OPT;
+    gate->optimizations = AVX2_OPT | SHA_OPT;
 #if defined (SKEIN_4WAY)
    gate->scanhash  = (void*)&scanhash_skein_4way;
    gate->hash      = (void*)&skeinhash_4way;
--- a/algo/skein/skein-gate.h
+++ b/algo/skein/skein-gate.h
@@ -3,7 +3,7 @@
 #include <stdint.h>
 #include "algo-gate-api.h"

-#if defined(FOUR_WAY) && defined(__AVX2__)
+#if defined(__AVX2__)
  #define SKEIN_4WAY
 #endif

--- a/algo/skein/skein2-gate.c
+++ b/algo/skein/skein2-gate.c
@@ -9,7 +9,7 @@ int64_t skein2_get_max64 ()

 bool register_skein2_algo( algo_gate_t* gate )
 {
-  gate->optimizations = FOUR_WAY_OPT;
+  gate->optimizations = AVX2_OPT;
 #if defined (FOUR_WAY) && defined (__AVX2__)
  gate->scanhash  = (void*)&scanhash_skein2_4way;
  gate->hash      = (void*)&skein2hash_4way;
--- a/algo/skein/skein2-gate.h
+++ b/algo/skein/skein2-gate.h
@@ -3,7 +3,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX2__)
+#if defined(__AVX2__)
  #define SKEIN2_4WAY
 #endif

--- a/algo/whirlpool/whirlpool-gate.c
+++ b/algo/whirlpool/whirlpool-gate.c
@@ -4,7 +4,7 @@ bool register_whirlpool_algo( algo_gate_t* gate )
 {
 #if defined (WHIRLPOOL_4WAY)
  four_way_not_tested();
-  gate->optimizations = FOUR_WAY_OPT;
+  gate->optimizations = AVX2_OPT;
  gate->scanhash  = (void*)&scanhash_whirlpool_4way;
  gate->hash      = (void*)&whirlpool_hash_4way;
 #else
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -1,7 +1,7 @@
 #include "cpuminer-config.h"
 #include "c11-gate.h"

-#if defined (__AVX2__) && defined (__AES__)
+#if defined (C11_4WAY)

 #include <string.h>
 #include <stdint.h>
--- a/algo/x11/c11-gate.c
+++ b/algo/x11/c11-gate.c
@@ -11,7 +11,7 @@ bool register_c11_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_c11;
  gate->hash      = (void*)&c11_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x11/c11-gate.h
+++ b/algo/x11/c11-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define C11_4WAY
 #endif

--- a/algo/x11/timetravel-4way.c
+++ b/algo/x11/timetravel-4way.c
@@ -1,6 +1,6 @@
 #include "timetravel-gate.h"

-#if defined(__AVX2__) && defined(__AES__)
+#if defined(TIMETRAVEL_4WAY)

 #include <stdlib.h>
 #include <stdint.h>
--- a/algo/x11/timetravel-gate.c
+++ b/algo/x11/timetravel-gate.c
@@ -17,7 +17,7 @@ bool register_timetravel_algo( algo_gate_t* gate )
  gate->hash       = (void*)&timetravel_hash;
 #endif
  gate->set_target = (void*)&tt8_set_target;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  return true;
 };
--- a/algo/x11/timetravel-gate.h
+++ b/algo/x11/timetravel-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define TIMETRAVEL_4WAY
 #endif

--- a/algo/x11/timetravel10-4way.c
+++ b/algo/x11/timetravel10-4way.c
@@ -1,6 +1,6 @@
 #include "timetravel10-gate.h"

-#if defined(__AVX2__) && defined(__AES__)
+#if defined(TIMETRAVEL10_4WAY)

 #include <stdlib.h>
 #include <stdint.h>
--- a/algo/x11/timetravel10-gate.c
+++ b/algo/x11/timetravel10-gate.c
@@ -17,7 +17,7 @@ bool register_timetravel10_algo( algo_gate_t* gate )
  gate->hash       = (void*)&timetravel10_hash;
 #endif
  gate->set_target = (void*)&tt10_set_target;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  return true;
 };
--- a/algo/x11/timetravel10-gate.h
+++ b/algo/x11/timetravel10-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define TIMETRAVEL10_4WAY
 #endif

--- a/algo/x11/tribus-4way.c
+++ b/algo/x11/tribus-4way.c
@@ -4,7 +4,7 @@
 #include <string.h>
 #include <stdio.h>

-#if defined(__AVX2__) && !defined(NO_AES_NI)
+#if defined(TRIBUS_4WAY)

 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
--- a/algo/x11/tribus-gate.c
+++ b/algo/x11/tribus-gate.c
@@ -2,7 +2,7 @@

 bool register_tribus_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64     = (void*)&get_max64_0x1ffff;
 #if defined (TRIBUS_4WAY)
 //  init_tribus_4way_ctx();
--- a/algo/x11/tribus-gate.h
+++ b/algo/x11/tribus-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define TRIBUS_4WAY
 #endif

--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -1,7 +1,7 @@
 #include "cpuminer-config.h"
 #include "x11-gate.h"

-#if defined (__AVX2__) && defined (__AES__)
+#if defined (X11_4WAY)

 #include <string.h>
 #include <stdint.h>
--- a/algo/x11/x11-gate.c
+++ b/algo/x11/x11-gate.c
@@ -11,7 +11,7 @@ bool register_x11_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11;
  gate->hash      = (void*)&x11_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x11/x11-gate.h
+++ b/algo/x11/x11-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define X11_4WAY
 #endif

--- a/algo/x11/x11evo-4way.c
+++ b/algo/x11/x11evo-4way.c
@@ -1,7 +1,7 @@
 #include "cpuminer-config.h"
 #include "x11evo-gate.h"

-#if defined(__AVX2__) && defined(__AES__)
+#if defined(X11EVO_4WAY)

 #include <string.h>
 #include <stdint.h>
--- a/algo/x11/x11evo-gate.c
+++ b/algo/x11/x11evo-gate.c
@@ -89,7 +89,7 @@ bool register_x11evo_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11evo;
  gate->hash      = (void*)&x11evo_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  return true;
 };

--- a/algo/x11/x11evo-gate.h
+++ b/algo/x11/x11evo-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define X11EVO_4WAY
 #endif

--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -1,7 +1,7 @@
 #include "cpuminer-config.h"
 #include "x11gost-gate.h"

-#if defined (__AVX2__) && defined (__AES__)
+#if defined (X11GOST_4WAY)

 #include <string.h>
 #include <stdint.h>
--- a/algo/x11/x11gost-gate.c
+++ b/algo/x11/x11gost-gate.c
@@ -11,7 +11,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11gost;
  gate->hash      = (void*)&x11gost_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x11/x11gost-gate.h
+++ b/algo/x11/x11gost-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define X11GOST_4WAY
 #endif

--- a/algo/x13/phi1612-4way.c
+++ b/algo/x13/phi1612-4way.c
@@ -1,6 +1,6 @@
-#include "x13-gate.h"
+#include "phi1612-gate.h"

-#if defined(__AVX2__) && defined(__AES__)
+#if defined(PHI1612_4WAY)

 #include <stdlib.h>
 #include <stdint.h>
--- a/algo/x13/phi1612-gate.c
+++ b/algo/x13/phi1612-gate.c
@@ -11,7 +11,7 @@ bool register_phi1612_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_phi1612;
  gate->hash      = (void*)&phi1612_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x13/phi1612-gate.h
+++ b/algo/x13/phi1612-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define PHI1612_4WAY
 #endif

--- a/algo/x13/skunk-4way.c
+++ b/algo/x13/skunk-4way.c
@@ -1,6 +1,6 @@
 #include "skunk-gate.h"

-#ifdef __AVX2__
+#if defined(SKUNK_4WAY)

 #include <stdlib.h>
 #include <stdint.h>
--- a/algo/x13/skunk-gate.c
+++ b/algo/x13/skunk-gate.c
@@ -2,7 +2,7 @@

 bool register_skunk_algo( algo_gate_t* gate )
 {
-   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
 #if defined (SKUNK_4WAY)
   gate->miner_thread_init = (void*)&skunk_4way_thread_init;
   gate->scanhash = (void*)&scanhash_skunk_4way;
--- a/algo/x13/skunk-gate.h
+++ b/algo/x13/skunk-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY)
+#if defined(__AVX2__)
  #define SKUNK_4WAY
 #endif

--- a/algo/x13/x13-4way.c
+++ b/algo/x13/x13-4way.c
@@ -1,6 +1,6 @@
 #include "x13-gate.h"

-#if defined(__AVX2__) && defined(__AES__)
+#if defined(X13_4WAY)

 #include <stdlib.h>
 #include <stdint.h>
@@ -17,7 +17,7 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/sse2/nist.h"
 #include "algo/echo/aes_ni/hash_api.h"
-#include "algo/hamsi/sph_hamsi.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"

 typedef struct {
@@ -32,7 +32,7 @@ typedef struct {
    sph_shavite512_context  shavite;
    hashState_sd            simd;
    hashState_echo          echo;
-    sph_hamsi512_context    hamsi;
+    hamsi512_4way_context   hamsi;
    sph_fugue512_context    fugue;
 } x13_4way_ctx_holder;

@@ -51,7 +51,7 @@ void init_x13_4way_ctx()
     sph_shavite512_init( &x13_4way_ctx.shavite );
     init_sd( &x13_4way_ctx.simd, 512 );
     init_echo( &x13_4way_ctx.echo, 512 );
-     sph_hamsi512_init( &x13_4way_ctx.hamsi );
+     hamsi512_4way_init( &x13_4way_ctx.hamsi );
     sph_fugue512_init( &x13_4way_ctx.fugue );
 };

@@ -85,7 +85,7 @@ void x13_4way_hash( void *state, const void *input )
     memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

-     // Parallel 4way
+     // Parallel 4way 64 bit
     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

     // 4 Skein
@@ -100,7 +100,7 @@ void x13_4way_hash( void *state, const void *input )
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );

-     // Serial to the end
+     // Serial
     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     // 7 Luffa
@@ -167,20 +167,13 @@ void x13_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

-     // 12 Hamsi
-     sph_hamsi512( &ctx.hamsi, hash0, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash0 );
-     memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
-     sph_hamsi512( &ctx.hamsi, hash1, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash1 );
-     memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
-     sph_hamsi512( &ctx.hamsi, hash2, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash2 );
-     memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
-     sph_hamsi512( &ctx.hamsi, hash3, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash3 );
+     // 12 Hamsi parallel 4way 32 bit
+     mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_close( &ctx.hamsi, vhash );
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

-     // 13 Fugue
+     // 13 Fugue serial
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
--- a/algo/x13/x13-gate.c
+++ b/algo/x13/x13-gate.c
@@ -11,7 +11,7 @@ bool register_x13_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x13;
  gate->hash      = (void*)&x13hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x13/x13-gate.h
+++ b/algo/x13/x13-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define X13_4WAY
 #endif

--- a/algo/x13/x13sm3-4way.c
+++ b/algo/x13/x13sm3-4way.c
@@ -1,6 +1,6 @@
 #include "x13sm3-gate.h"

-#if defined(__AVX2__) && defined(__AES__)
+#if defined(X13SM3_4WAY)

 #include <stdlib.h>
 #include <stdint.h>
@@ -18,7 +18,7 @@
 #include "algo/simd/sse2/nist.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/sm3/sm3-hash-4way.h"
-#include "algo/hamsi/sph_hamsi.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"

 typedef struct {
@@ -34,7 +34,7 @@ typedef struct {
    hashState_sd            simd;
    hashState_echo          echo;
    sm3_4way_ctx_t          sm3;
-    sph_hamsi512_context    hamsi;
+    hamsi512_4way_context   hamsi;
    sph_fugue512_context    fugue;
 } x13sm3_4way_ctx_holder;

@@ -55,7 +55,7 @@ void init_x13sm3_4way_ctx()
     init_sd( &x13sm3_4way_ctx.simd, 512 );
     init_echo( &x13sm3_4way_ctx.echo, 512 );
     sm3_4way_init( &x13sm3_4way_ctx.sm3 );
-     sph_hamsi512_init( &x13sm3_4way_ctx.hamsi );
+     hamsi512_4way_init( &x13sm3_4way_ctx.hamsi );
     sph_fugue512_init( &x13sm3_4way_ctx.fugue );
 };

@@ -174,7 +174,9 @@ void x13sm3_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

-     // SM3
+     mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     // SM3 parallel 32 bit
     uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
     memset( sm3_vhash, 0, sizeof sm3_vhash );
     uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
@@ -186,26 +188,16 @@ void x13sm3_4way_hash( void *state, const void *input )
     uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
     memset( sm3_hash3, 0, sizeof sm3_hash3 );

-     mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     sm3_4way( &ctx.sm3, vhash, 64 );
     sm3_4way_close( &ctx.sm3, sm3_vhash );
-     mm_deinterleave_4x32( sm3_hash0, sm3_hash1, sm3_hash2, sm3_hash3,
-                           sm3_vhash, 1024 );

-     // Hamsi
-     sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash0 );
-     memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
-     sph_hamsi512( &ctx.hamsi, sm3_hash1, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash1 );
-     memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
-     sph_hamsi512( &ctx.hamsi, sm3_hash2, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash2 );
-     memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
-     sph_hamsi512( &ctx.hamsi, sm3_hash3, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash3 );
+     // Hamsi parallel 32 bit
+     hamsi512_4way( &ctx.hamsi, sm3_vhash, 64 );
+     hamsi512_4way_close( &ctx.hamsi, vhash );

-     // Fugue
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     // Fugue serial
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
--- a/algo/x13/x13sm3-gate.c
+++ b/algo/x13/x13sm3-gate.c
@@ -11,7 +11,7 @@ bool register_x13sm3_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x13sm3;
  gate->hash      = (void*)&x13sm3_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x13/x13sm3-gate.h
+++ b/algo/x13/x13sm3-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define X13SM3_4WAY
 #endif

--- a/algo/x14/polytimos-4way.c
+++ b/algo/x14/polytimos-4way.c
@@ -1,6 +1,6 @@
 #include "polytimos-gate.h"

-#if defined(__AVX2__) && defined(__AES__)
+#if defined(POLYTIMOS_4WAY)

 #include <stdlib.h>
 #include <stdint.h>
--- a/algo/x14/polytimos-gate.c
+++ b/algo/x14/polytimos-gate.c
@@ -2,7 +2,7 @@

 bool register_polytimos_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
 #ifdef POLYTIMOS_4WAY
  init_polytimos_4way_ctx();
  gate->scanhash  = (void*)&scanhash_polytimos_4way;
--- a/algo/x14/veltor-gate.c
+++ b/algo/x14/veltor-gate.c
@@ -11,7 +11,7 @@ bool register_veltor_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_veltor;
  gate->hash      = (void*)&veltor_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x14/veltor-gate.h
+++ b/algo/x14/veltor-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define VELTOR_4WAY
 #endif

--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -1,6 +1,6 @@
 #include "x14-gate.h"

-#if defined(__AVX2__) && defined(__AES__)
+#if defined(X14_4WAY)

 #include <stdlib.h>
 #include <stdint.h>
@@ -18,7 +18,7 @@
 #include "algo/simd/sse2/nist.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/echo/sph_echo.h"
-#include "algo/hamsi/sph_hamsi.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"

@@ -34,7 +34,7 @@ typedef struct {
    sph_shavite512_context  shavite;
    hashState_sd            simd;
    hashState_echo          echo;
-    sph_hamsi512_context    hamsi;
+    hamsi512_4way_context   hamsi;
    sph_fugue512_context    fugue;
    shabal512_4way_context  shabal;
 } x14_4way_ctx_holder;
@@ -55,7 +55,7 @@ void init_x14_4way_ctx()
     sph_shavite512_init( &x14_4way_ctx.shavite );
     init_sd( &x14_4way_ctx.simd, 512 );
     init_echo( &x14_4way_ctx.echo, 512 );
-     sph_hamsi512_init( &x14_4way_ctx.hamsi );
+     hamsi512_4way_init( &x14_4way_ctx.hamsi );
     sph_fugue512_init( &x14_4way_ctx.fugue );
     shabal512_4way_init( &x14_4way_ctx.shabal );
 };
@@ -172,20 +172,13 @@ void x14_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

-     // 12 Hamsi
-     sph_hamsi512( &ctx.hamsi, hash0, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash0 );
-     memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
-     sph_hamsi512( &ctx.hamsi, hash1, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash1 );
-     memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
-     sph_hamsi512( &ctx.hamsi, hash2, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash2 );
-     memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
-     sph_hamsi512( &ctx.hamsi, hash3, 64 );
-     sph_hamsi512_close( &ctx.hamsi, hash3 );
+     // 12 Hamsi parallel 4way 32 bit
+     mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_close( &ctx.hamsi, vhash );
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

-     // 13 Fugue
+     // 13 Fugue serial
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
--- a/algo/x14/x14-gate.c
+++ b/algo/x14/x14-gate.c
@@ -11,7 +11,7 @@ bool register_x14_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x14;
  gate->hash      = (void*)&x14hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x14/x14-gate.h
+++ b/algo/x14/x14-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define X14_4WAY
 #endif

--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -1,6 +1,6 @@
 #include "x15-gate.h"

-#if defined(__AVX2__) && defined(__AES__)
+#if defined(X15_4WAY)

 #include <stdlib.h>
 #include <stdint.h>
@@ -18,7 +18,8 @@
 #include "algo/simd/sse2/nist.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/echo/sph_echo.h"
-#include "algo/hamsi/sph_hamsi.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
+//#include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
@@ -35,7 +36,8 @@ typedef struct {
    sph_shavite512_context  shavite;
    hashState_sd            simd;
    hashState_echo          echo;
-    sph_hamsi512_context    hamsi;
+    hamsi512_4way_context   hamsi;
+//    sph_hamsi512_context    hamsi;
    sph_fugue512_context    fugue;
    shabal512_4way_context  shabal;
    sph_whirlpool_context   whirlpool;
@@ -56,7 +58,8 @@ void init_x15_4way_ctx()
     sph_shavite512_init( &x15_4way_ctx.shavite );
     init_sd( &x15_4way_ctx.simd, 512 );
     init_echo( &x15_4way_ctx.echo, 512 );
-     sph_hamsi512_init( &x15_4way_ctx.hamsi );
+     hamsi512_4way_init( &x15_4way_ctx.hamsi );
+//     sph_hamsi512_init( &x15_4way_ctx.hamsi );
     sph_fugue512_init( &x15_4way_ctx.fugue );
     shabal512_4way_init( &x15_4way_ctx.shabal );
     sph_whirlpool_init( &x15_4way_ctx.whirlpool );
@@ -174,6 +177,12 @@ void x15_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

+     // 12 Hamsi parallel 4way 32 bit
+     mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_close( &ctx.hamsi, vhash );
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+/*
     // 12 Hamsi
     sph_hamsi512( &ctx.hamsi, hash0, 64 );
     sph_hamsi512_close( &ctx.hamsi, hash0 );
@@ -186,7 +195,7 @@ void x15_4way_hash( void *state, const void *input )
     memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
     sph_hamsi512( &ctx.hamsi, hash3, 64 );
     sph_hamsi512_close( &ctx.hamsi, hash3 );
-
+*/
     // 13 Fugue
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
--- a/algo/x15/x15-gate.c
+++ b/algo/x15/x15-gate.c
@@ -11,7 +11,7 @@ bool register_x15_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x15;
  gate->hash      = (void*)&x15hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  return true;
 };

--- a/algo/x15/x15-gate.h
+++ b/algo/x15/x15-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
  #define X15_4WAY
 #endif

--- a/algo/x17/x16r-4way.c
+++ b/algo/x17/x16r-4way.c
@@ -0,0 +1,396 @@
+/**
+ * x16r algo implementation
+ *
+ * Implementation by tpruvot@github Jan 2018
+ * Optimized by JayDDee@github Jan 2018
+ */
+#include "x16r-gate.h"
+
+#if defined (X16R_4WAY)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/simd/sse2/nist.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/shabal-hash-4way.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include "algo/sha/sha2-hash-4way.h"
+
+// Per-thread cache key for the current hash order. UINT32_MAX is the initial
+// "nothing computed yet" sentinel tested by x16r_4way_hash(); the scanhash
+// routine in this file overwrites it after deriving a new order.
+static __thread uint32_t s_ntime = UINT32_MAX;
+// Per-thread, NUL-terminated hash order string filled in by
+// x16r_getAlgoString(): one character per round, read by x16r_4way_hash()
+// to select the function run in that round.
+static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
+
+
+// One context per hash function that x16r_4way_hash() can select.
+// In that function the blake, bmw, skein, jh, keccak and sha512 contexts are
+// fed 4x64 interleaved data, hamsi and shabal are fed 4x32 interleaved data,
+// and the remaining contexts are reused serially, once per lane.
+typedef struct {
+    blake512_4way_context   blake;      // 4x64 interleaved
+    bmw512_4way_context     bmw;        // 4x64 interleaved
+    hashState_echo          echo;       // serial, one lane at a time
+    hashState_groestl       groestl;    // serial, one lane at a time
+    skein512_4way_context   skein;      // 4x64 interleaved
+    jh512_4way_context      jh;         // 4x64 interleaved
+    keccak512_4way_context  keccak;     // 4x64 interleaved
+    hashState_luffa         luffa;      // serial, one lane at a time
+    cubehashParam           cube;       // serial; only member set up by init_x16r_4way_ctx()
+    sph_shavite512_context  shavite;    // serial, one lane at a time
+    hashState_sd            simd;       // serial, one lane at a time
+    hamsi512_4way_context   hamsi;      // 4x32 interleaved
+    sph_fugue512_context    fugue;      // serial, one lane at a time
+    shabal512_4way_context  shabal;     // 4x32 interleaved
+    sph_whirlpool_context   whirlpool;  // serial, one lane at a time
+    sha512_4way_context     sha512;     // 4x64 interleaved
+} x16r_4way_ctx_holder;
+
+// Global template context. Only its cube member is initialised in this file
+// (see init_x16r_4way_ctx()).
+x16r_4way_ctx_holder x16r_4way_ctx __attribute__ ((aligned (64)));
+
+// One-time setup of the global template context.
+// CubeHash gets its single full initialisation here so that the hash loop
+// can rely on the cheaper reinit call afterwards. The arguments 512, 16, 32
+// are presumably digest bits, rounds and block bytes -- confirm against the
+// cubehash_sse2 header.
+void init_x16r_4way_ctx()
+{
+   cubehashParam *cube = &x16r_4way_ctx.cube;
+
+   cubehashInit( cube, 512, 16, 32 );
+};
+
+
+/*
+ * Compute the 16-round X16R chain for four inputs at once.
+ *
+ * input  : four 80-byte headers in 4x64 interleaved form (640 bits per lane).
+ * output : receives the first 32 bytes of each lane's final 64-byte hash,
+ *          stored back to back (lane k at byte offset 32*k, 128 bytes total).
+ *
+ * The function run in round i is chosen by hashOrder[i], a hex digit
+ * ('0'-'9' -> 0-9, 'A'-'F' -> 10-15). hashOrder is derived here from the
+ * input only while s_ntime still holds its UINT32_MAX sentinel; otherwise
+ * the order previously stored by the scanhash routine is used.
+ *
+ * Requires init_x16r_4way_ctx() to have run: the CubeHash parameters are
+ * copied from the global template it initialises.
+ */
+void x16r_4way_hash( void* output, const void* input )
+{
+   uint32_t hash0[16] __attribute__ ((aligned (64)));
+   uint32_t hash1[16] __attribute__ ((aligned (64)));
+   uint32_t hash2[16] __attribute__ ((aligned (64)));
+   uint32_t hash3[16] __attribute__ ((aligned (64)));
+   uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+   uint32_t inp0[24] __attribute__ ((aligned (64)));
+   uint32_t inp1[24] __attribute__ ((aligned (64)));
+   uint32_t inp2[24] __attribute__ ((aligned (64)));
+   uint32_t inp3[24] __attribute__ ((aligned (64)));
+
+   x16r_4way_ctx_holder ctx;
+
+   // ctx is a fresh local, so cubehashReinit() below would otherwise act on
+   // a context that never received a full init. Seed the CubeHash member
+   // from the template that init_x16r_4way_ctx() set up.
+   memcpy( &ctx.cube, &x16r_4way_ctx.cube, sizeof ctx.cube );
+
+   void *in0 = (void*) inp0;
+   void *in1 = (void*) inp1;
+   void *in2 = (void*) inp2;
+   void *in3 = (void*) inp3;
+   int size = 80;
+
+   mm256_deinterleave_4x64( inp0, inp1, inp2, inp3, input, 640 );
+
+   if ( s_ntime == UINT32_MAX )
+   {
+      // No hash order cached yet: derive it from lane 0, starting at byte 4.
+      const uint8_t* tmp = (uint8_t*) inp0;
+      x16r_getAlgoString( &tmp[4], hashOrder );
+   }
+
+   // Input data is available both 64 bit interleaved (input) and
+   // deinterleaved (inp0-3).
+   // If the first function uses 64 bit interleaved data it can consume
+   // input directly, no interleaving of inp0-3 is required.
+   // All other rounds assume their data is deinterleaved in hash0-3.
+   // Every round must exit with its result deinterleaved in hash0-3.
+   // The aliases in0-3 point to inp0-3 for the first round and to hash0-3
+   // afterwards; size is switched from 80 to 64 bytes accordingly.
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         case BLAKE:
+            blake512_4way_init( &ctx.blake );
+            if ( i == 0 )
+               blake512_4way( &ctx.blake, input, size );
+            else
+            {
+               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               blake512_4way( &ctx.blake, vhash, size );
+            }
+            blake512_4way_close( &ctx.blake, vhash );
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case BMW:
+            bmw512_4way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_4way( &ctx.bmw, input, size );
+            else
+            {
+               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               bmw512_4way( &ctx.bmw, vhash, size );
+            }
+            bmw512_4way_close( &ctx.bmw, vhash );
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case GROESTL:
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                 (const char*)in0, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                 (const char*)in1, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                 (const char*)in2, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                 (const char*)in3, size<<3 );
+         break;
+         case SKEIN:
+            skein512_4way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_4way( &ctx.skein, input, size );
+            else
+            {
+               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               skein512_4way( &ctx.skein, vhash, size );
+            }
+            skein512_4way_close( &ctx.skein, vhash );
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case JH:
+            jh512_4way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_4way( &ctx.jh, input, size );
+            else
+            {
+               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               jh512_4way( &ctx.jh, vhash, size );
+            }
+            jh512_4way_close( &ctx.jh, vhash );
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case KECCAK:
+            keccak512_4way_init( &ctx.keccak );
+            if ( i == 0 )
+               keccak512_4way( &ctx.keccak, input, size );
+            else
+            {
+               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               keccak512_4way( &ctx.keccak, vhash, size );
+            }
+            keccak512_4way_close( &ctx.keccak, vhash );
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case LUFFA:
+            init_luffa( &ctx.luffa, 512 );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
+                                          (const BitSequence*)in0, size );
+            init_luffa( &ctx.luffa, 512 );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
+                                          (const BitSequence*)in1, size );
+            init_luffa( &ctx.luffa, 512 );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
+                                          (const BitSequence*)in2, size );
+            init_luffa( &ctx.luffa, 512 );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
+                                          (const BitSequence*)in3, size );
+         break;
+         case CUBEHASH:
+            // Fast reinit only; the parameters were seeded from the
+            // template at the top of this function.
+            cubehashReinit( &ctx.cube );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
+                                  (const byte*)in0, size );
+            cubehashReinit( &ctx.cube );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
+                                  (const byte*)in1, size );
+            cubehashReinit( &ctx.cube );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
+                                  (const byte*)in2, size );
+            cubehashReinit( &ctx.cube );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
+                                        (const byte*)in3, size );
+         break;
+         case SHAVITE:
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in0, size );
+            sph_shavite512_close( &ctx.shavite, hash0 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in1, size );
+            sph_shavite512_close( &ctx.shavite, hash1 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in2, size );
+            sph_shavite512_close( &ctx.shavite, hash2 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in3, size );
+            sph_shavite512_close( &ctx.shavite, hash3 );
+         break;
+         case SIMD:
+             init_sd( &ctx.simd, 512 );
+             update_final_sd( &ctx.simd, (BitSequence *)hash0,
+                              (const BitSequence*)in0, size<<3 );
+             init_sd( &ctx.simd, 512 );
+             update_final_sd( &ctx.simd, (BitSequence *)hash1,
+                              (const BitSequence*)in1, size<<3 );
+             init_sd( &ctx.simd, 512 );
+             update_final_sd( &ctx.simd, (BitSequence *)hash2,
+                              (const BitSequence*)in2, size<<3 );
+             init_sd( &ctx.simd, 512 );
+             update_final_sd( &ctx.simd, (BitSequence *)hash3,
+                              (const BitSequence*)in3, size<<3 );
+         break;
+         case ECHO:
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
+                                (const BitSequence*)in0, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
+                                (const BitSequence*)in1, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
+                                (const BitSequence*)in2, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
+                                (const BitSequence*)in3, size<<3 );
+         break;
+         case HAMSI:
+             mm_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 );
+             hamsi512_4way_init( &ctx.hamsi );
+             hamsi512_4way( &ctx.hamsi, vhash, size );
+             hamsi512_4way_close( &ctx.hamsi, vhash );
+             mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case FUGUE:
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in0, size );
+             sph_fugue512_close( &ctx.fugue, hash0 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in1, size );
+             sph_fugue512_close( &ctx.fugue, hash1 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in2, size );
+             sph_fugue512_close( &ctx.fugue, hash2 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in3, size );
+             sph_fugue512_close( &ctx.fugue, hash3 );
+         break;
+         case SHABAL:
+             mm_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 );
+             shabal512_4way_init( &ctx.shabal );
+             shabal512_4way( &ctx.shabal, vhash, size );
+             shabal512_4way_close( &ctx.shabal, vhash );
+             mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case WHIRLPOOL:
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in0, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash0 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in1, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash1 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in2, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash2 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in3, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash3 );
+         break;
+         case SHA_512:
+             mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             sha512_4way_init( &ctx.sha512 );
+             sha512_4way( &ctx.sha512, vhash, size );
+             sha512_4way_close( &ctx.sha512, vhash );
+             mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         default:
+            // Not a hex digit: no function runs for this round, exactly as
+            // before this case was spelled out.
+         break;
+      }
+      // Every later round consumes the 64-byte result of the previous one.
+      in0 = (void*) hash0;
+      in1 = (void*) hash1;
+      in2 = (void*) hash2;
+      in3 = (void*) hash3;
+      size = 64;
+   }
+
+   // Byte-wise addressing of output; arithmetic directly on void* is a
+   // compiler extension, not standard C.
+   uint8_t *out = (uint8_t*) output;
+   memcpy( out,      hash0, 32 );
+   memcpy( out + 32, hash1, 32 );
+   memcpy( out + 64, hash2, 32 );
+   memcpy( out + 96, hash3, 32 );
+}
+
+/*
+ * Scan nonces four at a time, starting at work->data[19].
+ *
+ * Each iteration hashes nonces n..n+3 with x16r_4way_hash() and tests every
+ * lane's own 32-byte result against the target. For a winning lane k,
+ * work->nfound[k] is set, work->nonces[k] receives n+k and the target ratio
+ * is recorded from that lane's hash.
+ *
+ * The loop ends when a group produced at least one hit, n reaches max_nonce
+ * or the thread's restart flag is raised. work->data[19] is left holding the
+ * base nonce of the last group hashed.
+ *
+ * Returns the number of winning lanes; *hashes_done is set to
+ * n - first_nonce + 1.
+ */
+int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done )
+{
+   uint32_t hash[4*16] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   // NOTE(review): Htarg is captured before the benchmark override of
+   // ptarget[7] further down; confirm that is intended.
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   // Position of header word 19 (the nonce) for each lane inside the 4x64
+   // interleaved buffer.
+   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+   uint32_t *noncep1 = vdata + 75;
+   uint32_t *noncep2 = vdata + 77;
+   uint32_t *noncep3 = vdata + 79;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   // Encode all 20 header words so the interleave below never reads an
+   // uninitialised endiandata[19]. The nonce slots are overwritten per lane
+   // before every hash, so the hashed data is unchanged.
+   for ( int k=0; k < 20; k++ )
+      be32enc( &endiandata[k], pdata[k] );
+
+   // NOTE(review): s_ntime stores the byte-swapped word but is compared with
+   // the unswapped pdata[17], so this branch is taken on almost every call.
+   // That is only redundant work; the comparison is left as it was because
+   // the order is derived from endiandata[1..], not from ntime.
+   if ( s_ntime != pdata[17] )
+   {
+      uint32_t ntime = swab32(pdata[17]);
+      x16r_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   // Replicate the same header into all four lanes.
+   uint64_t *edata = (uint64_t*)endiandata;
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   do
+   {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep0, n   );
+      be32enc( noncep1, n+1 );
+      be32enc( noncep2, n+2 );
+      be32enc( noncep3, n+3 );
+      x16r_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      // x16r_4way_hash() stores lane k's 32-byte result at hash + 8*k.
+      // Each lane is validated against its own hash, including the full
+      // target test.
+      for ( int lane = 0; lane < 4; lane++ )
+      {
+         uint32_t *lane_hash = hash + ( lane << 3 );
+
+         if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
+         {
+            found[lane] = true;
+            num_found++;
+            nonces[lane] = n + lane;
+            work_set_target_ratio( work, lane_hash );
+         }
+      }
+      n += 4;
+   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/x17/x16r-gate.c
+++ b/algo/x17/x16r-gate.c
@@ -0,0 +1,35 @@
+#include "x16r-gate.h"
+
+/*
+ * Build the hash order string from the leading bytes of prevblock.
+ *
+ * Character j is the upper-case hex digit of one nibble of prevblock:
+ * byte index (15 - j) >> 1, high nibble for even j, low nibble for odd j.
+ * With X16R_HASH_FUNC_COUNT equal to 16 (presumed from the 15 in that
+ * expression -- confirm in x16r-gate.h) bytes 7 down to 0 are read.
+ *
+ * output must have room for X16R_HASH_FUNC_COUNT characters plus the
+ * terminating NUL.
+ */
+void x16r_getAlgoString( const uint8_t* prevblock, char *output )
+{
+   // Table lookup replaces one sprintf call per character; the resulting
+   // string is identical.
+   static const char hexDigits[] = "0123456789ABCDEF";
+
+   for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
+   {
+      const uint8_t b = (15 - j) >> 1; // 16 first ascii hex chars (lsb in uint256)
+      const uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
+      output[j] = hexDigits[algoDigit];
+   }
+   output[X16R_HASH_FUNC_COUNT] = '\0';
+}
+
+
+// Fill in the algo gate for x16r.
+// The optimisation flags and the alternate set_target hook are the same for
+// both builds; the scanhash and hash entry points (and the matching context
+// init) depend on whether X16R_4WAY is defined. Always returns true.
+bool register_x16r_algo( algo_gate_t* gate )
+{
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->set_target = (void*)&alt_set_target;
+#if !defined (X16R_4WAY)
+  init_x16r_ctx();
+  gate->scanhash  = (void*)&scanhash_x16r;
+  gate->hash      = (void*)&x16r_hash;
+#else
+  init_x16r_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_x16r_4way;
+  gate->hash      = (void*)&x16r_4way_hash;
+#endif
+  return true;
+};
+
--- a/Show More
+++ b/Show More