Jay D Dee
2018-01-16 15:11:44 -05:00
parent bee78eac76
commit a90d75b8f5
77 changed files with 3408 additions and 1214 deletions

View File

@@ -46,8 +46,10 @@ cpuminer_SOURCES = \
   algo/blake/sph_blake2b.c \
   algo/blake/blake2b.c \
   algo/blake/blake2s.c \
+  algo/blake/blakecoin-gate.c \
   algo/blake/mod_blakecoin.c \
   algo/blake/blakecoin.c \
+  algo/blake/blakecoin-4way.c \
   algo/blake/decred-gate.c \
   algo/blake/decred.c \
   algo/blake/decred-4way.c \
@@ -99,13 +101,17 @@ cpuminer_SOURCES = \
   algo/luffa/sse2/luffa_for_sse2.c \
   algo/lyra2/lyra2.c \
   algo/lyra2/sponge.c \
+  algo/lyra2/lyra2rev2-gate.c \
   algo/lyra2/lyra2rev2.c \
+  algo/lyra2/lyra2rev2-4way.c \
   algo/lyra2/lyra2re.c \
   algo/lyra2/lyra2z-gate.c \
   algo/lyra2/lyra2z.c \
   algo/lyra2/lyra2z-4way.c \
   algo/lyra2/lyra2z330.c \
+  algo/lyra2/lyra2h-gate.c \
   algo/lyra2/lyra2h.c \
+  algo/lyra2/lyra2h-4way.c \
   algo/m7m.c \
   algo/neoscrypt/neoscrypt.c \
   algo/nist5/nist5-gate.c \
@@ -113,7 +119,9 @@ cpuminer_SOURCES = \
   algo/nist5/nist5.c \
   algo/nist5/zr5.c \
   algo/pluck.c \
+  algo/quark/quark-gate.c \
   algo/quark/quark.c \
+  algo/quark/quark-4way.c \
   algo/qubit/qubit.c \
   algo/qubit/deep.c \
   algo/ripemd/sph_ripemd.c \
@@ -140,9 +148,8 @@ cpuminer_SOURCES = \
   algo/skein/skein2-4way.c \
   algo/skein/skein2-gate.c \
   algo/sm3/sm3.c \
+  algo/sm3/sm3-hash-4way.c \
   algo/tiger/sph_tiger.c \
-  algo/timetravel.c \
-  algo/timetravel10.c \
   algo/whirlpool/sph_whirlpool.c \
   algo/whirlpool/whirlpool-hash-4way.c \
   algo/whirlpool/whirlpool-gate.c \
@@ -161,8 +168,16 @@ cpuminer_SOURCES = \
   algo/x11/tribus-gate.c \
   algo/x11/tribus.c \
   algo/x11/tribus-4way.c \
+  algo/x11/timetravel-gate.c \
+  algo/x11/timetravel.c \
+  algo/x11/timetravel-4way.c \
+  algo/x11/timetravel10-gate.c \
+  algo/x11/timetravel10.c \
+  algo/x11/timetravel10-4way.c \
   algo/x11/fresh.c \
   algo/x11/x11evo.c \
+  algo/x11/x11evo-4way.c \
+  algo/x11/x11evo-gate.c \
   algo/x13/x13-gate.c \
   algo/x13/x13.c \
   algo/x13/x13-4way.c \

View File

@@ -165,6 +165,13 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.7.10
+
+4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10,
+x11evo, blakecoin.
+Faster x13sm3 (hsr).
+Added share difficulty to accepted message.
+
 v3.7.9

 Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.

View File

@@ -1,31 +1,22 @@
#include "blake-gate.h" #include "blake-gate.h"
#include "sph_blake.h"
#if defined (__AVX__)
#include "blake-hash-4way.h" #include "blake-hash-4way.h"
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
#include <memory.h> #include <memory.h>
#if defined (BLAKE_4WAY) blake256r14_4way_context blake_ctx;
void blakehash_4way(void *state, const void *input) void blakehash_4way(void *state, const void *input)
{ {
uint32_t vhash[4*4] __attribute__ ((aligned (64))); uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32))); blake256r14_4way_context ctx;
uint32_t hash1[4] __attribute__ ((aligned (32))); memcpy( &ctx, &blake_ctx, sizeof ctx );
uint32_t hash2[4] __attribute__ ((aligned (32))); blake256r14_4way( &ctx, input + (64<<2), 16 );
uint32_t hash3[4] __attribute__ ((aligned (32))); blake256r14_4way_close( &ctx, vhash );
blake256_4way_context ctx; mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 16 );
blake256_4way_close( &ctx, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash1, 32 );
memcpy( state+96, hash1, 32 );
} }
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -36,21 +27,24 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t first_nonce = pdata[19];
-// uint32_t HTarget = ptarget[7];
+   uint32_t HTarget = ptarget[7];
    uint32_t _ALIGN(32) edata[20];
    uint32_t n = first_nonce;
    uint32_t *nonces = work->nonces;
    bool *found = work->nfound;
    int num_found = 0;

-// if (opt_benchmark)
-//    HTarget = 0x7f;
+   if (opt_benchmark)
+      HTarget = 0x7f;

    // we need big endian data...
    swab32_array( edata, pdata, 20 );

    mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

+   blake256r14_4way_init( &blake_ctx );
+   blake256r14_4way( &blake_ctx, vdata, 64 );
+
    uint32_t *noncep = vdata + 76;   // 19*4

    do {
       found[0] = found[1] = found[2] = found[3] = false;
@@ -61,45 +55,36 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
       blakehash_4way( hash, vdata );

-      if ( hash[7] == 0 )
-      {
-         if ( fulltest( hash, ptarget ) )
-         {
+      if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      {
          found[0] = true;
          num_found++;
          nonces[0] = n;
          pdata[19] = n;
+         work_set_target_ratio( work, hash );
       }
-      }
-      if ( (hash+8)[7] == 0 )
-      {
-         if ( fulltest( hash+8, ptarget ) )
-         {
+      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
+      {
          found[1] = true;
          num_found++;
          nonces[1] = n+1;
+         work_set_target_ratio( work, hash+8 );
       }
-      }
-      if ( (hash+16)[7] == 0 )
-      {
-         if ( fulltest( hash+8, ptarget ) )
-         {
+      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
+      {
          found[2] = true;
          num_found++;
          nonces[2] = n+2;
+         work_set_target_ratio( work, hash+16 );
       }
-      }
-      if ( (hash+24)[7] == 0 )
-      {
-         if ( fulltest( hash+8, ptarget ) )
-         {
+      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
+      {
          found[3] = true;
          num_found++;
          nonces[3] = n+3;
+         work_set_target_ratio( work, hash+24 );
       }
-      }
       n += 4;
-      *hashes_done = n - first_nonce + 1;
   } while ( (num_found == 0) && (n < max_nonce)
             && !work_restart[thr_id].restart );

View File

@@ -491,14 +491,9 @@ do { \
    (state)->T1 = T1; \
 } while (0)

-//#define BLAKE32_ROUNDS 8
-#ifndef BLAKE32_ROUNDS
-#define BLAKE32_ROUNDS 14
-#endif
-
 #if SPH_COMPACT_BLAKE_32

-#define COMPRESS32_4WAY do { \
+#define COMPRESS32_4WAY( rounds ) do { \
    __m128i M[16]; \
    __m128i V0, V1, V2, V3, V4, V5, V6, V7; \
    __m128i V8, V9, VA, VB, VC, VD, VE, VF; \
@@ -539,7 +534,7 @@ do { \
    M[0xD] = mm_byteswap_32( *(buf + 13) ); \
    M[0xE] = mm_byteswap_32( *(buf + 14) ); \
    M[0xF] = mm_byteswap_32( *(buf + 15) ); \
-   for (r = 0; r < BLAKE32_ROUNDS; r ++) \
+   for (r = 0; r < rounds; r ++) \
       ROUND_S_4WAY(r); \
    H0 = _mm_xor_si128( _mm_xor_si128( \
         _mm_xor_si128( S0, V0 ), V8 ), H0 ); \
@@ -563,7 +558,8 @@ do { \
 // current impl

-#define COMPRESS32_4WAY do { \
+#define COMPRESS32_4WAY( rounds ) \
+do { \
    __m128i M0, M1, M2, M3, M4, M5, M6, M7; \
    __m128i M8, M9, MA, MB, MC, MD, ME, MF; \
    __m128i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -580,14 +576,10 @@ do { \
    V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
    VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
    VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
-   VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                       _mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
-   VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                       _mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
-   VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
-                       _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
-   VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
-                       _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
+   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
+   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
+   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
+   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
    M0 = mm_byteswap_32( * buf ); \
    M1 = mm_byteswap_32( *(buf+1) ); \
    M2 = mm_byteswap_32( *(buf+2) ); \
@@ -612,7 +604,8 @@ do { \
    ROUND_S_4WAY(5); \
    ROUND_S_4WAY(6); \
    ROUND_S_4WAY(7); \
-   if (BLAKE32_ROUNDS == 14) { \
+   if (rounds == 14) \
+   { \
       ROUND_S_4WAY(8); \
       ROUND_S_4WAY(9); \
       ROUND_S_4WAY(0); \
@@ -620,22 +613,14 @@ do { \
       ROUND_S_4WAY(2); \
       ROUND_S_4WAY(3); \
    } \
-   H0 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
-   H1 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
-   H2 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
-   H3 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
-   H4 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
-   H5 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
-   H6 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
-   H7 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
+   H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
+   H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
+   H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
+   H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
+   H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
+   H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
+   H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
+   H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
 } while (0)

 #endif
@@ -832,7 +817,7 @@ static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
 static void
 blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
-                   const sph_u32 *salt )
+                   const sph_u32 *salt, int rounds )
 {
    int i;
    for ( i = 0; i < 8; i++ )
@@ -841,6 +826,7 @@ blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
       sc->S[i] = _mm_set1_epi32( salt[i] );
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
+   sc->rounds = rounds;
 }

 static void
@@ -878,7 +864,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
    {
       if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
          T1 = SPH_T32(T1 + 1);
-      COMPRESS32_4WAY;
+      COMPRESS32_4WAY( sc->rounds );
       ptr = 0;
    }
 }
@@ -1079,10 +1065,11 @@ blake64_4way_close( blake_4way_big_context *sc,
 #endif

+// default 14 rounds, backward compatibility
 void
 blake256_4way_init(void *cc)
 {
-   blake32_4way_init( cc, IV256, salt_zero_small );
+   blake32_4way_init( cc, IV256, salt_zero_small, 14 );
 }

 void
@@ -1094,13 +1081,43 @@ blake256_4way(void *cc, const void *data, size_t len)
 void
 blake256_4way_close(void *cc, void *dst)
 {
-   blake256_4way_addbits_and_close(cc, 0, 0, dst);
+   blake32_4way_close(cc, 0, 0, dst, 8);
+}
+
+// 14 rounds blake, decred
+void blake256r14_4way_init(void *cc)
+{
+   blake32_4way_init( cc, IV256, salt_zero_small, 14 );
 }

 void
-blake256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+blake256r14_4way(void *cc, const void *data, size_t len)
 {
-   blake32_4way_close(cc, ub, n, dst, 8);
+   blake32_4way(cc, data, len);
+}
+
+void
+blake256r14_4way_close(void *cc, void *dst)
+{
+   blake32_4way_close(cc, 0, 0, dst, 8);
+}
+
+// 8 rounds blakecoin, vanilla
+void blake256r8_4way_init(void *cc)
+{
+   blake32_4way_init( cc, IV256, salt_zero_small, 8 );
+}
+
+void
+blake256r8_4way(void *cc, const void *data, size_t len)
+{
+   blake32_4way(cc, data, len);
+}
+
+void
+blake256r8_4way_close(void *cc, void *dst)
+{
+   blake32_4way_close(cc, 0, 0, dst, 8);
 }

 #if defined (__AVX2__)

View File

@@ -35,7 +35,9 @@
 */

 #ifndef __BLAKE_HASH_4WAY__
-#define __BLAKE_HASH_4WAY___
+#define __BLAKE_HASH_4WAY__
+
+#ifdef __AVX__

 #ifdef __cplusplus
 extern "C"{
@@ -45,38 +47,36 @@ extern "C"{
 #include "algo/sha/sph_types.h"
 #include "avxdefs.h"

-/**
- * Output size (in bits) for BLAKE-256.
- */
 #define SPH_SIZE_blake256   256

-#if SPH_64
-/**
- * Output size (in bits) for BLAKE-512.
- */
 #define SPH_SIZE_blake512   512
-#endif

-#ifdef __AVX__
 typedef struct {
    __m128i buf[16] __attribute__ ((aligned (64)));
    __m128i H[8];
    __m128i S[4];
    size_t ptr;
    sph_u32 T0, T1;
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
 } blake_4way_small_context;

+// Default 14 rounds
 typedef blake_4way_small_context blake256_4way_context;
 void blake256_4way_init(void *cc);
 void blake256_4way(void *cc, const void *data, size_t len);
 void blake256_4way_close(void *cc, void *dst);
-void blake256_4way_addbits_and_close(
-   void *cc, unsigned ub, unsigned n, void *dst);
-#endif
+
+// 14 rounds, blake, decred
+typedef blake_4way_small_context blake256r14_4way_context;
+void blake256r14_4way_init(void *cc);
+void blake256r14_4way(void *cc, const void *data, size_t len);
+void blake256r14_4way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_4way_small_context blake256r8_4way_context;
+void blake256r8_4way_init(void *cc);
+void blake256r8_4way(void *cc, const void *data, size_t len);
+void blake256r8_4way_close(void *cc, void *dst);

 #ifdef __AVX2__
@@ -103,3 +103,5 @@ void blake512_4way_addbits_and_close(
 #endif

 #endif
+
+#endif
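
Note: a sketch of driving the new fixed-round contexts declared above. It
assumes the mm_interleave_4x32 / mm_deinterleave_4x32 helpers behave as they
are used in blake-4way.c (last argument is a size in bits); len passed to the
update function is bytes per lane:

#include <stdint.h>
#include "blake-hash-4way.h"   // declarations shown above; avxdefs.h provides the interleave helpers

// Hash four independent 80-byte inputs at once with the 8-round variant.
void blake256r8_hash4( uint32_t *out0, uint32_t *out1, uint32_t *out2,
                       uint32_t *out3, const uint32_t in[4][20] )
{
   uint32_t vin [20*4] __attribute__ ((aligned (64)));   // interleaved input
   uint32_t vout[ 8*4] __attribute__ ((aligned (64)));   // interleaved output
   blake256r8_4way_context ctx;

   mm_interleave_4x32( vin, in[0], in[1], in[2], in[3], 640 );  // 640 bits/lane
   blake256r8_4way_init( &ctx );
   blake256r8_4way( &ctx, vin, 80 );
   blake256r8_4way_close( &ctx, vout );
   mm_deinterleave_4x32( out0, out1, out2, out3, vout, 256 );   // 256-bit digests
}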

algo/blake/blakecoin-4way.c (new file, 106 lines)
View File

@@ -0,0 +1,106 @@
#include "blakecoin-gate.h"
#if defined (__AVX__)
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
blake256r8_4way_context blakecoin_ctx;
void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context ctx;
memcpy( &ctx, &blakecoin_ctx, sizeof ctx );
blake256r8_4way( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
if (opt_benchmark)
HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r8_4way_init( &blakecoin_ctx );
blake256r8_4way( &blakecoin_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
blakecoin_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
// workaround to prevent a flood of hash reports when the nonce range is
// exhausted and the thread is spinning waiting for new work
if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
{
*hashes_done = 0;
sleep(1);
}
return num_found;
}
#endif
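
Note: the `hash[7] <= HTarget` tests above are a cheap 32-bit prefilter. The
256-bit hash and target are stored little-endian as eight uint32s, so word 7
is the most significant; if it already exceeds the target's word 7 the share
cannot qualify and the full 256-bit fulltest() comparison is skipped. A
minimal statement of that invariant (hypothetical helper name):

#include <stdint.h>

// Returns 0 only when hash > target is certain from the top word alone;
// returns 1 when the hash might qualify and fulltest() must decide.
static inline int maybe_share( const uint32_t hash[8], const uint32_t target[8] )
{
   return hash[7] <= target[7];
}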

View File

@@ -0,0 +1,71 @@
#include "blakecoin-gate.h"
#include <memory.h>
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blakecoin_get_max64 ()
{
return 0x7ffffLL;
// return 0x3fffffLL;
}
// Blakecoin 4-way hashes so fast that it runs out of nonces.
// This is an attempt to solve that, but the result may be to rehash
// old nonces until new work is received.
void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr, bool clean_job )
{
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
//
// if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
// algo_gate.stratum_gen_work( &stratum, g_work );
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
|| ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) && clean_job )
/*
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) ) )
*/
{
work_free( work );
work_copy( work, g_work );
*nonceptr = 0xffffffffU / opt_n_threads * thr_id;
if ( opt_randomize )
*nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
*end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
// try incrementing the xnonce to change the data
// for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ );
}
else
++(*nonceptr);
}
// vanilla uses default gen merkle root, otherwise identical to blakecoin
bool register_vanilla_algo( algo_gate_t* gate )
{
#if defined(BLAKECOIN_4WAY)
// four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_blakecoin_4way;
gate->hash = (void*)&blakecoin_4way_hash;
// gate->get_new_work = (void*)&bc4w_get_new_work;
// blakecoin_4way_init( &blake_4way_init_ctx );
#else
gate->scanhash = (void*)&scanhash_blakecoin;
gate->hash = (void*)&blakecoinhash;
// blakecoin_init( &blake_init_ctx );
#endif
gate->optimizations = AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&blakecoin_get_max64;
return true;
}
bool register_blakecoin_algo( algo_gate_t* gate )
{
register_vanilla_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
}
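
Note: bc4w_get_new_work above splits the 32-bit nonce space evenly across
threads, with a 0x20 safety margin at the top of each range. A runnable worked
example of that arithmetic (nthreads stands in for opt_n_threads):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const uint32_t nthreads = 4;
   for ( uint32_t t = 0; t < nthreads; t++ )
   {
      uint32_t start = 0xffffffffU / nthreads * t;
      uint32_t end   = 0xffffffffU / nthreads * (t+1) - 0x20;
      printf( "thread %u: %08x .. %08x\n", t, start, end );
   }
   // prints 0x00000000..0x3fffffdf, 0x3fffffff..0x7fffffde,
   //        0x7ffffffe..0xbfffffdd, 0xbffffffd..0xffffffdc
   return 0;
}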

View File

@@ -0,0 +1,21 @@
#ifndef __BLAKECOIN_GATE_H__
#define __BLAKECOIN_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#define BLAKECOIN_4WAY
#endif
#if defined (BLAKECOIN_4WAY)
void blakecoin_4way_hash(void *state, const void *input);
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void blakecoinhash( void *state, const void *input );
int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h" #include "blakecoin-gate.h"
#define BLAKE32_ROUNDS 8 #define BLAKE32_ROUNDS 8
#include "sph_blake.h" #include "sph_blake.h"
@@ -98,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
    SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
 }
 */

+/*
 // changed to get_max64_0x3fffffLL in cpuminer-multi-decred
 int64_t blakecoin_get_max64 ()
 {
@@ -121,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate )
    gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
    return true;
 }
+*/

View File

@@ -1,5 +1,4 @@
#include "decred-gate.h" #include "decred-gate.h"
#include "sph_blake.h"
#include "blake-hash-4way.h" #include "blake-hash-4way.h"
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
@@ -9,7 +8,6 @@
 #if defined (DECRED_4WAY)

 static __thread blake256_4way_context blake_mid;
-static __thread bool ctx_midstate_done = false;

 void decred_hash_4way( void *state, const void *input )
 {
@@ -18,50 +16,14 @@ void decred_hash_4way( void *state, const void *input )
    uint32_t hash1[8] __attribute__ ((aligned (32)));
    uint32_t hash2[8] __attribute__ ((aligned (32)));
    uint32_t hash3[8] __attribute__ ((aligned (32)));
-   blake256_4way_context ctx __attribute__ ((aligned (64)));
-   sph_blake256_context ctx2 __attribute__ ((aligned (64)));
-   uint32_t hash[16] __attribute__ ((aligned (64)));
-   uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
-
-   mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );

    void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
    int tail_len = 180 - DECRED_MIDSTATE_LEN;
+
+   blake256_4way_context ctx __attribute__ ((aligned (64)));
    memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
    blake256_4way( &ctx, tail, tail_len );
    blake256_4way_close( &ctx, vhash );

-/*
-   sph_blake256_init( &ctx2 );
-   sph_blake256( &ctx2, sin0, 180 );
-   sph_blake256_close( &ctx2, hash );
-*/
-/*
-   blake256_4way_init( &ctx );
-   blake256_4way( &ctx, input, 180 );
-   blake256_4way_close( &ctx, vhash );
-*/
-   mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
-/*
-   for ( int i = 0; i < 8; i++ )
-      if ( hash[i] != hash0[i] )
-         printf(" hash mismatch, i = %u\n", i );
-   printf("hash:  %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
-          *(hash+2), *(hash+3) );
-   printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
-          *(hash0+2), *(hash0+3) );
-   printf("\n");
-*/
-   memcpy( state,    hash0, 32 );
-   memcpy( state+32, hash1, 32 );
-   memcpy( state+64, hash2, 32 );
-   memcpy( state+96, hash3, 32 );
-// memcpy( state, hash, 32 );
+   mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -79,11 +41,11 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
    bool *found = work->nfound;
    int num_found = 0;

-   ctx_midstate_done = false;
+   // copy to buffer guaranteed to be aligned.
    memcpy( edata, pdata, 180 );

    // use the old way until new way updated for size.
-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
+   mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );

    blake256_4way_init( &blake_mid );
    blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
@@ -106,22 +68,13 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
       nonces[0] = n;
       pdata[DECRED_NONCE_INDEX] = n;
    }
-/*
    if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
    {
-printf("found 1\n");
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10], hash[11] );
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14], hash[15] );
-printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2], shash[3] );
-printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6], shash[7] );
       work_set_target_ratio( work, hash+8 );
       found[1] = true;
       num_found++;
       nonces[1] = n+1;
    }
-*/
    if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
    {
       work_set_target_ratio( work, hash+16 );
@@ -129,24 +82,15 @@
       num_found++;
       nonces[2] = n+2;
    }
-/*
    if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
    {
-printf("found 3\n");
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2], hash[3] );
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6], hash[7] );
-printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2], shash[3] );
-printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6], shash[7] );
       work_set_target_ratio( work, hash+24 );
       found[3] = true;
       num_found++;
       nonces[3] = n+3;
    }
-*/
-   n += 2;
-// n += 4;
+   n += 4;
 } while ( (num_found == 0) && (n < max_nonce)
           && !work_restart[thr_id].restart );

View File

@@ -1,4 +1,7 @@
#include "pentablake-gate.h" #include "pentablake-gate.h"
#ifdef __AVX2__
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
@@ -9,8 +12,6 @@
 //#define DEBUG_ALGO

-#ifdef PENTABLAKE_4WAY
-
 extern void pentablakehash_4way( void *output, const void *input )
 {
    unsigned char _ALIGN(32) hash[128];

View File

@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(FOUR_WAY) && defined(__AVX2__)
   #define PENTABLAKE_4WAY
 #endif

View File

@@ -41,19 +41,13 @@
 extern "C"{
 #endif

-//#include "sph_bmw.h"
-
-//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
-#define SPH_SMALL_FOOTPRINT_BMW 1
-//#endif
-
 #ifdef _MSC_VER
 #pragma warning (disable: 4146)
 #endif

-//#undef SPH_ROTL64
-//#define SPH_ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))
-//#define SPH_ROTL64(x,n) mm256_rotl_64(x,n)
+#define LPAR (
+
+// BMW256

 static const sph_u32 IV256[] = {
    SPH_C32(0x40414243), SPH_C32(0x44454647),
@@ -66,8 +60,7 @@ static const sph_u32 IV256[] = {
    SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
 };

-#if SPH_64
-
+// BMW512
 static const sph_u64 IV512[] = {
    SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
    SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
@@ -79,69 +72,108 @@ static const sph_u64 IV512[] = {
    SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
 };

-#endif
-
-#define XCAT(x, y)    XCAT_(x, y)
-#define XCAT_(x, y)   x ## y
-
-#define LPAR (
-
-/*
-#define ss0(x)    (((x) >> 1) ^ SPH_T32((x) << 3) \
-                  ^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
-#define ss1(x)    (((x) >> 1) ^ SPH_T32((x) << 2) \
-                  ^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
-#define ss2(x)    (((x) >> 2) ^ SPH_T32((x) << 1) \
-                  ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
-#define ss3(x)    (((x) >> 2) ^ SPH_T32((x) << 2) \
-                  ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
-#define ss4(x)    (((x) >> 1) ^ (x))
-#define ss5(x)    (((x) >> 2) ^ (x))
-#define rs1(x)    SPH_ROTL32(x, 3)
-#define rs2(x)    SPH_ROTL32(x, 7)
-#define rs3(x)    SPH_ROTL32(x, 13)
-#define rs4(x)    SPH_ROTL32(x, 16)
-#define rs5(x)    SPH_ROTL32(x, 19)
-#define rs6(x)    SPH_ROTL32(x, 23)
-#define rs7(x)    SPH_ROTL32(x, 27)
-
-#define Ks(j)   SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
-
-#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
-   (SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
-      - SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
-
-#define expand1s_inner(qf, mf, hf, i16, \
-      i0, i1, i2, i3, i4, i5, i6, i7, i8, \
-      i9, i10, i11, i12, i13, i14, i15, \
-      i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
-   SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
-      + ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
-      + ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
-      + ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
-      + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
-
-#define expand1s(qf, mf, hf, i16) \
-   expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
-#define expand1s_(qf, mf, hf, i16, ix, iy) \
-   expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
-
-#define expand2s_inner(qf, mf, hf, i16, \
-      i0, i1, i2, i3, i4, i5, i6, i7, i8, \
-      i9, i10, i11, i12, i13, i14, i15, \
-      i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
-   SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
-      + qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
-      + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
-      + qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
-      + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
-
-#define expand2s(qf, mf, hf, i16) \
-   expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
-#define expand2s_(qf, mf, hf, i16, ix, iy) \
-   expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
-*/
-#if SPH_64
+// BMW256
+#define ss0(x) \
+   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
+                                 _mm_slli_epi32( (x), 3) ), \
+                  _mm_xor_si128( mm_rotl_32( (x),  4), \
+                                 mm_rotl_32( (x), 19) ) )
+
+#define ss1(x) \
+   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
+                                 _mm_slli_epi32( (x), 2) ), \
+                  _mm_xor_si128( mm_rotl_32( (x),  8), \
+                                 mm_rotl_32( (x), 23) ) )
+
+#define ss2(x) \
+   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
+                                 _mm_slli_epi32( (x), 1) ), \
+                  _mm_xor_si128( mm_rotl_32( (x), 12), \
+                                 mm_rotl_32( (x), 25) ) )
+
+#define ss3(x) \
+   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
+                                 _mm_slli_epi32( (x), 2) ), \
+                  _mm_xor_si128( mm_rotl_32( (x), 15), \
+                                 mm_rotl_32( (x), 29) ) )
+
+#define ss4(x) \
+   _mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
+
+#define ss5(x) \
+   _mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
+
+#define rs1(x)    mm_rotl_32( x,  3 )
+#define rs2(x)    mm_rotl_32( x,  7 )
+#define rs3(x)    mm_rotl_32( x, 13 )
+#define rs4(x)    mm_rotl_32( x, 16 )
+#define rs5(x)    mm_rotl_32( x, 19 )
+#define rs6(x)    mm_rotl_32( x, 23 )
+#define rs7(x)    mm_rotl_32( x, 27 )
+
+#define rol_off_32( M, j, off ) \
+   mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \
+               ( ( (j) + (off) ) & 0xF ) + 1 )
+
+#define add_elt_s( M, H, j ) \
+   _mm_xor_si128( \
+      _mm_add_epi32( \
+         _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
+                                       rol_off_32( M, j, 3 ) ), \
+                        rol_off_32( M, j, 10 ) ), \
+         _mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \
+      H[ ( (j)+7 ) & 0xF ] )
+
+#define expand1s( qt, M, H, i ) \
+   _mm_add_epi32( \
+      _mm_add_epi32( \
+         _mm_add_epi32( \
+            _mm_add_epi32( \
+               _mm_add_epi32( ss1( qt[ (i)-16 ] ), \
+                              ss2( qt[ (i)-15 ] ) ), \
+               _mm_add_epi32( ss3( qt[ (i)-14 ] ), \
+                              ss0( qt[ (i)-13 ] ) ) ), \
+            _mm_add_epi32( \
+               _mm_add_epi32( ss1( qt[ (i)-12 ] ), \
+                              ss2( qt[ (i)-11 ] ) ), \
+               _mm_add_epi32( ss3( qt[ (i)-10 ] ), \
+                              ss0( qt[ (i)- 9 ] ) ) ) ), \
+         _mm_add_epi32( \
+            _mm_add_epi32( \
+               _mm_add_epi32( ss1( qt[ (i)- 8 ] ), \
+                              ss2( qt[ (i)- 7 ] ) ), \
+               _mm_add_epi32( ss3( qt[ (i)- 6 ] ), \
+                              ss0( qt[ (i)- 5 ] ) ) ), \
+            _mm_add_epi32( \
+               _mm_add_epi32( ss1( qt[ (i)- 4 ] ), \
+                              ss2( qt[ (i)- 3 ] ) ), \
+               _mm_add_epi32( ss3( qt[ (i)- 2 ] ), \
+                              ss0( qt[ (i)- 1 ] ) ) ) ) ), \
+      add_elt_s( M, H, (i)-16 ) )
+
+#define expand2s( qt, M, H, i) \
+   _mm_add_epi32( \
+      _mm_add_epi32( \
+         _mm_add_epi32( \
+            _mm_add_epi32( \
+               _mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \
+               _mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \
+            _mm_add_epi32( \
+               _mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \
+               _mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \
+         _mm_add_epi32( \
+            _mm_add_epi32( \
+               _mm_add_epi32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ) ), \
+               _mm_add_epi32( qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ) ), \
+            _mm_add_epi32( \
+               _mm_add_epi32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ) ), \
+               _mm_add_epi32( ss4( qt[ (i)- 2 ] ), \
+                              ss5( qt[ (i)- 1 ] ) ) ) ) ), \
+      add_elt_s( M, H, (i)-16 ) )
+
+// BMW512

 #define sb0(x) \
   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
@@ -181,18 +213,18 @@ static const sph_u64 IV512[] = {
 #define rb6(x)    mm256_rotl_64( x, 43 )
 #define rb7(x)    mm256_rotl_64( x, 53 )

-#define rol_off( M, j, off ) \
-   mm256_rotl_64( M[ ( (j) + (off) ) & 15 ] , \
-                  ( ( (j) + (off) ) & 15 ) + 1 )
+#define rol_off_64( M, j, off ) \
+   mm256_rotl_64( M[ ( (j) + (off) ) & 0xF ] , \
+                  ( ( (j) + (off) ) & 0xF ) + 1 )

 #define add_elt_b( M, H, j ) \
   _mm256_xor_si256( \
      _mm256_add_epi64( \
-        _mm256_sub_epi64( _mm256_add_epi64( rol_off( M, j, 0 ), \
-                                            rol_off( M, j, 3 ) ), \
-                          rol_off( M, j, 10 ) ), \
+        _mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \
+                                            rol_off_64( M, j, 3 ) ), \
+                          rol_off_64( M, j, 10 ) ), \
         _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
-     H[ ( (j)+7 ) & 15 ] )
+     H[ ( (j)+7 ) & 0xF ] )

 #define expand1b( qt, M, H, i ) \
    _mm256_add_epi64( \
@@ -241,132 +273,301 @@ static const sph_u64 IV512[] = {
          sb5( qt[ (i)- 1 ] ) ) ) ) ), \
    add_elt_b( M, H, (i)-16 ) )

-#endif
-
-/*
-#define MAKE_W( i0, op01, i1, op12, i2, op23, i3, op34, i4) \
-   ((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
-   op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
-*/
-/*
-#define Ws0    MAKE_W(SPH_T32,  5, -,  7, +, 10, +, 13, +, 14)
-#define Ws1    MAKE_W(SPH_T32,  6, -,  8, +, 11, +, 14, -, 15)
-#define Ws2    MAKE_W(SPH_T32,  0, +,  7, +,  9, -, 12, +, 15)
-#define Ws3    MAKE_W(SPH_T32,  0, -,  1, +,  8, -, 10, +, 13)
-#define Ws4    MAKE_W(SPH_T32,  1, +,  2, +,  9, -, 11, -, 14)
-#define Ws5    MAKE_W(SPH_T32,  3, -,  2, +, 10, -, 12, +, 15)
-#define Ws6    MAKE_W(SPH_T32,  4, -,  0, -,  3, -, 11, +, 13)
-#define Ws7    MAKE_W(SPH_T32,  1, -,  4, -,  5, -, 12, -, 14)
-#define Ws8    MAKE_W(SPH_T32,  2, -,  5, -,  6, +, 13, -, 15)
-#define Ws9    MAKE_W(SPH_T32,  0, -,  3, +,  6, -,  7, +, 14)
-#define Ws10   MAKE_W(SPH_T32,  8, -,  1, -,  4, -,  7, +, 15)
-#define Ws11   MAKE_W(SPH_T32,  8, -,  0, -,  2, -,  5, +,  9)
-#define Ws12   MAKE_W(SPH_T32,  1, +,  3, -,  6, -,  9, +, 10)
-#define Ws13   MAKE_W(SPH_T32,  2, +,  4, +,  7, +, 10, +, 11)
-#define Ws14   MAKE_W(SPH_T32,  3, -,  5, +,  8, -, 11, -, 12)
-#define Ws15   MAKE_W(SPH_T32, 12, -,  4, -,  6, -,  9, +, 13)
-
-#if SPH_SMALL_FOOTPRINT_BMW
-
-#define MAKE_Qas do { \
-   unsigned u; \
-   sph_u32 Ws[16]; \
-   Ws[ 0] = Ws0; \
-   Ws[ 1] = Ws1; \
-   Ws[ 2] = Ws2; \
-   Ws[ 3] = Ws3; \
-   Ws[ 4] = Ws4; \
-   Ws[ 5] = Ws5; \
-   Ws[ 6] = Ws6; \
-   Ws[ 7] = Ws7; \
-   Ws[ 8] = Ws8; \
-   Ws[ 9] = Ws9; \
-   Ws[10] = Ws10; \
-   Ws[11] = Ws11; \
-   Ws[12] = Ws12; \
-   Ws[13] = Ws13; \
-   Ws[14] = Ws14; \
-   Ws[15] = Ws15; \
-   for (u = 0; u < 15; u += 5) { \
-      qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
-      qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
-      qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
-      qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
-      qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
-   } \
-   qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
-} while (0)
-
-#define MAKE_Qbs do { \
-   qt[16] = expand1s(Qs, M, H, 16); \
-   qt[17] = expand1s(Qs, M, H, 17); \
-   qt[18] = expand2s(Qs, M, H, 18); \
-   qt[19] = expand2s(Qs, M, H, 19); \
-   qt[20] = expand2s(Qs, M, H, 20); \
-   qt[21] = expand2s(Qs, M, H, 21); \
-   qt[22] = expand2s(Qs, M, H, 22); \
-   qt[23] = expand2s(Qs, M, H, 23); \
-   qt[24] = expand2s(Qs, M, H, 24); \
-   qt[25] = expand2s(Qs, M, H, 25); \
-   qt[26] = expand2s(Qs, M, H, 26); \
-   qt[27] = expand2s(Qs, M, H, 27); \
-   qt[28] = expand2s(Qs, M, H, 28); \
-   qt[29] = expand2s(Qs, M, H, 29); \
-   qt[30] = expand2s(Qs, M, H, 30); \
-   qt[31] = expand2s(Qs, M, H, 31); \
-} while (0)
-
-#else
-
-#define MAKE_Qas do { \
-   qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
-   qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
-   qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
-   qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
-   qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
-   qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
-   qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
-   qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
-   qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
-   qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
-   qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
-   qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
-   qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
-   qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
-   qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
-   qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
-} while (0)
-
-#define MAKE_Qbs do { \
-   qt[16] = expand1s(Qs, M, H, 16); \
-   qt[17] = expand1s(Qs, M, H, 17); \
-   qt[18] = expand2s(Qs, M, H, 18); \
-   qt[19] = expand2s(Qs, M, H, 19); \
-   qt[20] = expand2s(Qs, M, H, 20); \
-   qt[21] = expand2s(Qs, M, H, 21); \
-   qt[22] = expand2s(Qs, M, H, 22); \
-   qt[23] = expand2s(Qs, M, H, 23); \
-   qt[24] = expand2s(Qs, M, H, 24); \
-   qt[25] = expand2s(Qs, M, H, 25); \
-   qt[26] = expand2s(Qs, M, H, 26); \
-   qt[27] = expand2s(Qs, M, H, 27); \
-   qt[28] = expand2s(Qs, M, H, 28); \
-   qt[29] = expand2s(Qs, M, H, 29); \
-   qt[30] = expand2s(Qs, M, H, 30); \
-   qt[31] = expand2s(Qs, M, H, 31); \
-} while (0)
-
-#endif
-
-#define MAKE_Qs do { \
-   MAKE_Qas; \
-   MAKE_Qbs; \
-} while (0)
-
-#define Qs(j)   (qt[j])
-*/
-#if SPH_64
+// BMW256
+#define Ws0 \
+   _mm_add_epi32( \
+      _mm_add_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
+                           _mm_xor_si128( M[ 7], H[ 7] ) ), \
+            _mm_xor_si128( M[10], H[10] ) ), \
+         _mm_xor_si128( M[13], H[13] ) ), \
+      _mm_xor_si128( M[14], H[14] ) )
+
+#define Ws1 \
+   _mm_sub_epi32( \
+      _mm_add_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
+                           _mm_xor_si128( M[ 8], H[ 8] ) ), \
+            _mm_xor_si128( M[11], H[11] ) ), \
+         _mm_xor_si128( M[14], H[14] ) ), \
+      _mm_xor_si128( M[15], H[15] ) )
+
+#define Ws2 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
+                           _mm_xor_si128( M[ 7], H[ 7] ) ), \
+            _mm_xor_si128( M[ 9], H[ 9] ) ), \
+         _mm_xor_si128( M[12], H[12] ) ), \
+      _mm_xor_si128( M[15], H[15] ) )
+
+#define Ws3 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
+                           _mm_xor_si128( M[ 1], H[ 1] ) ), \
+            _mm_xor_si128( M[ 8], H[ 8] ) ), \
+         _mm_xor_si128( M[10], H[10] ) ), \
+      _mm_xor_si128( M[13], H[13] ) )
+
+#define Ws4 \
+   _mm_sub_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+                           _mm_xor_si128( M[ 2], H[ 2] ) ), \
+            _mm_xor_si128( M[ 9], H[ 9] ) ), \
+         _mm_xor_si128( M[11], H[11] ) ), \
+      _mm_xor_si128( M[14], H[14] ) )
+
+#define Ws5 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
+                           _mm_xor_si128( M[ 2], H[ 2] ) ), \
+            _mm_xor_si128( M[10], H[10] ) ), \
+         _mm_xor_si128( M[12], H[12] ) ), \
+      _mm_xor_si128( M[15], H[15] ) )
+
+#define Ws6 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
+                           _mm_xor_si128( M[ 0], H[ 0] ) ), \
+            _mm_xor_si128( M[ 3], H[ 3] ) ), \
+         _mm_xor_si128( M[11], H[11] ) ), \
+      _mm_xor_si128( M[13], H[13] ) )
+
+#define Ws7 \
+   _mm_sub_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+                           _mm_xor_si128( M[ 4], H[ 4] ) ), \
+            _mm_xor_si128( M[ 5], H[ 5] ) ), \
+         _mm_xor_si128( M[12], H[12] ) ), \
+      _mm_xor_si128( M[14], H[14] ) )
+
+#define Ws8 \
+   _mm_sub_epi32( \
+      _mm_add_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
+                           _mm_xor_si128( M[ 5], H[ 5] ) ), \
+            _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         _mm_xor_si128( M[13], H[13] ) ), \
+      _mm_xor_si128( M[15], H[15] ) )
+
+#define Ws9 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
+                           _mm_xor_si128( M[ 3], H[ 3] ) ), \
+            _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         _mm_xor_si128( M[ 7], H[ 7] ) ), \
+      _mm_xor_si128( M[14], H[14] ) )
+
+#define Ws10 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
+                           _mm_xor_si128( M[ 1], H[ 1] ) ), \
+            _mm_xor_si128( M[ 4], H[ 4] ) ), \
+         _mm_xor_si128( M[ 7], H[ 7] ) ), \
+      _mm_xor_si128( M[15], H[15] ) )
+
+#define Ws11 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
+                           _mm_xor_si128( M[ 0], H[ 0] ) ), \
+            _mm_xor_si128( M[ 2], H[ 2] ) ), \
+         _mm_xor_si128( M[ 5], H[ 5] ) ), \
+      _mm_xor_si128( M[ 9], H[ 9] ) )
+
+#define Ws12 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+                           _mm_xor_si128( M[ 3], H[ 3] ) ), \
+            _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         _mm_xor_si128( M[ 9], H[ 9] ) ), \
+      _mm_xor_si128( M[10], H[10] ) )
+
+#define Ws13 \
+   _mm_add_epi32( \
+      _mm_add_epi32( \
+         _mm_add_epi32( \
+            _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
+                           _mm_xor_si128( M[ 4], H[ 4] ) ), \
+            _mm_xor_si128( M[ 7], H[ 7] ) ), \
+         _mm_xor_si128( M[10], H[10] ) ), \
+      _mm_xor_si128( M[11], H[11] ) )
+
+#define Ws14 \
+   _mm_sub_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
+                           _mm_xor_si128( M[ 5], H[ 5] ) ), \
+            _mm_xor_si128( M[ 8], H[ 8] ) ), \
+         _mm_xor_si128( M[11], H[11] ) ), \
+      _mm_xor_si128( M[12], H[12] ) )
+
+#define Ws15 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
+                           _mm_xor_si128( M[ 4], H[ 4] ) ), \
+            _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         _mm_xor_si128( M[ 9], H[ 9] ) ), \
+      _mm_xor_si128( M[13], H[13] ) )
+
+void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
+{
+   __m128i qt[32], xl, xh;
+
+   // first 16 quad words from the W expressions
+   qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
+   qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
+   qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
+   qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
+   qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
+   qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
+   qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
+   qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
+   qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
+   qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
+   qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
+   qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
+   qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
+   qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
+   qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
+   qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
+   // expand to 32 quad words
+   qt[16] = expand1s( qt, M, H, 16 );
+   qt[17] = expand1s( qt, M, H, 17 );
+   qt[18] = expand2s( qt, M, H, 18 );
+   qt[19] = expand2s( qt, M, H, 19 );
+   qt[20] = expand2s( qt, M, H, 20 );
+   qt[21] = expand2s( qt, M, H, 21 );
+   qt[22] = expand2s( qt, M, H, 22 );
+   qt[23] = expand2s( qt, M, H, 23 );
+   qt[24] = expand2s( qt, M, H, 24 );
+   qt[25] = expand2s( qt, M, H, 25 );
+   qt[26] = expand2s( qt, M, H, 26 );
+   qt[27] = expand2s( qt, M, H, 27 );
+   qt[28] = expand2s( qt, M, H, 28 );
+   qt[29] = expand2s( qt, M, H, 29 );
+   qt[30] = expand2s( qt, M, H, 30 );
+   qt[31] = expand2s( qt, M, H, 31 );
+
+   xl = _mm_xor_si128(
+           _mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ),
+                          _mm_xor_si128( qt[18], qt[19] ) ),
+           _mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ),
+                          _mm_xor_si128( qt[22], qt[23] ) ) );
+   xh = _mm_xor_si128( xl,
+           _mm_xor_si128(
+               _mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ),
+                              _mm_xor_si128( qt[26], qt[27] ) ),
+               _mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ),
+                              _mm_xor_si128( qt[30], qt[31] ) )));
+
+   dH[ 0] = _mm_add_epi32(
+               _mm_xor_si128( M[0],
+                  _mm_xor_si128( _mm_slli_epi32( xh, 5 ),
+                                 _mm_srli_epi32( qt[16], 5 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
+   dH[ 1] = _mm_add_epi32(
+               _mm_xor_si128( M[1],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 7 ),
+                                 _mm_slli_epi32( qt[17], 8 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
+   dH[ 2] = _mm_add_epi32(
+               _mm_xor_si128( M[2],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 5 ),
+                                 _mm_slli_epi32( qt[18], 5 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
+   dH[ 3] = _mm_add_epi32(
+               _mm_xor_si128( M[3],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 1 ),
+                                 _mm_slli_epi32( qt[19], 5 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
+   dH[ 4] = _mm_add_epi32(
+               _mm_xor_si128( M[4],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 3 ),
+                                 _mm_slli_epi32( qt[20], 0 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
+   dH[ 5] = _mm_add_epi32(
+               _mm_xor_si128( M[5],
+                  _mm_xor_si128( _mm_slli_epi32( xh, 6 ),
+                                 _mm_srli_epi32( qt[21], 6 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
+   dH[ 6] = _mm_add_epi32(
+               _mm_xor_si128( M[6],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 4 ),
+                                 _mm_slli_epi32( qt[22], 6 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
+   dH[ 7] = _mm_add_epi32(
+               _mm_xor_si128( M[7],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 11 ),
+                                 _mm_slli_epi32( qt[23], 2 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
+   dH[ 8] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[4], 9 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
+               _mm_xor_si128( _mm_slli_epi32( xl, 8 ),
+                              _mm_xor_si128( qt[23], qt[ 8] ) ) );
+   dH[ 9] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[5], 10 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
+               _mm_xor_si128( _mm_srli_epi32( xl, 6 ),
+                              _mm_xor_si128( qt[16], qt[ 9] ) ) );
+   dH[10] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[6], 11 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
+               _mm_xor_si128( _mm_slli_epi32( xl, 6 ),
+                              _mm_xor_si128( qt[17], qt[10] ) ) );
+   dH[11] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[7], 12 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
+               _mm_xor_si128( _mm_slli_epi32( xl, 4 ),
+                              _mm_xor_si128( qt[18], qt[11] ) ) );
+   dH[12] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[0], 13 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
+               _mm_xor_si128( _mm_srli_epi32( xl, 3 ),
+                              _mm_xor_si128( qt[19], qt[12] ) ) );
+   dH[13] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[1], 14 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
+               _mm_xor_si128( _mm_srli_epi32( xl, 4 ),
+                              _mm_xor_si128( qt[20], qt[13] ) ) );
+   dH[14] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[2], 15 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
+               _mm_xor_si128( _mm_srli_epi32( xl, 7 ),
+                              _mm_xor_si128( qt[21], qt[14] ) ) );
+   dH[15] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[3], 16 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
+               _mm_xor_si128( _mm_srli_epi32( xl, 2 ),
+                              _mm_xor_si128( qt[22], qt[15] ) ) );
+}
+
+// BMW512

 #define Wb0 \
    _mm256_add_epi64( \
@@ -564,6 +765,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
    qt[29] = expand2b( qt, M, H, 29 );
    qt[30] = expand2b( qt, M, H, 30 );
    qt[31] = expand2b( qt, M, H, 31 );
+
    xl = _mm256_xor_si256(
            _mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
                              _mm256_xor_si256( qt[18], qt[19] ) ),
@@ -575,6 +777,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
                              _mm256_xor_si256( qt[26], qt[27] ) ),
            _mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
                              _mm256_xor_si256( qt[30], qt[31] ) )));
+
    dH[ 0] = _mm256_add_epi64(
               _mm256_xor_si256( M[0],
                  _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
@@ -657,85 +860,65 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
                              _mm256_xor_si256( qt[22], qt[15] ) ) );
 }

-#endif // 64
+// BMW256

-//#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_ROTL32, M, Qs, dH)
-/*
-static void
-compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
-{
-#define M(x)    sph_dec32le_aligned(data + 4 * (x))
-#define H(x)    (h[x])
-#define dH(x)   (dh[x])
-
-   FOLDs;
-
-#undef M
-#undef H
-#undef dH
-}
-
-static const sph_u32 final_s[16] = {
-   SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
-   SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
-   SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
-   SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
-   SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
-   SPH_C32(0xaaaaaaaf)
-};
+static const __m128i final_s[16] =
+{
+   { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
+   { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
+   { 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
+   { 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
+   { 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
+   { 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
+   { 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
+   { 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
+   { 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
+   { 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
+   { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
+   { 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
+   { 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
+   { 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
+   { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
+   { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
+};

 static void
 bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
 {
-   memcpy(sc->H, iv, sizeof sc->H);
+   for ( int i = 0; i < 16; i++ )
+      sc->H[i] = _mm_set1_epi32( iv[i] );
    sc->ptr = 0;
-#if SPH_64
    sc->bit_count = 0;
-#else
-   sc->bit_count_high = 0;
-   sc->bit_count_low = 0;
-#endif
 }

 static void
 bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
 {
-   unsigned char *buf;
+   __m128i *vdata = (__m128i*)data;
+   __m128i *buf;
+   __m128i htmp[16];
+   __m128i *h1, *h2;
    size_t ptr;
-   sph_u32 htmp[16];
-   sph_u32 *h1, *h2;
-#if !SPH_64
-   sph_u32 tmp;
-#endif
+   const int buf_size = 64;   // bytes of one lane, compatible with len

-#if SPH_64
    sc->bit_count += (sph_u64)len << 3;
-#else
-   tmp = sc->bit_count_low;
-   sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
-   if (sc->bit_count_low < tmp)
-      sc->bit_count_high ++;
-   sc->bit_count_high += len >> 29;
-#endif
    buf = sc->buf;
    ptr = sc->ptr;
    h1 = sc->H;
    h2 = htmp;
-   while (len > 0) {
+   while ( len > 0 )
+   {
      size_t clen;
-     clen = (sizeof sc->buf) - ptr;
+     clen = buf_size - ptr;
      if ( clen > len )
        clen = len;
-     memcpy(buf + ptr, data, clen);
-     data = (const unsigned char *)data + clen;
+     memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
+     vdata += ( clen >> 2 );
      len -= clen;
      ptr += clen;
-     if (ptr == sizeof sc->buf) {
-        sph_u32 *ht;
+     if ( ptr == buf_size )
+     {
+        __m128i *ht;
         compress_small( buf, h1, h2 );
         ht = h1;
         h1 = h2;
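
Note: the braced final_s initializers above rely on GCC treating __m128i as
two 64-bit lanes, so each 64-bit literal packs the 32-bit constant twice; the
result is the same vector as _mm_set1_epi32, which cannot appear in a static
initializer. A runtime-built equivalent for one element (hypothetical helper):

#include <immintrin.h>
#include <stdint.h>

static inline __m128i bmw_final_elem( uint32_t k )
{
   return _mm_set1_epi32( (int32_t)k );   // e.g. k = 0xaaaaaaa0 for element 0
}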
@@ -745,49 +928,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
   }
   sc->ptr = ptr;

   if ( h1 != sc->H )
-      memcpy(sc->H, h1, sizeof sc->H);
+      memcpy_128( sc->H, h1, 16 );
 }

 static void
 bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
                  void *dst, size_t out_size_w32)
 {
-   unsigned char *buf, *out;
+   __m128i *buf;
+   __m128i h1[16], h2[16], *h;
    size_t ptr, u, v;
    unsigned z;
-   sph_u32 h1[16], h2[16], *h;
+   const int buf_size = 64;   // bytes of one lane, compatible with len

    buf = sc->buf;
    ptr = sc->ptr;
    z = 0x80 >> n;
-   buf[ptr ++] = ((ub & -z) | z) & 0xFF;
+   buf[ ptr>>2 ] = _mm_set1_epi32( z );
+   ptr += 4;
    h = sc->H;
-   if (ptr > (sizeof sc->buf) - 8) {
-      memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+
+   // assume bit_count fits in 32 bits
+   if ( ptr > buf_size - 4 )
+   {
+      memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
       compress_small( buf, h, h1 );
       ptr = 0;
       h = h1;
    }
-   memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
-#if SPH_64
-   sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
-                       SPH_T64(sc->bit_count + n));
-#else
-   sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
-                       sc->bit_count_low + n);
-   sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
-                       SPH_T32(sc->bit_count_high));
-#endif
+   memset_zero_128( buf + (ptr>>2), (buf_size - 4 - ptr) >> 2 );
+   buf[ (buf_size - 4) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
    compress_small( buf, h, h2 );
    for ( u = 0; u < 16; u ++ )
-      sph_enc32le_aligned(buf + 4 * u, h2[u]);
+      buf[u] = h2[u];
    compress_small( buf, final_s, h1 );
-   out = dst;
    for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
-      sph_enc32le(out + 4 * u, h1[v]);
+      casti_m128i( dst, u ) = h1[v];
 }
-*/

-#if SPH_64
+// BMW512

 static const __m256i final_b[16] =
 {
@@ -908,33 +1087,33 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
    casti_m256i(dst,u) = h1[v];
 }

-#endif
+// BMW256

 void
 bmw256_4way_init(void *cc)
 {
-// bmw32_4way_init(cc, IV256);
+   bmw32_4way_init(cc, IV256);
 }

 void
 bmw256_4way(void *cc, const void *data, size_t len)
 {
-// bmw32_4way(cc, data, len);
+   bmw32_4way(cc, data, len);
 }

 void
 bmw256_4way_close(void *cc, void *dst)
 {
-// bmw256_4way_addbits_and_close(cc, 0, 0, dst);
+   bmw256_4way_addbits_and_close(cc, 0, 0, dst);
 }

 void
 bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
-// bmw32_4way_close(cc, ub, n, dst, 8);
+   bmw32_4way_close(cc, ub, n, dst, 8);
 }

-#if SPH_64
+// BMW512

 void
 bmw512_4way_init(void *cc)
@@ -960,10 +1139,8 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
    bmw64_4way_close(cc, ub, n, dst, 8);
 }

-#endif
-
 #ifdef __cplusplus
 }
 #endif

-#endif
+#endif // __AVX2__

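Note: the SSE2 BMW256 port above leans on the mm_rotl_32 rotate helper from
avxdefs.h. SSE2 has vector shifts but no vector rotate, so such a helper is
conventionally two shifts and an OR; a sketch of the usual definition, under
the assumption that avxdefs.h does the equivalent:

#include <immintrin.h>

// Rotate each 32-bit lane of x left by c bits (0 < c < 32).
#define mm_rotl_32_sketch( x, c ) \
   _mm_or_si128( _mm_slli_epi32( (x), (c) ), _mm_srli_epi32( (x), 32-(c) ) )
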
View File

@@ -46,94 +46,37 @@ extern "C"{
 #include "algo/sha/sph_types.h"
 #include "avxdefs.h"

-/**
- * Output size (in bits) for BMW-224.
- */
-#define SPH_SIZE_bmw224   224
-
-/**
- * Output size (in bits) for BMW-256.
- */
 #define SPH_SIZE_bmw256   256

-#if SPH_64
-/**
- * Output size (in bits) for BMW-384.
- */
-#define SPH_SIZE_bmw384   384
-
-/**
- * Output size (in bits) for BMW-512.
- */
 #define SPH_SIZE_bmw512   512
-#endif

-/**
- * This structure is a context for BMW-224 and BMW-256 computations:
- * it contains the intermediate values and some data from the last
- * entered block. Once a BMW computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running BMW
- * computation can be cloned by copying the context (e.g. with a simple
- * <code>memcpy()</code>).
- */
 typedef struct {
-#ifndef DOXYGEN_IGNORE
-   unsigned char buf[64];    /* first field, for alignment */
+   __m128i buf[64];
+   __m128i H[16];
    size_t ptr;
-   sph_u32 H[16];
-#if SPH_64
-   sph_u64 bit_count;
-#else
-   sph_u32 bit_count_high, bit_count_low;
-#endif
-#endif
+   sph_u32 bit_count;   // assume bit_count fits in 32 bits
 } bmw_4way_small_context;

 typedef bmw_4way_small_context bmw256_4way_context;

-#if SPH_64
-/**
- * This structure is a context for BMW-384 and BMW-512 computations:
- * it contains the intermediate values and some data from the last
- * entered block. Once a BMW computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running BMW
- * computation can be cloned by copying the context (e.g. with a simple
- * <code>memcpy()</code>).
- */
 typedef struct {
-#ifndef DOXYGEN_IGNORE
    __m256i buf[16];
    __m256i H[16];
-// unsigned char buf[128];   /* first field, for alignment */
    size_t ptr;
-// sph_u64 H[16];
    sph_u64 bit_count;
-#endif
 } bmw_4way_big_context;

 typedef bmw_4way_big_context bmw512_4way_context;
-#endif

 void bmw256_4way_init(void *cc);
 void bmw256_4way(void *cc, const void *data, size_t len);
 void bmw256_4way_close(void *cc, void *dst);

-void bmw256_addbits_and_close(
+void bmw256_4way_addbits_and_close(
    void *cc, unsigned ub, unsigned n, void *dst);

-#if SPH_64
-
 void bmw512_4way_init(void *cc);
 void bmw512_4way(void *cc, const void *data, size_t len);
@@ -150,5 +93,3 @@ void bmw512_4way_addbits_and_close(
#endif #endif
#endif #endif
#endif
algo/jha/jha-4way.c
@@ -23,12 +23,12 @@ void jha_hash_4way( void *out, const void *input )
    uint64_t hash2[8] __attribute__ ((aligned (64)));
    uint64_t hash3[8] __attribute__ ((aligned (64)));
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-   uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
-   uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
-   __m256i mask, mask0, mask1;
+   uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
+   uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
    __m256i* vh  = (__m256i*)vhash;
-   __m256i* vh0 = (__m256i*)vhash0;
-   __m256i* vh1 = (__m256i*)vhash1;
+   __m256i* vhA = (__m256i*)vhashA;
+   __m256i* vhB = (__m256i*)vhashB;
+   __m256i vh_mask;
    blake512_4way_context ctx_blake;
    hashState_groestl ctx_groestl;

@@ -40,32 +40,13 @@ void jha_hash_4way( void *out, const void *input )
    keccak512_4way( &ctx_keccak, input, 80 );
    keccak512_4way_close( &ctx_keccak, vhash );

-// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
-// keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
-// keccak512_4way_close( &ctx_keccak, vhash );

    // Heavy & Light Pair Loop
    for ( int round = 0; round < 3; round++ )
    {
-      // Select the next function based on bit 0 of the previous hash.
-      // Speculatively execute both functions and use a mask to
-      // select results from the correct function for each lane.
-      // hash = mask : vhash0 ? vhash1
-      mask = mm256_negate_64(
-              _mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
-      // second version
-      // mask0 = mask
-      // mask1 = mm256_not( mask );
-      // first version
-      // mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
-      //          _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
-      // groestl (serial) vs skein
+      vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
+                      vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );

       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash0,
                                 (char*)hash0, 512 );

@@ -78,67 +59,28 @@ void jha_hash_4way( void *out, const void *input )
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash3,
                                 (char*)hash3, 512 );
-      mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
+      mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

-      // skein
       skein512_4way_init( &ctx_skein );
       skein512_4way( &ctx_skein, vhash, 64 );
-      skein512_4way_close( &ctx_skein, vhash1 );
+      skein512_4way_close( &ctx_skein, vhashB );

-      // merge vectored hash
       for ( int i = 0; i < 8; i++ )
-      {
-         // blend should be faster
-         vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask );
-         // second version
-         // vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
-         //                          _mm256_and_si256( vh1[i], mask1 ) );
-         // first version
-         /*
-         vh0[i] = _mm256_maskload_epi64( vhash0 + i*4, mm256_not( mask ) );
-         vh1[i] = _mm256_maskload_epi64( vhash1 + i*4, mask );
-         vh[i]  = _mm256_or_si256( vh0[i], vh1[i] );
-         */
-      }
+         vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );

-      // blake v jh
       blake512_4way_init( &ctx_blake );
       blake512_4way( &ctx_blake, vhash, 64 );
-      blake512_4way_close( &ctx_blake, vhash0 );
+      blake512_4way_close( &ctx_blake, vhashA );

       jh512_4way_init( &ctx_jh );
       jh512_4way( &ctx_jh, vhash, 64 );
-      jh512_4way_close( &ctx_jh, vhash1 );
+      jh512_4way_close( &ctx_jh, vhashB );

-      // merge hash
       for ( int i = 0; i < 8; i++ )
-      {
-         vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
-                                  _mm256_and_si256( vh1[i], mask1 ) );
-         /*
-         vha256[i] = _mm256_maskload_epi64( vhasha + i*4, mm256_not( mask ) );
-         vhb256[i] = _mm256_maskload_epi64( vhashb + i*4, mask );
-         vh256[i]  = _mm256_or_si256( vha256[i], vhb256[i] );
-         */
-      }
+         vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    }

    mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );

-// memcpy( output,    hash0, 32 );
-// memcpy( output+32, hash1, 32 );
-// memcpy( output+64, hash2, 32 );
-// memcpy( output+96, hash3, 32 );
 }

 int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,

@@ -177,17 +119,12 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
       0
    };

-   // we need bigendian data...
    for ( int i=0; i < 19; i++ )
      be32enc( &endiandata[i], pdata[i] );

    uint64_t *edata = (uint64_t*)endiandata;
    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

-   // precalc midstate for keccak
-   // keccak512_4way_init( &jha_kec_mid );
-   // keccak512_4way( &jha_kec_mid, vdata, 64 );

    for ( int m = 0; m < 6; m++ )
    {
      if ( Htarg <= htmax[m] )

@@ -201,7 +138,6 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
         be32enc( noncep3, n+3 );

         jha_hash_4way( hash, vdata );
         pdata[19] = n;

         if ( ( !(hash[7] & mask) )

@@ -239,11 +175,9 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
                   && !work_restart[thr_id].restart );
         break;
      }
    }

    *hashes_done = n - first_nonce + 1;
    return num_found;
 }
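The loop above is the core trick of the 4-way port: both candidate functions are computed for all four lanes and the correct result is then selected per lane with a vector mask. A minimal standalone sketch of that selection step (hypothetical helper, not part of the commit; assumes AVX2):

#include <immintrin.h>

// Select, per 64-bit lane, resB where the lane's selector bit is 0 and
// resA where it is 1 -- the same cmpeq/blendv pattern used above.
// n is the number of __m256i words in each result buffer.
static void blend_lanes( __m256i *dst, const __m256i *resA,
                         const __m256i *resB, __m256i selector, int n )
{
   // All-ones in lanes whose selector bit is clear, all-zeros otherwise.
   __m256i mask = _mm256_cmpeq_epi64(
                     _mm256_and_si256( selector, _mm256_set1_epi64x( 1 ) ),
                     _mm256_setzero_si256() );
   for ( int i = 0; i < n; i++ )
      // blendv picks from the second operand where the mask bit is set.
      dst[i] = _mm256_blendv_epi8( resA[i], resB[i], mask );
}

Both branches always execute, so the per-lane divergence of jha costs one extra hash function per round instead of breaking vectorization.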
algo/lyra2/lyra2h-4way.c (new file, 128 lines)
@@ -0,0 +1,128 @@
#include "lyra2h-gate.h"
#ifdef LYRA2H_4WAY
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
__thread uint64_t* lyra2h_4way_matrix;
bool lyra2h_4way_thread_init()
{
return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}
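// The first 64 bytes of the 80-byte block header are constant for a job,
// so the first Blake-256 block is hashed once here and the saved midstate
// is reused for every nonce in the scan loop.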
static __thread blake256_4way_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
lyra2h_4way_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( &edata[19], n );
lyra2h_4way_hash( hash, vdata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
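All of the 4-way scan loops above depend on the word-interleaved data layout produced by the avxdefs.h helpers. As a rough reference for what mm_interleave_4x32 computes, here is a hypothetical scalar model (illustration only, not the SSE implementation from the commit):

#include <stdint.h>

// Scalar model of 4x32 interleaving: dst[4*i + lane] = src_lane[i].
// bit_len is the per-lane data length in bits, as in the real helpers.
static void interleave_4x32_ref( uint32_t *dst, const uint32_t *s0,
                                 const uint32_t *s1, const uint32_t *s2,
                                 const uint32_t *s3, int bit_len )
{
   for ( int i = 0; i < bit_len/32; i++ )
   {
      dst[ 4*i     ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}

This layout is why lane 0's nonce sits at vdata + 76 in the loop above: header word 19 of lane 0 lands at index 19*4 = 76, with lanes 1-3 at 77-79.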
algo/lyra2/lyra2h-gate.c (new file, 25 lines)
@@ -0,0 +1,25 @@
#include "lyra2h-gate.h"
#include "lyra2.h"
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2h_algo( algo_gate_t* gate )
{
#ifdef LYRA2H_4WAY
gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h_4way;
gate->hash = (void*)&lyra2h_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
#endif
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};
algo/lyra2/lyra2h-gate.h (new file, 32 lines)
@@ -0,0 +1,32 @@
#ifndef LYRA2H_GATE_H__
#define LYRA2H_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY)
#define LYRA2H_4WAY
#endif
#define LYRA2H_MATRIX_SIZE BLOCK_LEN_INT64 * 16 * 16 * 8
#if defined(LYRA2H_4WAY)
void lyra2h_4way_hash( void *state, const void *input );
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_4way_thread_init();
#endif
void lyra2h_hash( void *state, const void *input );
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_thread_init();
#endif
algo/lyra2/lyra2h.c
@@ -1,6 +1,6 @@
+#include "lyra2h-gate.h"
 #include <memory.h>
 #include <mm_malloc.h>
-#include "algo-gate-api.h"
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"

@@ -8,8 +8,7 @@ __thread uint64_t* lyra2h_matrix;

 bool lyra2h_thread_init()
 {
-   const int i = 16 * 16 * 96;
-   lyra2h_matrix = _mm_malloc( i, 64 );
+   lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
    return lyra2h_matrix;
 }

@@ -74,20 +73,3 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
    *hashes_done = pdata[19] - first_nonce + 1;
    return 0;
 }

-void lyra2h_set_target( struct work* work, double job_diff )
-{
-   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool register_lyra2h_algo( algo_gate_t* gate )
-{
-   gate->optimizations = AVX_OPT | AVX2_OPT;
-   gate->miner_thread_init = (void*)&lyra2h_thread_init;
-   gate->scanhash = (void*)&scanhash_lyra2h;
-   gate->hash = (void*)&lyra2h_hash;
-   gate->get_max64 = (void*)&get_max64_0xffffLL;
-   gate->set_target = (void*)&lyra2h_set_target;
-   return true;
-};
algo/lyra2/lyra2re.c
@@ -106,6 +106,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
       {
          pdata[19] = nonce;
          *hashes_done = pdata[19] - first_nonce;
+         work_set_target_ratio( work, hash );
          return 1;
       }
    }
algo/lyra2/lyra2rev2-4way.c (new file, 177 lines)
@@ -0,0 +1,177 @@
#include "lyra2rev2-gate.h"
#include <memory.h>
#ifdef __AVX2__
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
sph_bmw256_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
void init_lyra2rev2_4way_ctx()
{
// blake256_4way_init( &l2v2_4way_ctx.blake );
keccak256_4way_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
sph_bmw256_init( &l2v2_4way_ctx.bmw );
}
void lyra2rev2_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash64[4*4] __attribute__ ((aligned (64)));
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
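   // ctx.blake already holds the midstate of the first 64 header bytes,
   // computed once per scan in scanhash_lyra2rev2_4way, so only the final
   // 16 bytes are hashed per nonce.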
blake256_4way( &ctx.blake, input + (64<<2), 16 );
// blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm256_reinterleave_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
sph_bmw256( &ctx.bmw, hash0, 32 );
sph_bmw256_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash1, 32 );
sph_bmw256_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash2, 32 );
sph_bmw256_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash3, 32 );
sph_bmw256_close( &ctx.bmw, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
lyra2rev2_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
algo/lyra2/lyra2rev2-gate.c (new file, 38 lines)
@@ -0,0 +1,38 @@
#include "lyra2rev2-gate.h"
__thread uint64_t* l2v2_wholeMatrix;
void lyra2rev2_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool lyra2rev2_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
init_lyra2rev2_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;
};
algo/lyra2/lyra2rev2-gate.h (new file, 35 lines)
@@ -0,0 +1,35 @@
#ifndef LYRA2REV2_GATE_H__
#define LYRA2REV2_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(HASH_4WAY)
#define LYRA2REV2_4WAY
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_4way_ctx();
#endif
void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_ctx();
#endif
algo/lyra2/lyra2rev2.c
@@ -1,20 +1,12 @@
+#include "lyra2rev2-gate.h"
 #include <memory.h>
-#include "algo-gate-api.h"
 #include "algo/blake/sph_blake.h"
 #include "algo/cubehash/sph_cubehash.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
-#include "lyra2.h"
-#include "avxdefs.h"
+//#include "lyra2.h"

-// This gets allocated when miner_thread starts up and is never freed.
-// It's not a leak because the only way to allocate it again is to exit
-// the thread and that only occurs when the entire program exits.
-__thread uint64_t* l2v2_wholeMatrix;

 typedef struct {
    cubehashParam cube1;

@@ -106,6 +98,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
       if( fulltest(hash, ptarget) )
       {
          pdata[19] = nonce;
+         work_set_target_ratio( work, hash );
          *hashes_done = pdata[19] - first_nonce;
          return 1;
       }

@@ -119,30 +112,3 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
    return 0;
 }

-void lyra2rev2_set_target( struct work* work, double job_diff )
-{
-   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool lyra2rev2_thread_init()
-{
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-   int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
-   l2v2_wholeMatrix = _mm_malloc( i, 64 );
-   return l2v2_wholeMatrix;
-}
-
-bool register_lyra2rev2_algo( algo_gate_t* gate )
-{
-   init_lyra2rev2_ctx();
-   gate->optimizations = AVX_OPT | AVX2_OPT;
-   gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
-   gate->scanhash = (void*)&scanhash_lyra2rev2;
-   gate->hash = (void*)&lyra2rev2_hash;
-   gate->set_target = (void*)&lyra2rev2_set_target;
-   return true;
-};
algo/lyra2/lyra2z-4way.c
@@ -85,8 +85,8 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
       be32enc( noncep2, n+2 );
       be32enc( noncep3, n+3 );
-      be32enc( &edata[19], n );

       lyra2z_4way_hash( hash, vdata );
+      pdata[19] = n;

       if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
       {
algo/lyra2/lyra2z.c
@@ -82,41 +82,3 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
    return 0;
 }

-/*
-//int64_t get_max64_0xffffLL() { return 0xffffLL; };
-
-void lyra2z_set_target( struct work* work, double job_diff )
-{
-   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
-{
-   work->height = sctx->bloc_height;
-   return false;
-}
-
-bool lyra2z_thread_init()
-{
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-   int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
-   lyra2z_wholeMatrix = _mm_malloc( i, 64 );
-   return lyra2z_wholeMatrix;
-}
-
-bool register_lyra2z_algo( algo_gate_t* gate )
-{
-   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-   gate->miner_thread_init = (void*)&lyra2z_thread_init;
-   gate->scanhash = (void*)&scanhash_lyra2z;
-   gate->hash = (void*)&lyra2z_hash;
-   gate->get_max64 = (void*)&get_max64_0xffffLL;
-   gate->set_target = (void*)&lyra2z_set_target;
-//   gate->prevent_dupes = (void*)&zcoin_get_work_height;
-   return true;
-};
-*/
algo/m7m.c
@@ -346,6 +346,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
                hash_str,
                target_str);
          }
+         work_set_target_ratio( work, hash );
          pdata[19] = data[19];
          goto out;
       }
algo/nist5/nist5.c
@@ -132,6 +132,7 @@ int scanhash_nist5(int thr_id, struct work *work,
          if (!(hash64[7] & mask)) {
             printf("[%d]",thr_id);
             if (fulltest(hash64, ptarget)) {
+               work_set_target_ratio( work, hash64 );
                *hashes_done = n - first_nonce + 1;
                return true;
             }
algo/nist5/zr5.c
@@ -172,6 +172,7 @@ int scanhash_zr5( int thr_id, struct work *work,
             pdata[0] = tmpdata[0];
             pdata[19] = nonce;
             *hashes_done = pdata[19] - first_nonce + 1;
+            work_set_target_ratio( work, hash );
             if (opt_debug)
                applog(LOG_INFO, "found nonce %x", nonce);
             return 1;
algo/quark/quark-4way.c (new file, 207 lines)
@@ -0,0 +1,207 @@
#include "cpuminer-config.h"
#include "quark-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
jh512_4way_context jh;
skein512_4way_context skein;
keccak512_4way_context keccak;
} quark_4way_ctx_holder;
quark_4way_ctx_holder quark_4way_ctx __attribute__ ((aligned (64)));
void init_quark_4way_ctx()
{
blake512_4way_init( &quark_4way_ctx.blake );
bmw512_4way_init( &quark_4way_ctx.bmw );
init_groestl( &quark_4way_ctx.groestl, 64 );
skein512_4way_init( &quark_4way_ctx.skein );
jh512_4way_init( &quark_4way_ctx.jh );
keccak512_4way_init( &quark_4way_ctx.keccak );
}
void quark_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
int i;
quark_4way_ctx_holder ctx;
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
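   // Quark picks the next function from bit 3 of the previous hash; both
   // candidates are computed for all 4 lanes and vh_mask blends the right
   // result into each lane.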
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
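   // Data is interleaved 4x64 here, so each lane's qword 9 spans two 32-bit
   // words and the nonce pointers step by 2 (vs a step of 1 for 4x32 data).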
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
quark_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & 0xFFFFFF00 ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
algo/quark/quark-gate.c (new file, 17 lines)
@@ -0,0 +1,17 @@
#include "quark-gate.h"
bool register_quark_algo( algo_gate_t* gate )
{
#if defined (QUARK_4WAY)
init_quark_4way_ctx();
gate->scanhash = (void*)&scanhash_quark_4way;
gate->hash = (void*)&quark_4way_hash;
#else
init_quark_ctx();
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};
algo/quark/quark-gate.h (new file, 32 lines)
@@ -0,0 +1,32 @@
#ifndef QUARK_GATE_H__
#define QUARK_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define QUARK_4WAY
#endif
bool register_quark_algo( algo_gate_t* gate );
#if defined(QUARK_4WAY)
void quark_4way_hash( void *state, const void *input );
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_4way_ctx();
#endif
void quark_hash( void *state, const void *input );
int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_ctx();
#endif
algo/quark/quark.c
@@ -1,5 +1,5 @@
 #include "cpuminer-config.h"
-#include "algo-gate-api.h"
+#include "quark-gate.h"

 #include <stdio.h>
 #include <string.h>

@@ -47,7 +47,7 @@ void init_quark_ctx()
 #endif
 }

-inline static void quarkhash(void *state, const void *input)
+void quark_hash(void *state, const void *input)
 {
    unsigned char hashbuf[128];
    size_t hashptr;

@@ -187,11 +187,12 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
    do {
       pdata[19] = ++n;
       be32enc(&endiandata[19], n);
-      quarkhash(hash64, &endiandata);
+      quark_hash(hash64, &endiandata);
       if ((hash64[7]&0xFFFFFF00)==0)
       {
          if (fulltest(hash64, ptarget))
          {
+            work_set_target_ratio( work, hash64 );
             *hashes_done = n - first_nonce + 1;
             return true;
          }

@@ -203,12 +204,3 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
    return 0;
 }

-bool register_quark_algo( algo_gate_t* gate )
-{
-   init_quark_ctx();
-   gate->optimizations = SSE2_OPT | AES_OPT;
-   gate->scanhash = (void*)&scanhash_quark;
-   gate->hash = (void*)&quarkhash;
-   return true;
-};
algo/qubit/deep.c
@@ -122,6 +122,7 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
          if (!(hash64[7] & mask)) {
             printf("[%d]",thr_id);
             if (fulltest(hash64, ptarget)) {
+               work_set_target_ratio( work, hash64 );
                *hashes_done = n - first_nonce + 1;
                return true;
             }
algo/qubit/qubit.c
@@ -134,6 +134,7 @@ int scanhash_qubit(int thr_id, struct work *work,
          if (!(hash64[7] & mask)) {
             printf("[%d]",thr_id);
             if (fulltest(hash64, ptarget)) {
+               work_set_target_ratio( work, hash64 );
                *hashes_done = n - first_nonce + 1;
                return true;
             }
@@ -754,6 +754,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
       if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) {
          *hashes_done = n - pdata[19] + 1;
          pdata[19] = data[i * 20 + 19];
+         work_set_target_ratio( work, hash );
          return 1;
       }
    }
@@ -114,7 +114,7 @@ available_implementations() {
    return flags;
 }
 #endif

+/*
 static int
 scrypt_test_mix() {
    static const uint8_t expected[16] = {

@@ -145,4 +145,4 @@ scrypt_test_mix() {

    return ret;
 }
+*/
@@ -26,7 +26,7 @@
#include "scrypt-jane-pbkdf2.h" #include "scrypt-jane-pbkdf2.h"
#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ #define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
/*
static int static int
scrypt_test_hash() { scrypt_test_hash() {
scrypt_hash_state st; scrypt_hash_state st;
@@ -45,4 +45,4 @@ scrypt_test_hash() {
scrypt_hash_finish(&st, final); scrypt_hash_finish(&st, final);
return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE); return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
} }
*/
algo/skein/skein-hash-4way.c
@@ -342,17 +342,6 @@ do { \
 do { \
    sph_u64 t0, t1, t2; \
    __m256i h8; \
-/* can LE be assumed? \
-   dec64le does nothing when SPH_LITTLE endian is set, as it is. \
-   __m256i m0 = _mm256_dec64le( buf ); \
-   __m256i m1 = _mm256_dec64le( buf +  8*4 ); \
-   __m256i m2 = _mm256_dec64le( buf + 16*4 ); \
-   __m256i m3 = _mm256_dec64le( buf + 24*4 ); \
-   __m256i m4 = _mm256_dec64le( buf + 32*4 ); \
-   __m256i m5 = _mm256_dec64le( buf + 40*4 ); \
-   __m256i m6 = _mm256_dec64le( buf + 48*4 ); \
-   __m256i m7 = _mm256_dec64le( buf + 56*4 ); \
-*/ \
    __m256i m0 = buf[0]; \
    __m256i m1 = buf[1]; \
    __m256i m2 = buf[2]; \
algo/skein/skein-hash-4way.h
@@ -39,7 +39,9 @@
  */

 #ifndef __SKEIN_HASH_4WAY_H__
-#define __SKEIN_HASH_4WAY_H__
+#define __SKEIN_HASH_4WAY_H__ 1
+
+#ifdef __AVX2__

 #ifdef __cplusplus
 extern "C"{

@@ -53,14 +55,15 @@ extern "C"{
 #define SPH_SIZE_skein256   256
 #define SPH_SIZE_skein512   512

-#ifdef __AVX2__
 typedef struct {
    __m256i buf[8] __attribute__ ((aligned (32)));
    __m256i h0, h1, h2, h3, h4, h5, h6, h7;
    size_t ptr;
    sph_u64 bcount;
-} skein512_4way_context;
+} sph_skein_4way_big_context;
+
+typedef sph_skein_4way_big_context skein512_4way_context;
+typedef sph_skein_4way_big_context skein256_4way_context;

 void skein512_4way_init(void *cc);
 void skein512_4way(void *cc, const void *data, size_t len);

@@ -68,26 +71,15 @@ void skein512_4way_close(void *cc, void *dst);
 //void sph_skein512_addbits_and_close(
 //   void *cc, unsigned ub, unsigned n, void *dst);

-#endif
-
-#ifdef __AVX__
-
-typedef struct {
-   __m128i buf[8] __attribute__ ((aligned (32)));
-   __m128i h0, h1, h2, h3, h4, h5, h6, h7;
-   size_t ptr;
-   sph_u64 bcount;
-} skein256_4way_context;
-
 void skein256_4way_init(void *cc);
 void skein256_4way(void *cc, const void *data, size_t len);
 void skein256_4way_close(void *cc, void *dst);

 //void sph_skein256_addbits_and_close(
 //   void *cc, unsigned ub, unsigned n, void *dst);

-#endif

 #ifdef __cplusplus
 }
 #endif

 #endif
+
+#endif // __AVX2__
algo/sm3/sm3-hash-4way.c (new file, 231 lines)
@@ -0,0 +1,231 @@
/* ====================================================================
* Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the GmSSL Project.
* (http://gmssl.org/)"
*
* 4. The name "GmSSL Project" must not be used to endorse or promote
* products derived from this software without prior written
* permission. For written permission, please contact
* guanzhi1980@gmail.com.
*
* 5. Products derived from this software may not be called "GmSSL"
* nor may "GmSSL" appear in their names without prior written
* permission of the GmSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the GmSSL Project
* (http://gmssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
#include <string.h>
#include "sm3-hash-4way.h"
#ifdef __AVX__
void sm3_4way_init( sm3_4way_ctx_t *ctx )
{
ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
ctx->nblocks = 0;
ctx->num = 0;
}
void sm3_4way( void *cc, const void *data, size_t len )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *block = (__m128i*)ctx->block;
__m128i *vdata = (__m128i*)data;
if ( ctx->num )
{
unsigned int left = SM3_BLOCK_SIZE - ctx->num;
if ( len < left )
{
memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
ctx->num += len;
return;
}
else
{
memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
sm3_4way_compress( ctx->digest, block );
ctx->nblocks++;
vdata += left>>2;
len -= left;
}
}
while ( len >= SM3_BLOCK_SIZE )
{
sm3_4way_compress( ctx->digest, vdata );
ctx->nblocks++;
vdata += SM3_BLOCK_SIZE>>2;
len -= SM3_BLOCK_SIZE;
}
ctx->num = len;
if ( len )
memcpy_128( block, vdata, len>>2 );
}
void sm3_4way_close( void *cc, void *dst )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *hash = (__m128i*)dst;
__m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
__m128i *block = (__m128i*)ctx->block;
int i;
block[ctx->num] = _mm_set1_epi32( 0x80 );
if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
{
memset_zero_128( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
}
else
{
memset_zero_128( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) );
sm3_4way_compress( ctx->digest, block );
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm_byteswap_32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm_byteswap_32( ctx->digest[i] );
}
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \
mm_rotl_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 15 ), \
mm_rotl_32( x, 23 ) ) )
#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
_mm_and_si128( x, z ) ), \
_mm_and_si128( y, z ) )
#define GG0(x,y,z) FF0(x,y,z)
#define GG1(x,y,z) _mm_or_si128( _mm_and_si128( x, y ), \
_mm_andnot_si128( x, z ) )
void sm3_4way_compress( __m128i *digest, __m128i *block )
{
__m128i W[68], W1[64];
__m128i A = digest[ 0 ];
__m128i B = digest[ 1 ];
__m128i C = digest[ 2 ];
__m128i D = digest[ 3 ];
__m128i E = digest[ 4 ];
__m128i F = digest[ 5 ];
__m128i G = digest[ 6 ];
__m128i H = digest[ 7 ];
__m128i SS1, SS2, TT1, TT2, T;
int j;
for ( j = 0; j < 16; j++ )
W[j] = mm_byteswap_32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
W[ j-9 ] ),
mm_rotl_32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( mm_rotl_32( W[ j-13 ], 7 ),
W[ j-6 ] ) );
for( j = 0; j < 64; j++ )
W1[j] = _mm_xor_si128( W[j], W[j+4] );
T = _mm_set1_epi32( 0x79CC4519UL );
for( j =0; j < 16; j++ )
{
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
mm_rotl_32( T, j ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rotl_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rotl_32( F, 19 );
F = E;
E = P0( TT2 );
}
T = _mm_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
mm_rotl_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rotl_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rotl_32( F, 19 );
F = E;
E = P0( TT2 );
}
digest[0] = _mm_xor_si128( digest[0], A );
digest[1] = _mm_xor_si128( digest[1], B );
digest[2] = _mm_xor_si128( digest[2], C );
digest[3] = _mm_xor_si128( digest[3], D );
digest[4] = _mm_xor_si128( digest[4], E );
digest[5] = _mm_xor_si128( digest[5], F );
digest[6] = _mm_xor_si128( digest[6], G );
digest[7] = _mm_xor_si128( digest[7], H );
}
#endif
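For reference, this is roughly how the 4-way SM3 API above is driven; a minimal usage sketch (hypothetical, not part of the commit), assuming the mm_interleave_4x32/mm_deinterleave_4x32 helpers from avxdefs.h used elsewhere in this commit:

#include <stdint.h>
#include "sm3-hash-4way.h"

// Hash four independent 64-byte messages in parallel, one per 32-bit lane.
void sm3_4way_example( const uint32_t msg[4][16], uint32_t digest[4][8] )
{
   uint32_t vmsg [16*4] __attribute__ ((aligned (64)));  // interleaved input
   uint32_t vhash[ 8*4] __attribute__ ((aligned (64)));  // interleaved output
   sm3_4way_ctx_t ctx;

   // lane-interleave the four messages: vmsg[4*i + lane] = msg[lane][i]
   mm_interleave_4x32( vmsg, msg[0], msg[1], msg[2], msg[3], 512 );
   sm3_4way_init( &ctx );
   sm3_4way( &ctx, vmsg, 64 );         // 64 bytes per lane
   sm3_4way_close( &ctx, vhash );
   // split the interleaved 256-bit digests back into per-lane buffers
   mm_deinterleave_4x32( digest[0], digest[1], digest[2], digest[3],
                         vhash, 256 );
}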
algo/sm3/sm3-hash-4way.h (new file, 89 lines)
@@ -0,0 +1,89 @@
/* ====================================================================
* Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the GmSSL Project.
* (http://gmssl.org/)"
*
* 4. The name "GmSSL Project" must not be used to endorse or promote
* products derived from this software without prior written
* permission. For written permission, please contact
* guanzhi1980@gmail.com.
*
* 5. Products derived from this software may not be called "GmSSL"
* nor may "GmSSL" appear in their names without prior written
* permission of the GmSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the GmSSL Project
* (http://gmssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
#ifndef SPH_SM3_HASH_4WAY_H
#define SPH_SM3_HASH_4WAY_H
#define SM3_DIGEST_LENGTH 32
#define SM3_BLOCK_SIZE 64
#define SM3_CBLOCK (SM3_BLOCK_SIZE)
#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH)
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "avxdefs.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
__m128i block[16] __attribute__ ((aligned (64)));
__m128i digest[8];
uint32_t nblocks;
uint32_t num;
} sm3_4way_ctx_t;
void sm3_4way_init( sm3_4way_ctx_t *ctx );
//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data,
// size_t data_len );
//void sm3_4way_final( sm3_4way_ctx_t *ctx,
// unsigned char digest[SM3_DIGEST_LENGTH] );
void sm3_4way_compress( __m128i *digest, __m128i *block );
void sm3_4way(void *cc, const void *data, size_t len);
void sm3_4way_close(void *cc, void *dst);
#ifdef __cplusplus
}
#endif
#endif
algo/sm3/sm3.c
@@ -189,7 +189,7 @@ void sm3_compress(uint32_t digest[8], const unsigned char block[64])
    for(j =16; j < 64; j++) {
       T[j] = 0x7A879D8A;
-      SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7);
+      SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j&31)), 7);
       SS2 = SS1 ^ ROTATELEFT(A,12);
       TT1 = FF1(A,B,C) + D + SS2 + W1[j];
       TT2 = GG1(E,F,G) + H + SS1 + W[j];
algo/whirlpool/sph_whirlpool.c
@@ -3468,9 +3468,10 @@ sph_ ## name ## _close(void *cc, void *dst) \
for (i = 0; i < 8; i ++) \ for (i = 0; i < 8; i ++) \
sph_enc64le((unsigned char *)dst + 8 * i, sc->state[i]); \ sph_enc64le((unsigned char *)dst + 8 * i, sc->state[i]); \
} }
// sph_ ## name ## _init(cc); \ /*
//} sph_ ## name ## _init(cc); \
}
*/
MAKE_CLOSE(whirlpool) MAKE_CLOSE(whirlpool)
MAKE_CLOSE(whirlpool0) MAKE_CLOSE(whirlpool0)
MAKE_CLOSE(whirlpool1) MAKE_CLOSE(whirlpool1)
algo/whirlpool/whirlpool-gate.h
@@ -22,6 +22,7 @@ void whirlpool_hash( void *state, const void *input );
 int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done );
+void init_whirlpool_ctx();

 #endif
 #endif
algo/whirlpool/whirlpool-hash-4way.c
@@ -3345,8 +3345,10 @@ do { \
 #define READ_STATE     MUL8(READ_STATE_W)
 #define ROUND0         MUL8(ROUND0_W)
 #define UPDATE_STATE   MUL8(UPDATE_STATE_W)

-//#define BYTE(x, n) \
-//   _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
+/*
+#define BYTE(x, n) \
+   _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
+*/

 #define BYTE(x, n)   ((unsigned)((x) >> (8 * (n))) & 0xFF)
algo/x11/c11.c
@@ -162,6 +162,7 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
       {
          pdata[19] = nonce;
          *hashes_done = pdata[19] - first_nonce;
+         work_set_target_ratio( work, hash );
          return 1;
       }
       nonce++;
algo/x11/timetravel-4way.c (new file, 274 lines)
@@ -0,0 +1,274 @@
#include "timetravel-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
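// The permutation of the 8 hash functions is derived from the block
// timestamp (see scanhash below) and cached per thread until ntime changes.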
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
} tt8_4way_ctx_holder;
tt8_4way_ctx_holder tt8_4way_ctx __attribute__ ((aligned (64)));
void init_tt8_4way_ctx()
{
blake512_4way_init( &tt8_4way_ctx.blake );
bmw512_4way_init( &tt8_4way_ctx.bmw );
init_groestl( &tt8_4way_ctx.groestl, 64 );
skein512_4way_init( &tt8_4way_ctx.skein );
jh512_4way_init( &tt8_4way_ctx.jh );
keccak512_4way_init( &tt8_4way_ctx.keccak );
init_luffa( &tt8_4way_ctx.luffa, 512 );
cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
};
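// The hash ping-pongs between two interleaved buffers, vhashX and vhashY.
// Functions without 4-way implementations (groestl, luffa, cubehash)
// deinterleave to per-lane buffers, hash each lane serially, then
// reinterleave, except on the final round where the per-lane output is kept.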
void timetravel_4way_hash(void *output, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
uint64_t *vhashA, *vhashB;
tt8_4way_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
int i;
memcpy( &ctx, &tt8_4way_ctx, sizeof(tt8_4way_ctx) );
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
{
if (i == 0)
{
dataLen = 80;
vhashA = (uint64_t*)input;
vhashB = vhashX;
}
else
{
dataLen = 64;
if ( i % 2 == 0 )
{
vhashA = vhashY;
vhashB = vhashX;
}
else
{
vhashA = vhashX;
vhashB = vhashY;
}
}
switch ( permutation[i] )
{
case 0:
blake512_4way( &ctx.blake, vhashA, dataLen );
blake512_4way_close( &ctx.blake, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhashA, dataLen );
bmw512_4way_close( &ctx.bmw, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 2:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, dataLen<<3 );
if ( i != 7 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhashA, dataLen );
jh512_4way_close( &ctx.jh, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhashA, dataLen );
keccak512_4way_close( &ctx.keccak, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence *)hash0, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
if ( i != 7 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 7:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)hash0, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*)hash1, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*)hash2, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*)hash3, dataLen );
if ( i != 7 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
default:
applog(LOG_ERR,"SWERR: timetravel invalid permutation");
break;
}
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
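// The nonce is data word 19, i.e. the high half of 64-bit word 9. After
// 4x64 interleaving, row 9 holds word 9 of all four lanes, so lane N's
// nonce sits at 32-bit index 9*8 + 2*N + 1.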
const uint32_t Htarg = ptarget[7];
volatile uint8_t *restart = &(work_restart[thr_id].restart);
int i;
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
% TT8_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
s_ntime = timestamp;
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
timetravel_4way_hash( hash, vdata );
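// hash now holds four candidate results, one per lane, 8 words apart.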
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x11/timetravel-gate.c Normal file

@@ -0,0 +1,78 @@
#include "timetravel-gate.h"
void tt8_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL_4WAY
init_tt8_4way_ctx();
gate->scanhash = (void*)&scanhash_timetravel_4way;
gate->hash = (void*)&timetravel_4way_hash;
#else
init_tt8_ctx();
gate->scanhash = (void*)&scanhash_timetravel;
gate->hash = (void*)&timetravel_hash;
#endif
gate->set_target = (void*)&tt8_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};
inline void tt_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt_swap( pbegin, pend );
pbegin++;
}
}
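// Lexicographic next-permutation (the same algorithm as C++
// std::next_permutation), used to step through the function orderings.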
void tt8_next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}

algo/x11/timetravel-gate.h Normal file

@@ -0,0 +1,40 @@
#ifndef TIMETRAVEL_GATE_H__
#define TIMETRAVEL_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define TIMETRAVEL_4WAY
#endif
// Machinecoin Genesis Timestamp
#define TT8_FUNC_BASE_TIMESTAMP 1389040865
#define TT8_FUNC_COUNT 8
#define TT8_FUNC_COUNT_PERMUTATIONS 40320
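// 40320 = 8!, the number of possible orderings of the 8 hash functions.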
void tt8_next_permutation( int *pbegin, int *pend );
bool register_timetravel_algo( algo_gate_t* gate );
#if defined(TIMETRAVEL_4WAY)
void timetravel_4way_hash( void *state, const void *input );
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt8_4way_ctx();
#endif
void timetravel_hash( void *state, const void *input );
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt8_ctx();
#endif

algo/x11/timetravel.c

@@ -1,11 +1,9 @@
#include "algo-gate-api.h" #include "timetravel-gate.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/sph_blake.h" #include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h" #include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h" #include "algo/jh/sph_jh.h"
@@ -13,75 +11,14 @@
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#ifdef NO_AES_NI #ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h" #include "algo/groestl/sph_groestl.h"
#else #else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#endif #endif
// Machinecoin Genesis Timestamp
#define HASH_FUNC_BASE_TIMESTAMP 1389040865
#define HASH_FUNC_COUNT 8
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
static __thread uint32_t s_ntime = UINT32_MAX; static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[HASH_FUNC_COUNT] = { 0 }; static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
inline void tt_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt_swap( pbegin, pend );
pbegin++;
}
}
static void next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}
typedef struct { typedef struct {
sph_blake512_context blake; sph_blake512_context blake;
@@ -101,7 +38,7 @@ typedef struct {
tt_ctx_holder tt_ctx __attribute__ ((aligned (64))); tt_ctx_holder tt_ctx __attribute__ ((aligned (64)));
__thread tt_ctx_holder tt_mid __attribute__ ((aligned (64))); __thread tt_ctx_holder tt_mid __attribute__ ((aligned (64)));
void init_tt_ctx() void init_tt8_ctx()
{ {
sph_blake512_init( &tt_ctx.blake ); sph_blake512_init( &tt_ctx.blake );
sph_bmw512_init( &tt_ctx.bmw ); sph_bmw512_init( &tt_ctx.bmw );
@@ -119,7 +56,7 @@ void init_tt_ctx()
void timetravel_hash(void *output, const void *input) void timetravel_hash(void *output, const void *input)
{ {
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64))); uint32_t hash[ 16 * TT8_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t *hashA, *hashB; uint32_t *hashA, *hashB;
tt_ctx_holder ctx __attribute__ ((aligned (64))); tt_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64; uint32_t dataLen = 64;
@@ -130,7 +67,7 @@ void timetravel_hash(void *output, const void *input)
memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) ); memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) );
for ( i = 0; i < HASH_FUNC_COUNT; i++ ) for ( i = 0; i < TT8_FUNC_COUNT; i++ )
{ {
if (i == 0) if (i == 0)
{ {
@@ -270,7 +207,7 @@ void timetravel_hash(void *output, const void *input)
} }
} }
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32); memcpy(output, &hash[16 * (TT8_FUNC_COUNT - 1)], 32);
} }
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
@@ -296,12 +233,12 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t timestamp = endiandata[17]; const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime ) if ( timestamp != s_ntime )
{ {
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP ) const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS; % TT8_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ ) for ( i = 0; i < TT8_FUNC_COUNT; i++ )
permutation[i] = i; permutation[i] = i;
for ( i = 0; i < steps; i++ ) for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT ); tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
s_ntime = timestamp; s_ntime = timestamp;
// do midstate precalc for first function // do midstate precalc for first function
@@ -359,6 +296,7 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
work_set_target_ratio( work, hash ); work_set_target_ratio( work, hash );
pdata[19] = nonce; pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce; *hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1; return 1;
} }
nonce++; nonce++;
@@ -370,19 +308,4 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
return 0; return 0;
} }
void timetravel_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_tt_ctx();
gate->scanhash = (void*)&scanhash_timetravel;
gate->hash = (void*)&timetravel_hash;
gate->set_target = (void*)&timetravel_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

algo/x11/timetravel10-4way.c Normal file

@@ -0,0 +1,316 @@
#include "timetravel10-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
} tt10_4way_ctx_holder;
tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));
void init_tt10_4way_ctx()
{
blake512_4way_init( &tt10_4way_ctx.blake );
bmw512_4way_init( &tt10_4way_ctx.bmw );
init_groestl( &tt10_4way_ctx.groestl, 64 );
skein512_4way_init( &tt10_4way_ctx.skein );
jh512_4way_init( &tt10_4way_ctx.jh );
keccak512_4way_init( &tt10_4way_ctx.keccak );
init_luffa( &tt10_4way_ctx.luffa, 512 );
cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_4way_ctx.shavite );
init_sd( &tt10_4way_ctx.simd, 512 );
};
void timetravel10_4way_hash(void *output, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
uint64_t *vhashA, *vhashB;
tt10_4way_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
int i;
memcpy( &ctx, &tt10_4way_ctx, sizeof(tt10_4way_ctx) );
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
{
if (i == 0)
{
dataLen = 80;
vhashA = (uint64_t*)input;
vhashB = vhashX;
}
else
{
dataLen = 64;
if ( i % 2 == 0 )
{
vhashA = vhashY;
vhashB = vhashX;
}
else
{
vhashA = vhashX;
vhashB = vhashY;
}
}
switch ( permutation[i] )
{
case 0:
blake512_4way( &ctx.blake, vhashA, dataLen );
blake512_4way_close( &ctx.blake, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhashA, dataLen );
bmw512_4way_close( &ctx.bmw, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 2:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, dataLen<<3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhashA, dataLen );
jh512_4way_close( &ctx.jh, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhashA, dataLen );
keccak512_4way_close( &ctx.keccak, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence *)hash0, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 7:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)hash0, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*)hash1, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*)hash2, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*)hash3, dataLen );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 8:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
default:
applog(LOG_ERR,"SWERR: timetravel10 invalid permutation");
break;
}
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_timetravel10_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
volatile uint8_t *restart = &(work_restart[thr_id].restart);
int i;
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
% TT10_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
s_ntime = timestamp;
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
timetravel10_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x11/timetravel10-gate.c Normal file

@@ -0,0 +1,78 @@
#include "timetravel10-gate.h"
void tt10_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel10_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL10_4WAY
init_tt10_4way_ctx();
gate->scanhash = (void*)&scanhash_timetravel10_4way;
gate->hash = (void*)&timetravel10_4way_hash;
#else
init_tt10_ctx();
gate->scanhash = (void*)&scanhash_timetravel10;
gate->hash = (void*)&timetravel10_hash;
#endif
gate->set_target = (void*)&tt10_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};
inline void tt10_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt10_swap( pbegin, pend );
pbegin++;
}
}
void tt10_next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt10_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}

algo/x11/timetravel10-gate.h Normal file

@@ -0,0 +1,39 @@
#ifndef TIMETRAVEL10_GATE_H__
#define TIMETRAVEL10_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define TIMETRAVEL10_4WAY
#endif
// BitCore Genesis Timestamp
#define TT10_FUNC_BASE_TIMESTAMP 1492973331U
#define TT10_FUNC_COUNT 10
#define TT10_FUNC_COUNT_PERMUTATIONS 40320
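// Note: kept at 40320 (8!) to match the original implementation, even
// though TT10_FUNC_COUNT is 10.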
void tt10_next_permutation( int *pbegin, int *pend );
bool register_timetravel10_algo( algo_gate_t* gate );
#if defined(TIMETRAVEL10_4WAY)
void timetravel10_4way_hash( void *state, const void *input );
int scanhash_timetravel10_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done );
void init_tt10_4way_ctx();
#endif
void timetravel10_hash( void *state, const void *input );
int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt10_ctx();
#endif

algo/x11/timetravel10.c

@@ -1,11 +1,8 @@
#include "algo-gate-api.h" #include "timetravel10-gate.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/sph_blake.h" #include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h" #include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h" #include "algo/jh/sph_jh.h"
@@ -22,68 +19,8 @@
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#endif #endif
// BitCore Genesis Timestamp
#define HASH_FUNC_BASE_TIMESTAMP 1492973331U
#define HASH_FUNC_COUNT 10
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
static __thread uint32_t s_ntime = UINT32_MAX; static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[HASH_FUNC_COUNT] = { 0 }; static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
inline void tt10_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt10_swap( pbegin, pend );
pbegin++;
}
}
static void next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt10_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}
typedef struct { typedef struct {
sph_blake512_context blake; sph_blake512_context blake;
@@ -125,7 +62,7 @@ void init_tt10_ctx()
void timetravel10_hash(void *output, const void *input) void timetravel10_hash(void *output, const void *input)
{ {
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64))); uint32_t hash[ 16 * TT10_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t *hashA, *hashB; uint32_t *hashA, *hashB;
tt10_ctx_holder ctx __attribute__ ((aligned (64))); tt10_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64; uint32_t dataLen = 64;
@@ -136,7 +73,7 @@ void timetravel10_hash(void *output, const void *input)
memcpy( &ctx, &tt10_ctx, sizeof(tt10_ctx) ); memcpy( &ctx, &tt10_ctx, sizeof(tt10_ctx) );
for ( i = 0; i < HASH_FUNC_COUNT; i++ ) for ( i = 0; i < TT10_FUNC_COUNT; i++ )
{ {
if (i == 0) if (i == 0)
{ {
@@ -302,7 +239,7 @@ void timetravel10_hash(void *output, const void *input)
} }
} }
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32); memcpy(output, &hash[16 * (TT10_FUNC_COUNT - 1)], 32);
} }
int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
@@ -328,12 +265,12 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t timestamp = endiandata[17]; const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime ) if ( timestamp != s_ntime )
{ {
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP ) const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS; % TT10_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ ) for ( i = 0; i < TT10_FUNC_COUNT; i++ )
permutation[i] = i; permutation[i] = i;
for ( i = 0; i < steps; i++ ) for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT ); tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
s_ntime = timestamp; s_ntime = timestamp;
// do midstate precalc for first function // do midstate precalc for first function
@@ -398,6 +335,7 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
{ {
work_set_target_ratio( work, hash ); work_set_target_ratio( work, hash );
pdata[19] = nonce; pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce; *hashes_done = pdata[19] - first_nonce;
return 1; return 1;
} }
@@ -409,20 +347,3 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1; *hashes_done = pdata[19] - first_nonce + 1;
return 0; return 0;
} }
void timetravel10_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel10_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_tt10_ctx();
gate->scanhash = (void*)&scanhash_timetravel10;
gate->hash = (void*)&timetravel10_hash;
gate->set_target = (void*)&timetravel10_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

algo/x11/x11.c

@@ -179,6 +179,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
       if ( fulltest( hash64, ptarget ) )
       {
          *hashes_done = n - first_nonce + 1;
+         work_set_target_ratio( work, hash64 );
          return true;
       }
    }
@@ -189,14 +190,3 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
    pdata[19] = n;
    return 0;
 }
-/*
-bool register_x11_algo( algo_gate_t* gate )
-{
-   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-   init_x11_ctx();
-   gate->scanhash = (void*)&scanhash_x11;
-   gate->hash = (void*)&x11_hash;
-   gate->get_max64 = (void*)&get_max64_0x3ffff;
-   return true;
-};
-*/

algo/x11/x11evo-4way.c Normal file

@@ -0,0 +1,340 @@
#include "cpuminer-config.h"
#include "x11evo-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <string.h>
#include <stdint.h>
#include <compat/portable_endian.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11evo_4way_ctx_holder;
static x11evo_4way_ctx_holder x11evo_4way_ctx __attribute__ ((aligned (64)));
void init_x11evo_4way_ctx()
{
blake512_4way_init( &x11evo_4way_ctx.blake );
bmw512_4way_init( &x11evo_4way_ctx.bmw );
init_groestl( &x11evo_4way_ctx.groestl, 64 );
skein512_4way_init( &x11evo_4way_ctx.skein );
jh512_4way_init( &x11evo_4way_ctx.jh );
keccak512_4way_init( &x11evo_4way_ctx.keccak );
init_luffa( &x11evo_4way_ctx.luffa, 512 );
cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11evo_4way_ctx.shavite );
init_sd( &x11evo_4way_ctx.simd, 512 );
init_echo( &x11evo_4way_ctx.echo, 512 );
}
static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX;
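// hashOrder holds the current function sequence as a string of hex digits
// ('0'-'9', 'A' = 10), recomputed whenever the day sequence advances.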
void x11evo_4way_hash( void *state, const void *input )
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
x11evo_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &x11evo_4way_ctx, sizeof(x11evo_4way_ctx) );
if ( s_seq == -1 )
{
uint32_t *data = (uint32_t*) input;
const uint32_t ntime = data[17];
evo_twisted_code( ntime, hashOrder );
}
int i;
int len = strlen( hashOrder );
for ( i = 0; i < len; i++ )
{
char elem = hashOrder[i];
uint8_t idx;
if ( elem >= 'A' )
idx = elem - 'A' + 10;
else
idx = elem - '0';
// int size = 64;
switch ( idx )
{
case 0:
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 2:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 7:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*) hash3, 64 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 8:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 10:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
}
}
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
//static const uint32_t diff1targ = 0x0000ffff;
int scanhash_x11evo_4way( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
swab32_array( endiandata, pdata, 20 );
int ntime = endiandata[17];
if ( ntime != s_ntime || s_seq == -1 )
{
evo_twisted_code( ntime, hashOrder );
s_ntime = ntime;
}
uint32_t hmask = 0xFFFFFFFF;
if ( Htarg > 0 )
{
if ( Htarg <= 0xF )
hmask = 0xFFFFFFF0;
else if ( Htarg <= 0xFF )
hmask = 0xFFFFFF00;
else if ( Htarg <= 0xFFF )
hmask = 0xFFFFF000;
else if ( Htarg <= 0xFFFF )
hmask = 0xFFFF0000;
}
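// hmask is a cheap pre-filter: only hashes whose masked high word is zero
// are run through the full target comparison below.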
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11evo_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & hmask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & hmask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & hmask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & hmask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x11/x11evo-gate.c Normal file

@@ -0,0 +1,95 @@
#include "x11evo-gate.h"
int s_seq = -1;
static inline int getCurrentAlgoSeq( uint32_t current_time )
{
// change once per day
return (int) (current_time - X11EVO_INITIAL_DATE) / (60 * 60 * 24);
}
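// Example: an ntime 10 days after X11EVO_INITIAL_DATE gives seq 10, so the
// function order is advanced 10 permutations from the identity order.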
// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
{
uint8_t __tmp = *a;
*a = *b;
*b = __tmp;
}
void initPerm( uint8_t n[], uint8_t count )
{
int i;
for ( i = 0; i<count; i++ )
n[i] = i;
}
int nextPerm( uint8_t n[], uint32_t count )
{
uint32_t tail = 0, i = 0, j = 0;
if (unlikely( count <= 1 ))
return 0;
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
tail = i;
if ( tail > 0 )
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
evo_swap( &n[tail - 1], &n[j] );
for ( i = tail, j = count - 1; i<j; i++, j-- )
evo_swap( &n[i], &n[j] );
return ( tail != 0 );
}
void getAlgoString( char *str, uint32_t count )
{
uint8_t algoList[X11EVO_FUNC_COUNT];
char *sptr;
int j;
int k;
initPerm( algoList, X11EVO_FUNC_COUNT );
for ( k = 0; k < count; k++ )
nextPerm( algoList, X11EVO_FUNC_COUNT );
sptr = str;
for ( j = 0; j < X11EVO_FUNC_COUNT; j++ )
{
if ( algoList[j] >= 10 )
sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
else
sprintf( sptr, "%u", algoList[j] );
sptr++;
}
*sptr = 0;
//applog(LOG_DEBUG, "nextPerm %s", str);
}
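// Example: count 0 leaves the identity order and produces "0123456789A";
// each increment advances one lexicographic permutation.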
void evo_twisted_code( uint32_t ntime, char *permstr )
{
int seq = getCurrentAlgoSeq( ntime );
if ( s_seq != seq )
{
getAlgoString( permstr, seq );
s_seq = seq;
}
}
bool register_x11evo_algo( algo_gate_t* gate )
{
#if defined (X11EVO_4WAY)
init_x11evo_4way_ctx();
gate->scanhash = (void*)&scanhash_x11evo_4way;
gate->hash = (void*)&x11evo_4way_hash;
#else
init_x11evo_ctx();
gate->scanhash = (void*)&scanhash_x11evo;
gate->hash = (void*)&x11evo_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

algo/x11/x11evo-gate.h Normal file

@@ -0,0 +1,39 @@
#ifndef X11EVO_GATE_H__
#define X11EVO_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X11EVO_4WAY
#endif
#define X11EVO_INITIAL_DATE 1462060800
#define X11EVO_FUNC_COUNT 11
extern int s_seq;
bool register_x11evo_algo( algo_gate_t* gate );
#if defined(X11EVO_4WAY)
void x11evo_4way_hash( void *state, const void *input );
int scanhash_x11evo_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11evo_4way_ctx();
#endif
void x11evo_hash( void *state, const void *input );
int scanhash_x11evo( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11evo_ctx();
void evo_twisted_code( uint32_t ntime, char *permstr );
#endif

algo/x11/x11evo.c

@@ -1,5 +1,5 @@
#include "cpuminer-config.h" #include "cpuminer-config.h"
#include "algo-gate-api.h" #include "x11evo-gate.h"
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
@@ -26,9 +26,6 @@
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/sse2/nist.h"
#define INITIAL_DATE 1462060800
#define HASH_FUNC_COUNT 11
typedef struct { typedef struct {
#ifdef NO_AES_NI #ifdef NO_AES_NI
sph_groestl512_context groestl; sph_groestl512_context groestl;
@@ -70,94 +67,10 @@ void init_x11evo_ctx()
sph_shavite512_init( &x11evo_ctx.shavite ); sph_shavite512_init( &x11evo_ctx.shavite );
} }
/* static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
uint32_t getCurrentAlgoSeq(uint32_t current_time, uint32_t base_time)
{
return (current_time - base_time) / (60 * 60 * 24);
}
*/
static inline int getCurrentAlgoSeq( uint32_t current_time )
{
// change once per day
return (int) (current_time - INITIAL_DATE) / (60 * 60 * 24);
}
// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
{
uint8_t __tmp = *a;
*a = *b;
*b = __tmp;
}
void initPerm( uint8_t n[], uint8_t count )
{
int i;
for ( i = 0; i<count; i++ )
n[i] = i;
}
int nextPerm( uint8_t n[], uint32_t count )
{
uint32_t tail = 0, i = 0, j = 0;
if (unlikely( count <= 1 ))
return 0;
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
tail = i;
if ( tail > 0 )
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
evo_swap( &n[tail - 1], &n[j] );
for ( i = tail, j = count - 1; i<j; i++, j-- )
evo_swap( &n[i], &n[j] );
return ( tail != 0 );
}
void getAlgoString( char *str, uint32_t count )
{
uint8_t algoList[HASH_FUNC_COUNT];
char *sptr;
int j;
int k;
initPerm( algoList, HASH_FUNC_COUNT );
for ( k = 0; k < count; k++ )
nextPerm( algoList, HASH_FUNC_COUNT );
sptr = str;
for ( j = 0; j < HASH_FUNC_COUNT; j++ )
{
if ( algoList[j] >= 10 )
sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
else
sprintf( sptr, "%u", algoList[j] );
sptr++;
}
*sptr = 0;
//applog(LOG_DEBUG, "nextPerm %s", str);
}
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX; static __thread uint32_t s_ntime = UINT32_MAX;
static int s_seq = -1;
static void evo_twisted_code(uint32_t ntime, char *permstr) void x11evo_hash( void *state, const void *input )
{
int seq = getCurrentAlgoSeq(ntime);
if (s_seq != seq)
{
getAlgoString(permstr, seq);
s_seq = seq;
}
}
static inline void x11evo_hash( void *state, const void *input )
{ {
uint32_t hash[16] __attribute__ ((aligned (64))); uint32_t hash[16] __attribute__ ((aligned (64)));
x11evo_ctx_holder ctx __attribute__ ((aligned (64))); x11evo_ctx_holder ctx __attribute__ ((aligned (64)));
@@ -242,10 +155,10 @@ static inline void x11evo_hash( void *state, const void *input )
memcpy( state, hash, 32 ); memcpy( state, hash, 32 );
} }
static const uint32_t diff1targ = 0x0000ffff; //static const uint32_t diff1targ = 0x0000ffff;
int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce, int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
unsigned long *hashes_done ) uint64_t *hashes_done )
{ {
uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64))); uint32_t hash64[8] __attribute__((aligned(64)));
@@ -281,12 +194,13 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
{ {
pdata[19] = ++n; pdata[19] = ++n;
be32enc( &endiandata[19], n ); be32enc( &endiandata[19], n );
x11evo_hash( hash64, &endiandata ); x11evo_hash( hash64, endiandata );
if ( ( hash64[7] & hmask ) == 0 ) if ( ( hash64[7] & hmask ) == 0 )
{ {
if ( fulltest( hash64, ptarget ) ) if ( fulltest( hash64, ptarget ) )
{ {
*hashes_done = n - first_nonce + 1; *hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true; return true;
} }
} }
@@ -296,13 +210,3 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
pdata[19] = n; pdata[19] = n;
return 0; return 0;
} }
bool register_x11evo_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_x11evo;
gate->hash = (void*)&x11evo_hash;
init_x11evo_ctx();
return true;
};

algo/x11/x11gost.c

@@ -161,6 +161,7 @@ int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
 if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
    pdata[19] = nonce;
    *hashes_done = pdata[19] - first_nonce;
+   work_set_target_ratio( work, hash );
    return 1;
 }
 nonce++;

phi1612.c

@@ -116,6 +116,7 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
 if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
    pdata[19] = nonce;
+   work_set_target_ratio( work, hash );
    *hashes_done = pdata[19] - first_nonce;
    return 1;
 }

skunk.c

@@ -70,6 +70,7 @@ int scanhash_skunk( int thr_id, struct work *work, uint32_t max_nonce,
 {
    pdata[19] = nonce;
    *hashes_done = pdata[19] - first_nonce;
+   work_set_target_ratio( work, hash );
    return 1;
 }
 nonce++;

algo/x13/x13.c

@@ -234,6 +234,7 @@ int scanhash_x13(int thr_id, struct work *work, uint32_t max_nonce,
 if (!(hash64[7] & mask)) {
    printf("[%d]",thr_id);
    if (fulltest(hash64, ptarget)) {
+      work_set_target_ratio( work, hash );
       *hashes_done = n - first_nonce + 1;
       return true;
    }

x13sm3-4way.c

@@ -17,7 +17,7 @@
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/sm3/sph_sm3.h" #include "algo/sm3/sm3-hash-4way.h"
#include "algo/hamsi/sph_hamsi.h" #include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -33,7 +33,7 @@ typedef struct {
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; hashState_sd simd;
hashState_echo echo; hashState_echo echo;
sm3_ctx_t sm3; sm3_4way_ctx_t sm3;
sph_hamsi512_context hamsi; sph_hamsi512_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
} x13sm3_4way_ctx_holder; } x13sm3_4way_ctx_holder;
@@ -54,7 +54,7 @@ void init_x13sm3_4way_ctx()
sph_shavite512_init( &x13sm3_4way_ctx.shavite ); sph_shavite512_init( &x13sm3_4way_ctx.shavite );
init_sd( &x13sm3_4way_ctx.simd, 512 ); init_sd( &x13sm3_4way_ctx.simd, 512 );
init_echo( &x13sm3_4way_ctx.echo, 512 ); init_echo( &x13sm3_4way_ctx.echo, 512 );
sm3_init( &x13sm3_4way_ctx.sm3 ); sm3_4way_init( &x13sm3_4way_ctx.sm3 );
sph_hamsi512_init( &x13sm3_4way_ctx.hamsi ); sph_hamsi512_init( &x13sm3_4way_ctx.hamsi );
sph_fugue512_init( &x13sm3_4way_ctx.fugue ); sph_fugue512_init( &x13sm3_4way_ctx.fugue );
}; };
@@ -85,14 +85,11 @@ void x13sm3_4way_hash( void *state, const void *input )
// Groestl // Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, reinit_groestl( &ctx.groestl );
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, reinit_groestl( &ctx.groestl );
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, reinit_groestl( &ctx.groestl );
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way // Parallel 4way
@@ -178,6 +175,8 @@ void x13sm3_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
// SM3 // SM3
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
memset( sm3_vhash, 0, sizeof sm3_vhash );
uint32_t sm3_hash0[32] __attribute__ ((aligned (32))); uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
memset( sm3_hash0, 0, sizeof sm3_hash0 ); memset( sm3_hash0, 0, sizeof sm3_hash0 );
uint32_t sm3_hash1[32] __attribute__ ((aligned (32))); uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
@@ -187,17 +186,11 @@ void x13sm3_4way_hash( void *state, const void *input )
uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 ); memset( sm3_hash3, 0, sizeof sm3_hash3 );
sph_sm3( &ctx.sm3, hash0, 64 ); mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sph_sm3_close( &ctx.sm3, sm3_hash0 ); sm3_4way( &ctx.sm3, vhash, 64 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) ); sm3_4way_close( &ctx.sm3, sm3_vhash );
sph_sm3( &ctx.sm3, hash1, 64 ); mm_deinterleave_4x32( sm3_hash0, sm3_hash1, sm3_hash2, sm3_hash3,
sph_sm3_close( &ctx.sm3, sm3_hash1 ); sm3_vhash, 1024 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash2, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash2 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash3, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash3 );
// Hamsi // Hamsi
sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 ); sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 );

x13sm3.c

@@ -224,6 +224,7 @@ int scanhash_x13sm3( int thr_id, struct work *work,
 if (!(hash64[7] & mask)) {
    printf("[%d]",thr_id);
    if (fulltest(hash64, ptarget)) {
+      work_set_target_ratio( work, hash64 );
       *hashes_done = n - first_nonce + 1;
       return true;
    }

axiom.c

@@ -65,6 +65,7 @@ int scanhash_axiom(int thr_id, struct work *work,
 if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
    *hashes_done = n - first_nonce + 1;
    pdata[19] = n;
+   work_set_target_ratio( work, hash64 );
    return true;
 }
 n++;

polytimos-gate.h

@@ -4,7 +4,7 @@
#include "algo-gate-api.h" #include "algo-gate-api.h"
#include <stdint.h> #include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__) #if defined(__AVX2__) && defined(__AES__)
#define POLYTIMOS_4WAY #define POLYTIMOS_4WAY
#endif #endif

x14.c

@@ -233,6 +233,7 @@ int scanhash_x14(int thr_id, struct work *work,
 if (!(hash64[7] & mask)) {
    printf("[%d]",thr_id);
    if (fulltest(hash64, ptarget)) {
+      work_set_target_ratio( work, hash64 );
       *hashes_done = n - first_nonce + 1;
       return true;
    }

x15-4way.c

@@ -47,7 +47,6 @@ void init_x15_4way_ctx()
 {
    blake512_4way_init( &x15_4way_ctx.blake );
    bmw512_4way_init( &x15_4way_ctx.bmw );
-   sph_bmw512_init( &x15_4way_ctx.bmw );
    init_groestl( &x15_4way_ctx.groestl, 64 );
    skein512_4way_init( &x15_4way_ctx.skein );
    jh512_4way_init( &x15_4way_ctx.jh );

x15.c

@@ -245,6 +245,7 @@ int scanhash_x15(int thr_id, struct work *work,
 if (!(hash64[7] & mask)) {
    printf("[%d]",thr_id);
    if (fulltest(hash64, ptarget)) {
+      work_set_target_ratio( work, hash64 );
       *hashes_done = n - first_nonce + 1;
       return true;
    }

x17.c

@@ -266,6 +266,7 @@ int scanhash_x17(int thr_id, struct work *work,
 if (!(hash64[7] & mask)) {
    printf("[%d]",thr_id);
    if (fulltest(hash64, ptarget)) {
+      work_set_target_ratio( work, hash64 );
       *hashes_done = n - first_nonce + 1;
       return true;
    }
@@ -281,13 +282,3 @@ int scanhash_x17(int thr_id, struct work *work,
    pdata[19] = n;
    return 0;
 }
-/*
-bool register_x17_algo( algo_gate_t* gate )
-{
-   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-   init_x17_ctx();
-   gate->scanhash = (void*)&scanhash_x17;
-   gate->hash = (void*)&x17hash;
-   return true;
-};
-*/

avxdefs.h

@@ -35,10 +35,18 @@
 #define mm_one_64 _mm_set1_epi64x( 1ULL )
 #define mm_one_32 _mm_set1_epi32( 1UL )
 #define mm_one_16 _mm_set1_epi16( 1U )
+#define mm_one_8  _mm_set1_epi8( 1U )
 // Constant minus 1
 #define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
+// Lane index, useful for byte rotate using shuffle
+#define mm_lanex_64 _mm_set_epi64x( 1ULL, 0ULL )
+#define mm_lanex_32 _mm_set_epi32( 3UL, 2UL, 1UL, 0UL )
+#define mm_lanex_16 _mm_set_epi16( 7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U )
+#define mm_lanex_8  _mm_set_epi8( 15U, 14U, 13U, 12U, 11U, 10U, 9U, 8U, \
+                                   7U,  6U,  5U,  4U,  3U,  2U, 1U, 0U )
 //
 // Basic operations without equivalent SIMD intrinsic
@@ -327,6 +335,16 @@ inline __m128i mm_byteswap_16( __m128i x )
 // Constant minus 1
 #define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
+// Lane index, useful for rotate using permutevar
+#define mm256_lane_64 _mm256_set_epi64x( 3ULL, 2ULL, 1ULL, 0ULL )
+#define mm256_lane_32 _mm256_set_epi32( 7UL, 6UL, 5UL, 4UL, 3UL, 2UL, 1UL, 0UL )
+#define mm256_lane_16 _mm256_set_epi16( 15U, 14U, 13U, 12U, 11U, 10U, 9U, 8U, \
+                                         7U,  6U,  5U,  4U,  3U,  2U, 1U, 0U )
+#define mm256_lane_8 _mm256_set_epi8( 31U, 30U, 29U, 28U, 27U, 26U, 25U, 24U, \
+                                      23U, 22U, 21U, 20U, 19U, 18U, 17U, 16U, \
+                                      15U, 14U, 13U, 12U, 11U, 10U,  9U,  8U, \
+                                       7U,  6U,  5U,  4U,  3U,  2U,  1U,  0U )
 //
 // Basic operations without SIMD equivalent
@@ -1109,7 +1127,7 @@ inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
 }
 // Can't do it in place
-inline void mm256_reinterleave_4x64x( void *dst, void *src, int bit_len )
+inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len )
 {
    __m256i* d = (__m256i*)dst;
    uint32_t *s = (uint32_t*)src;
@@ -1146,7 +1164,8 @@ inline void mm256_reinterleave_4x64x( void *dst, void *src, int bit_len )
 // likely of no use.
 // convert 4x32 byte (128 bit) vectors to 4x64 (256 bit) vectors for AVX2
 // bit_len must be multiple of 64
-inline void mm256_reinterleave_4x64( uint64_t *dst, uint32_t *src,
+// broken
+inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,
                                      int bit_len )
 {
    uint32_t *d = (uint32_t*)dst;
@@ -1200,6 +1219,7 @@ inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
 // bit_len == 1024
 }
+// not used
 inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len )
 {
    uint32_t *d = (uint32_t*)dst;

configure vendored

@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.9.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.10.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.7.9'
-PACKAGE_STRING='cpuminer-opt 3.7.9'
+PACKAGE_VERSION='3.7.10'
+PACKAGE_STRING='cpuminer-opt 3.7.10'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
 # Omit some internal or obsolete options to make the list less imposing.
 # This message is too long to be a string in the A/UX 3.1 sh.
 cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.7.9 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.7.10 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-    short | recursive ) echo "Configuration of cpuminer-opt 3.7.9:";;
+    short | recursive ) echo "Configuration of cpuminer-opt 3.7.10:";;
   esac
   cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 3.7.9
+cpuminer-opt configure 3.7.10
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by cpuminer-opt $as_me 3.7.9, which was
+It was created by cpuminer-opt $as_me 3.7.10, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2981,7 +2981,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
-VERSION='3.7.9'
+VERSION='3.7.10'
 
 cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.7.9, which was
+This file was extended by cpuminer-opt $as_me 3.7.10, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.7.9
+cpuminer-opt config.status 3.7.10
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"

configure.ac

@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.7.9])
+AC_INIT([cpuminer-opt], [3.7.10])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM

cpu-miner.c

@@ -346,6 +346,7 @@ static bool work_decode( const json_t *val, struct work *work )
    work->targetdiff = target_to_diff( work->target );
    // for api stats, on longpoll pools
    stratum_diff = work->targetdiff;
+   work->sharediff = 0;
    algo_gate.display_extra_data( work, &net_blocks );
    return true;
 }
@@ -755,6 +756,7 @@ static int share_result( int result, struct work *work, const char *reason )
    uint32_t total_submits;
    float rate;
    char rate_s[8] = {0};
+   double sharediff = work ? work->sharediff : stratum.sharediff;
    int i;
 
    pthread_mutex_lock(&stats_lock);
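The sharediff value reported here is produced elsewhere in the stratum/work
path. For reference, a hedged sketch of how a share's difficulty is
conventionally derived in cpuminer-family code; the function name and the
little-endian 8-word hash layout are the editor's assumptions, not this
commit's code.

   #include <stdint.h>

   // Difficulty = value of the difficulty-1 target / value of the hash.
   static double share_diff_sketch( const uint32_t *hash )
   {
      // 0x00000000FFFF0000...0000 (the standard diff-1 target) as a double.
      const double truediffone = 2.695953529101131e67;
      double h = 0.0;
      for ( int i = 7; i >= 0; i-- )   // hash[7] is the most significant word
         h = h * 4294967296.0 + hash[i];
      return h > 0.0 ? truediffone / h : 0.0;
   }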
@@ -814,6 +816,8 @@ static int share_result( int result, struct work *work, const char *reason )
       sprintf(hr, "%.2f", hashrate );
    }
 
+   if ( sharediff == 0 )
+   {
 #if ((defined(_WIN64) || defined(__WINDOWS__)))
    applog( LOG_NOTICE, "%s %lu/%lu (%s%%), %s %sH, %s %sH/s",
            sres, ( result ? accepted_count : rejected_count ),
@@ -824,6 +828,20 @@ static int share_result( int result, struct work *work, const char *reason )
            total_submits, rate_s, hc, hc_units, hr, hr_units,
            (uint32_t)cpu_temp(0) );
 #endif
+   }
+   else
+   {
+#if ((defined(_WIN64) || defined(__WINDOWS__)))
+      applog( LOG_NOTICE, "%s %lu/%lu (%s%%), diff %.3g, %s %sH/s",
+              sres, ( result ? accepted_count : rejected_count ),
+              total_submits, rate_s, sharediff, hr, hr_units );
+#else
+      applog( LOG_NOTICE, "%s %lu/%lu (%s%%), diff %.3g, %s %sH/s, %dC",
+              sres, ( result ? accepted_count : rejected_count ),
+              total_submits, rate_s, sharediff, hr, hr_units,
+              (uint32_t)cpu_temp(0) );
+#endif
+   }
 
    if (reason)
    {
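The new messages print the share difficulty with "%.3g": three significant
digits, switching to exponent form for very large or very small diffs. A
standalone check of the formatting:

   #include <stdio.h>

   int main()
   {
      printf( "%.3g\n", 0.00123456 );   // prints 0.00123
      printf( "%.3g\n", 1.23456 );      // prints 1.23
      printf( "%.3g\n", 123456.0 );     // prints 1.23e+05
      return 0;
   }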
@@ -1026,6 +1044,7 @@ static bool submit_upstream_work( CURL *curl, struct work *work )
    }
    if ( have_stratum )
    {
+      stratum.sharediff = work->sharediff;
       algo_gate.build_stratum_request( req, work, &stratum );
       if ( unlikely( !stratum_send_line( &stratum, req ) ) )
       {
@@ -2984,25 +3003,22 @@ bool check_cpu_capability ()
    }
 
    if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
    {
-      if ( sw_has_4way && algo_has_4way )
-         printf( "A CPU with AES and AVX2 is required to use 4way!\n" );
-      else if ( algo_has_avx2 )
-         printf( "A CPU with AES and AVX2 is required!\n" );
+      printf( "The SW build requires a CPU with AES and AVX2!\n" );
       return false;
    }
-   if ( sw_has_avx && !( cpu_has_avx && cpu_has_aes ) )
+   if ( sw_has_avx && !cpu_has_avx )
    {
-      printf( "A CPU with AES and AVX2 is required!\n" );
+      printf( "The SW build requires a CPU with AVX!\n" );
       return false;
    }
-   if ( sw_has_aes && algo_has_aes && !cpu_has_aes )
+   if ( sw_has_aes && !cpu_has_aes )
    {
-      printf( "A CPU with AES is required!\n" );
+      printf( "The SW build requires a CPU with AES!\n" );
       return false;
    }
-   if ( sw_has_sha && algo_has_sha && !cpu_has_sha )
+   if ( sw_has_sha && !cpu_has_sha )
    {
-      printf( "A CPU with SHA is required!\n" );
+      printf( "The SW build requires a CPU with SHA!\n" );
       return false;
    }
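The simplified checks above compare what the software build was compiled for
(sw_has_*) against what the CPU reports at run time (cpu_has_*). One common
way to obtain such runtime flags is the gcc/clang builtin shown below; this is
an assumption about technique, not necessarily how this project detects
features (it may drive cpuid directly).

   #include <stdbool.h>

   // __builtin_cpu_supports probes cpuid at run time (recent gcc or clang).
   static bool cpu_has_avx2_and_aes( void )
   {
      return __builtin_cpu_supports( "avx2" ) &&
             __builtin_cpu_supports( "aes" );
   }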
@@ -3187,6 +3203,9 @@ int main(int argc, char *argv[])
    }
 #endif
 
+   if ( num_cpus != opt_n_threads )
+      applog( LOG_INFO, "%u CPU cores available, %u miner threads selected.",
+              num_cpus, opt_n_threads );
    if ( opt_affinity != -1 )
    {
       if ( num_cpus > 64 )


@@ -736,7 +736,7 @@ Options:\n\
                           whirlpool\n\
                           whirlpoolx\n\
                           x11        Dash\n\
-                          x11evo     Revolvercoin\n\
+                          x11evo     Revolvercoin (XRE)\n\
                           x11gost    sib (SibCoin)\n\
                           x13        X13\n\
                           x13sm3     hsr (Hshare)\n\