This commit is contained in:
Jay D Dee
2018-01-16 15:11:44 -05:00
parent bee78eac76
commit a90d75b8f5
77 changed files with 3408 additions and 1214 deletions

View File

@@ -46,8 +46,10 @@ cpuminer_SOURCES = \
algo/blake/sph_blake2b.c \
algo/blake/blake2b.c \
algo/blake/blake2s.c \
algo/blake/blakecoin-gate.c \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \
algo/blake/blakecoin-4way.c \
algo/blake/decred-gate.c \
algo/blake/decred.c \
algo/blake/decred-4way.c \
@@ -99,13 +101,17 @@ cpuminer_SOURCES = \
algo/luffa/sse2/luffa_for_sse2.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2-gate.c \
algo/lyra2/lyra2rev2.c \
algo/lyra2/lyra2rev2-4way.c \
algo/lyra2/lyra2re.c \
algo/lyra2/lyra2z-gate.c \
algo/lyra2/lyra2z.c \
algo/lyra2/lyra2z-4way.c \
algo/lyra2/lyra2z330.c \
algo/lyra2/lyra2h-gate.c \
algo/lyra2/lyra2h.c \
algo/lyra2/lyra2h-4way.c \
algo/m7m.c \
algo/neoscrypt/neoscrypt.c \
algo/nist5/nist5-gate.c \
@@ -113,7 +119,9 @@ cpuminer_SOURCES = \
algo/nist5/nist5.c \
algo/nist5/zr5.c \
algo/pluck.c \
algo/quark/quark-gate.c \
algo/quark/quark.c \
algo/quark/quark-4way.c \
algo/qubit/qubit.c \
algo/qubit/deep.c \
algo/ripemd/sph_ripemd.c \
@@ -140,9 +148,8 @@ cpuminer_SOURCES = \
algo/skein/skein2-4way.c \
algo/skein/skein2-gate.c \
algo/sm3/sm3.c \
algo/sm3/sm3-hash-4way.c \
algo/tiger/sph_tiger.c \
algo/timetravel.c \
algo/timetravel10.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
@@ -161,8 +168,16 @@ cpuminer_SOURCES = \
algo/x11/tribus-gate.c \
algo/x11/tribus.c \
algo/x11/tribus-4way.c \
algo/x11/timetravel-gate.c \
algo/x11/timetravel.c \
algo/x11/timetravel-4way.c \
algo/x11/timetravel10-gate.c \
algo/x11/timetravel10.c \
algo/x11/timetravel10-4way.c \
algo/x11/fresh.c \
algo/x11/x11evo.c \
algo/x11/x11evo-4way.c \
algo/x11/x11evo-gate.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
algo/x13/x13-4way.c \

View File

@@ -165,6 +165,13 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.7.10
4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10
x11evo, blakecoin.
Faster x13sm3 (hsr).
Added share difficulty to accepted message.
v3.7.9
Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.

View File

@@ -1,31 +1,22 @@
#include "blake-gate.h"
#include "sph_blake.h"
#if defined (__AVX__)
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#if defined (BLAKE_4WAY)
blake256r14_4way_context blake_ctx;
void blakehash_4way(void *state, const void *input)
{
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
blake256_4way_context ctx;
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 16 );
blake256_4way_close( &ctx, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash1, 32 );
memcpy( state+96, hash1, 32 );
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
memcpy( &ctx, &blake_ctx, sizeof ctx );
blake256r14_4way( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -36,21 +27,24 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
// uint32_t HTarget = ptarget[7];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// if (opt_benchmark)
// HTarget = 0x7f;
if (opt_benchmark)
HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r14_4way_init( &blake_ctx );
blake256r14_4way( &blake_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
@@ -61,45 +55,36 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
blakehash_4way( hash, vdata );
if ( hash[7] == 0 )
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
{
if ( fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
}
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] == 0 )
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
if ( fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] == 0 )
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
if ( fulltest( hash+8, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] == 0 )
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
if ( fulltest( hash+8, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
*hashes_done = n - first_nonce + 1;
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );

View File

@@ -491,14 +491,9 @@ do { \
(state)->T1 = T1; \
} while (0)
//#define BLAKE32_ROUNDS 8
#ifndef BLAKE32_ROUNDS
#define BLAKE32_ROUNDS 14
#endif
#if SPH_COMPACT_BLAKE_32
#define COMPRESS32_4WAY do { \
#define COMPRESS32_4WAY( rounds ) do { \
__m128i M[16]; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
@@ -539,7 +534,7 @@ do { \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
for (r = 0; r < BLAKE32_ROUNDS; r ++) \
for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S0, V0 ), V8 ), H0 ); \
@@ -563,80 +558,70 @@ do { \
// current impl
#define COMPRESS32_4WAY do { \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
ROUND_S_4WAY(4); \
ROUND_S_4WAY(5); \
ROUND_S_4WAY(6); \
ROUND_S_4WAY(7); \
if (BLAKE32_ROUNDS == 14) { \
ROUND_S_4WAY(8); \
ROUND_S_4WAY(9); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( V8, V0 ), S0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( V9, V1 ), S1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VA, V2 ), S2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VB, V3 ), S3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VC, V4 ), S0 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VD, V5 ), S1 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VE, V6 ), S2 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VF, V7 ), S3 ), H7 ); \
} while (0)
#define COMPRESS32_4WAY( rounds ) \
do { \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
ROUND_S_4WAY(4); \
ROUND_S_4WAY(5); \
ROUND_S_4WAY(6); \
ROUND_S_4WAY(7); \
if (rounds == 14) \
{ \
ROUND_S_4WAY(8); \
ROUND_S_4WAY(9); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
} while (0)
#endif
@@ -832,15 +817,16 @@ static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
static void
blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt)
const sph_u32 *salt, int rounds )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
}
static void
@@ -878,7 +864,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_4WAY;
COMPRESS32_4WAY( sc->rounds );
ptr = 0;
}
}
@@ -1079,10 +1065,11 @@ blake64_4way_close( blake_4way_big_context *sc,
#endif
// default 14 rounds, backward compatibility
void
blake256_4way_init(void *cc)
{
blake32_4way_init(cc, IV256, salt_zero_small);
blake32_4way_init( cc, IV256, salt_zero_small, 14 );
}
void
@@ -1094,13 +1081,43 @@ blake256_4way(void *cc, const void *data, size_t len)
void
blake256_4way_close(void *cc, void *dst)
{
blake256_4way_addbits_and_close(cc, 0, 0, dst);
blake32_4way_close(cc, 0, 0, dst, 8);
}
// 14 rounds blake, decred
void blake256r14_4way_init(void *cc)
{
blake32_4way_init( cc, IV256, salt_zero_small, 14 );
}
void
blake256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
blake256r14_4way(void *cc, const void *data, size_t len)
{
blake32_4way_close(cc, ub, n, dst, 8);
blake32_4way(cc, data, len);
}
void
blake256r14_4way_close(void *cc, void *dst)
{
blake32_4way_close(cc, 0, 0, dst, 8);
}
// 8 rounds blakecoin, vanilla
void blake256r8_4way_init(void *cc)
{
blake32_4way_init( cc, IV256, salt_zero_small, 8 );
}
void
blake256r8_4way(void *cc, const void *data, size_t len)
{
blake32_4way(cc, data, len);
}
void
blake256r8_4way_close(void *cc, void *dst)
{
blake32_4way_close(cc, 0, 0, dst, 8);
}
#if defined (__AVX2__)

View File

@@ -35,7 +35,9 @@
*/
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY___
#define __BLAKE_HASH_4WAY__
#ifdef __AVX__
#ifdef __cplusplus
extern "C"{
@@ -45,38 +47,36 @@ extern "C"{
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BLAKE-256.
*/
#define SPH_SIZE_blake256 256
#if SPH_64
/**
* Output size (in bits) for BLAKE-512.
*/
#define SPH_SIZE_blake512 512
#endif
#ifdef __AVX__
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i H[8];
__m128i S[4];
size_t ptr;
sph_u32 T0, T1;
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i H[8];
__m128i S[4];
size_t ptr;
sph_u32 T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blake_4way_small_context;
// Default 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *cc);
void blake256_4way(void *cc, const void *data, size_t len);
void blake256_4way_close(void *cc, void *dst);
void blake256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
// 14 rounds, blake, decred
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way(void *cc, const void *data, size_t len);
void blake256r14_4way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_4way_small_context blake256r8_4way_context;
void blake256r8_4way_init(void *cc);
void blake256r8_4way(void *cc, const void *data, size_t len);
void blake256r8_4way_close(void *cc, void *dst);
#ifdef __AVX2__
@@ -103,3 +103,5 @@ void blake512_4way_addbits_and_close(
#endif
#endif
#endif

106
algo/blake/blakecoin-4way.c Normal file
View File

@@ -0,0 +1,106 @@
#include "blakecoin-gate.h"
#if defined (__AVX__)
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
// Midstate context shared by scanhash; holds the first 64 bytes of the
// interleaved header, hashed once per work unit.
blake256r8_4way_context blakecoin_ctx;

// Finish the 8-round blake hash of four interleaved headers: resume from
// the precomputed midstate, absorb the final 16 bytes, then de-interleave
// the four 256-bit digests straight into the caller's output buffer.
void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context rounds8;

memcpy( &rounds8, &blakecoin_ctx, sizeof rounds8 );
blake256r8_4way( &rounds8, input + (64<<2), 16 );
blake256r8_4way_close( &rounds8, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
// Scan nonces four at a time with the 8-round 4-way blake kernel.
// Returns the number of lanes that met the target; found[]/nonces[] in
// *work report which lanes and which nonce values.
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
if (opt_benchmark)
   HTarget = 0x7f;
// blake wants big-endian input
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
// precompute the midstate over the first 64 bytes, shared by all nonces
blake256r8_4way_init( &blakecoin_ctx );
blake256r8_4way( &blakecoin_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76;   // 19*4: nonce word of each interleaved lane
do {
   // Load a distinct nonce into each of the four lanes.
   for ( int i = 0; i < 4; i++ )
   {
      found[i] = false;
      be32enc( noncep + i, n + i );
   }
   blakecoin_4way_hash( hash, vdata );
   pdata[19] = n;
   // One result block per lane; lane i's digest starts at hash + 8*i.
   // (Replaces four copy-pasted blocks with a loop.)
   for ( int i = 0; i < 4; i++ )
   {
      uint32_t *lane_hash = hash + (i << 3);
      if ( lane_hash[7] <= HTarget && fulltest( lane_hash, ptarget ) )
      {
         found[i] = true;
         num_found++;
         nonces[i] = n + i;
         work_set_target_ratio( work, lane_hash );
      }
   }
   n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
// workaround to prevent flood of hash reports when nonce range exhausted
// and thread is spinning waiting for new work
if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
{
   *hashes_done = 0;
   sleep(1);
}
return num_found;
}
#endif

View File

@@ -0,0 +1,71 @@
#include "blakecoin-gate.h"
#include <memory.h>
// Upper bound on nonces scanned per work unit for blakecoin.
// (cpuminer-multi-decred uses the larger 0x3fffffLL here.)
int64_t blakecoin_get_max64 ()
{
const int64_t max_scan = 0x7ffffLL;
return max_scan;
}
// Blakecoin 4 way hashes so fast it runs out of nonces.
// This is an attempt to solve this but the result may be
// to rehash old nonces until new work is received.
// Fetch or advance work for blakecoin 4-way: takes fresh work when the
// current data is stale or the per-thread nonce range is spent, otherwise
// just bumps the nonce. May rehash old nonces until new work arrives.
void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr, bool clean_job )
{
// Pointer to the nonce word inside this algo's work data.
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
//
// if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
// algo_gate.stratum_gen_work( &stratum, g_work );
// NOTE(review): && binds tighter than ||, so clean_job only qualifies the
// job_id comparison, not the first two conditions -- confirm this is the
// intended grouping (the commented-out variant below groups differently).
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
|| ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) && clean_job )
/*
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) ) )
*/
{
work_free( work );
work_copy( work, g_work );
// Partition the 32-bit nonce space evenly across threads.
*nonceptr = 0xffffffffU / opt_n_threads * thr_id;
if ( opt_randomize )
*nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
// End just short (0x20) of the next thread's range start.
*end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
// try incrementing the xnonce to change the data
// for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ );
}
else
++(*nonceptr);
}
// vanilla uses default gen merkle root, otherwise identical to blakecoin.
// Installs the 4-way or scalar implementations depending on build options.
bool register_vanilla_algo( algo_gate_t* gate )
{
#if defined(BLAKECOIN_4WAY)
  gate->scanhash = (void*)&scanhash_blakecoin_4way;
  gate->hash     = (void*)&blakecoin_4way_hash;
  // gate->get_new_work = (void*)&bc4w_get_new_work;  // experimental, disabled
#else
  gate->scanhash = (void*)&scanhash_blakecoin;
  gate->hash     = (void*)&blakecoinhash;
#endif
  // Set once for both paths. (The 4-way-only FOUR_WAY_OPT assignment that
  // was here was a dead store, immediately overwritten by this line.)
  gate->optimizations = AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64 = (void*)&blakecoin_get_max64;
  return true;
}
// Blakecoin registration: same gate as vanilla, but the merkle root is
// generated with SHA256_gen_merkle_root instead of the default.
bool register_blakecoin_algo( algo_gate_t* gate )
{
bool ok = register_vanilla_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return ok;
}

View File

@@ -0,0 +1,21 @@
#ifndef __BLAKECOIN_GATE_H__
#define __BLAKECOIN_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#define BLAKECOIN_4WAY
#endif
#if defined (BLAKECOIN_4WAY)
void blakecoin_4way_hash(void *state, const void *input);
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void blakecoinhash( void *state, const void *input );
int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "blakecoin-gate.h"
#define BLAKE32_ROUNDS 8
#include "sph_blake.h"
@@ -98,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
}
*/
/*
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blakecoin_get_max64 ()
{
@@ -121,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate )
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
}
*/

View File

@@ -1,5 +1,4 @@
#include "decred-gate.h"
#include "sph_blake.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
@@ -9,7 +8,6 @@
#if defined (DECRED_4WAY)
static __thread blake256_4way_context blake_mid;
static __thread bool ctx_midstate_done = false;
void decred_hash_4way( void *state, const void *input )
{
@@ -18,50 +16,14 @@ void decred_hash_4way( void *state, const void *input )
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
blake256_4way_context ctx __attribute__ ((aligned (64)));
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
uint32_t hash[16] __attribute__ ((aligned (64)));
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
int tail_len = 180 - DECRED_MIDSTATE_LEN;
blake256_4way_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
/*
sph_blake256_init( &ctx2 );
sph_blake256( &ctx2, sin0, 180 );
sph_blake256_close( &ctx2, hash );
*/
/*
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 180 );
blake256_4way_close( &ctx, vhash );
*/
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
/*
for ( int i = 0; i < 8; i++ )
if ( hash[i] != hash0[i] )
printf(" hash mismatch, i = %u\n",i);
printf("hash: %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
*(hash+2), *(hash+3) );
printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
*(hash0+2), *(hash0+3) );
printf("\n");
*/
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
// memcpy( state, hash, 32 );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -69,21 +31,21 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
ctx_midstate_done = false;
memcpy( edata, pdata, 180 );
// copy to buffer guaranteed to be aligned.
memcpy( edata, pdata, 180 );
// use the old way until new way updated for size.
mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
@@ -106,22 +68,13 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
nonces[0] = n;
pdata[DECRED_NONCE_INDEX] = n;
}
/*
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] );
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] );
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
work_set_target_ratio( work, hash+8 );
found[1] = true;
num_found++;
nonces[1] = n+1;
}
*/
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
work_set_target_ratio( work, hash+16 );
@@ -129,24 +82,15 @@ printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[
num_found++;
nonces[2] = n+2;
}
/*
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
printf("found 3\n");
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] );
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] );
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
work_set_target_ratio( work, hash+24 );
found[3] = true;
num_found++;
nonces[3] = n+3;
}
*/
n += 2;
// n += 4;
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );

View File

@@ -1,4 +1,7 @@
#include "pentablake-gate.h"
#ifdef __AVX2__
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -9,8 +12,6 @@
//#define DEBUG_ALGO
#ifdef PENTABLAKE_4WAY
extern void pentablakehash_4way( void *output, const void *input )
{
unsigned char _ALIGN(32) hash[128];

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#if defined(FOUR_WAY) && defined(__AVX2__)
#define PENTABLAKE_4WAY
#endif

View File

@@ -41,19 +41,13 @@
extern "C"{
#endif
//#include "sph_bmw.h"
//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
#define SPH_SMALL_FOOTPRINT_BMW 1
//#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
//#undef SPH_ROTL64
//#define SPH_ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))
//#define SPH_ROTL64(x,n) mm256_rotl_64(x,n)
#define LPAR (
// BMW256
static const sph_u32 IV256[] = {
SPH_C32(0x40414243), SPH_C32(0x44454647),
@@ -66,8 +60,7 @@ static const sph_u32 IV256[] = {
SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
};
#if SPH_64
// BMW512
static const sph_u64 IV512[] = {
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
@@ -79,74 +72,113 @@ static const sph_u64 IV512[] = {
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
};
#endif
// BMW256
#define XCAT(x, y) XCAT_(x, y)
#define XCAT_(x, y) x ## y
#define ss0(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 3) ), \
_mm_xor_si128( mm_rotl_32( (x), 4), \
mm_rotl_32( (x), 19) ) )
#define LPAR (
#define ss1(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm_rotl_32( (x), 8), \
mm_rotl_32( (x), 23) ) )
/*
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \
^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
#define ss4(x) (((x) >> 1) ^ (x))
#define ss5(x) (((x) >> 2) ^ (x))
#define rs1(x) SPH_ROTL32(x, 3)
#define rs2(x) SPH_ROTL32(x, 7)
#define rs3(x) SPH_ROTL32(x, 13)
#define rs4(x) SPH_ROTL32(x, 16)
#define rs5(x) SPH_ROTL32(x, 19)
#define rs6(x) SPH_ROTL32(x, 23)
#define rs7(x) SPH_ROTL32(x, 27)
#define ss2(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 1) ), \
_mm_xor_si128( mm_rotl_32( (x), 12), \
mm_rotl_32( (x), 25) ) )
#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
#define ss3(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm_rotl_32( (x), 15), \
mm_rotl_32( (x), 29) ) )
#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
#define ss4(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
#define expand1s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define ss5(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
#define expand1s(qf, mf, hf, i16) \
expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand1s_(qf, mf, hf, i16, ix, iy) \
expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
#define rs1(x) mm_rotl_32( x, 3 )
#define rs2(x) mm_rotl_32( x, 7 )
#define rs3(x) mm_rotl_32( x, 13 )
#define rs4(x) mm_rotl_32( x, 16 )
#define rs5(x) mm_rotl_32( x, 19 )
#define rs6(x) mm_rotl_32( x, 23 )
#define rs7(x) mm_rotl_32( x, 27 )
#define expand2s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define rol_off_32( M, j, off ) \
mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define expand2s(qf, mf, hf, i16) \
expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand2s_(qf, mf, hf, i16, ix, iy) \
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
*/
#if SPH_64
#define add_elt_s( M, H, j ) \
_mm_xor_si128( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1s( qt, M, H, i ) \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)-16 ] ), \
ss2( qt[ (i)-15 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)-14 ] ), \
ss0( qt[ (i)-13 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)-12 ] ), \
ss2( qt[ (i)-11 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)-10 ] ), \
ss0( qt[ (i)- 9 ] ) ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)- 8 ] ), \
ss2( qt[ (i)- 7 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)- 6 ] ), \
ss0( qt[ (i)- 5 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)- 4 ] ), \
ss2( qt[ (i)- 3 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)- 2 ] ), \
ss0( qt[ (i)- 1 ] ) ) ) ) ), \
add_elt_s( M, H, (i)-16 ) )
// BMW256 4-way second-stage expansion (W'-function). All lane arithmetic
// in the 256-bit variant is 32-bit: the single _mm_add_epi64 that was here
// was a copy/paste slip from the 512-bit (epi64) variant and would merge
// adjacent 32-bit lanes, corrupting the result.
#define expand2s( qt, M, H, i) \
   _mm_add_epi32( \
      _mm_add_epi32( \
         _mm_add_epi32( \
            _mm_add_epi32( \
               _mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \
               _mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \
            _mm_add_epi32( \
               _mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \
               _mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \
         _mm_add_epi32( \
            _mm_add_epi32( \
               _mm_add_epi32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ) ), \
               _mm_add_epi32( qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ) ), \
            _mm_add_epi32( \
               _mm_add_epi32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ) ), \
               _mm_add_epi32( ss4( qt[ (i)- 2 ] ), \
                              ss5( qt[ (i)- 1 ] ) ) ) ) ), \
      add_elt_s( M, H, (i)-16 ) )
// BMW512
#define sb0(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
_mm256_slli_epi64( (x), 3) ), \
_mm256_xor_si256( mm256_rotl_64( (x), 4), \
_mm256_xor_si256( mm256_rotl_64( (x), 4), \
mm256_rotl_64( (x), 37) ) )
#define sb1(x) \
@@ -181,18 +213,18 @@ static const sph_u64 IV512[] = {
#define rb6(x) mm256_rotl_64( x, 43 )
#define rb7(x) mm256_rotl_64( x, 53 )
#define rol_off( M, j, off ) \
mm256_rotl_64( M[ ( (j) + (off) ) & 15 ] , \
( ( (j) + (off) ) & 15 ) + 1 )
#define rol_off_64( M, j, off ) \
mm256_rotl_64( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_b( M, H, j ) \
_mm256_xor_si256( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_add_epi64( rol_off( M, j, 0 ), \
rol_off( M, j, 3 ) ), \
rol_off( M, j, 10 ) ), \
_mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \
rol_off_64( M, j, 3 ) ), \
rol_off_64( M, j, 10 ) ), \
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 15 ] )
H[ ( (j)+7 ) & 0xF ] )
#define expand1b( qt, M, H, i ) \
_mm256_add_epi64( \
@@ -241,132 +273,301 @@ static const sph_u64 IV512[] = {
sb5( qt[ (i)- 1 ] ) ) ) ) ), \
add_elt_b( M, H, (i)-16 ) )
#endif
// BMW256
/*
#define MAKE_W( i0, op01, i1, op12, i2, op23, i3, op34, i4) \
((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
*/
#define Ws0 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[13], H[13] ) ), \
_mm_xor_si128( M[14], H[14] ) )
/*
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13)
#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14)
#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15)
#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13)
#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14)
#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15)
#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14)
#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15)
#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9)
#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10)
#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11)
#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12)
#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13)
#define Ws1 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[14], H[14] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#if SPH_SMALL_FOOTPRINT_BMW
#define Ws2 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[12], H[12] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define MAKE_Qas do { \
unsigned u; \
sph_u32 Ws[16]; \
Ws[ 0] = Ws0; \
Ws[ 1] = Ws1; \
Ws[ 2] = Ws2; \
Ws[ 3] = Ws3; \
Ws[ 4] = Ws4; \
Ws[ 5] = Ws5; \
Ws[ 6] = Ws6; \
Ws[ 7] = Ws7; \
Ws[ 8] = Ws8; \
Ws[ 9] = Ws9; \
Ws[10] = Ws10; \
Ws[11] = Ws11; \
Ws[12] = Ws12; \
Ws[13] = Ws13; \
Ws[14] = Ws14; \
Ws[15] = Ws15; \
for (u = 0; u < 15; u += 5) { \
qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
} \
qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
} while (0)
#define Ws3 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[13], H[13] ) )
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#define Ws4 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[14], H[14] ) )
#else
#define Ws5 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[12], H[12] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define MAKE_Qas do { \
qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
} while (0)
#define Ws6 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[13], H[13] ) )
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#define Ws7 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[12], H[12] ) ), \
_mm_xor_si128( M[14], H[14] ) )
#endif
#define Ws8 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[13], H[13] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define MAKE_Qs do { \
MAKE_Qas; \
MAKE_Qbs; \
} while (0)
#define Ws9 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[14], H[14] ) )
#define Qs(j) (qt[j])
*/
#if SPH_64
#define Ws10 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define Ws11 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) )
#define Ws12 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[10], H[10] ) )
#define Ws13 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[11], H[11] ) )
#define Ws14 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[12], H[12] ) )
#define Ws15 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[13], H[13] ) )
void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
{
__m128i qt[32], xl, xh; \
qt[ 0] = ss0( Ws0 ) + H[ 1];
qt[ 1] = ss1( Ws1 ) + H[ 2];
qt[ 2] = ss2( Ws2 ) + H[ 3];
qt[ 3] = ss3( Ws3 ) + H[ 4];
qt[ 4] = ss4( Ws4 ) + H[ 5];
qt[ 5] = ss0( Ws5 ) + H[ 6];
qt[ 6] = ss1( Ws6 ) + H[ 7];
qt[ 7] = ss2( Ws7 ) + H[ 8];
qt[ 8] = ss3( Ws8 ) + H[ 9];
qt[ 9] = ss4( Ws9 ) + H[10];
qt[10] = ss0( Ws10) + H[11];
qt[11] = ss1( Ws11) + H[12];
qt[12] = ss2( Ws12) + H[13];
qt[13] = ss3( Ws13) + H[14];
qt[14] = ss4( Ws14) + H[15];
qt[15] = ss0( Ws15) + H[ 0];
qt[16] = expand1s( qt, M, H, 16 );
qt[17] = expand1s( qt, M, H, 17 );
qt[18] = expand2s( qt, M, H, 18 );
qt[19] = expand2s( qt, M, H, 19 );
qt[20] = expand2s( qt, M, H, 20 );
qt[21] = expand2s( qt, M, H, 21 );
qt[22] = expand2s( qt, M, H, 22 );
qt[23] = expand2s( qt, M, H, 23 );
qt[24] = expand2s( qt, M, H, 24 );
qt[25] = expand2s( qt, M, H, 25 );
qt[26] = expand2s( qt, M, H, 26 );
qt[27] = expand2s( qt, M, H, 27 );
qt[28] = expand2s( qt, M, H, 28 );
qt[29] = expand2s( qt, M, H, 29 );
qt[30] = expand2s( qt, M, H, 30 );
qt[31] = expand2s( qt, M, H, 31 );
xl = _mm_xor_si128(
_mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ),
_mm_xor_si128( qt[18], qt[19] ) ),
_mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ),
_mm_xor_si128( qt[22], qt[23] ) ) );
xh = _mm_xor_si128( xl,
_mm_xor_si128(
_mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ),
_mm_xor_si128( qt[26], qt[27] ) ),
_mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ),
_mm_xor_si128( qt[30], qt[31] ) )));
dH[ 0] = _mm_add_epi32(
_mm_xor_si128( M[0],
_mm_xor_si128( _mm_slli_epi32( xh, 5 ),
_mm_srli_epi32( qt[16], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
dH[ 1] = _mm_add_epi32(
_mm_xor_si128( M[1],
_mm_xor_si128( _mm_srli_epi32( xh, 7 ),
_mm_slli_epi32( qt[17], 8 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
dH[ 2] = _mm_add_epi32(
_mm_xor_si128( M[2],
_mm_xor_si128( _mm_srli_epi32( xh, 5 ),
_mm_slli_epi32( qt[18], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
dH[ 3] = _mm_add_epi32(
_mm_xor_si128( M[3],
_mm_xor_si128( _mm_srli_epi32( xh, 1 ),
_mm_slli_epi32( qt[19], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
dH[ 4] = _mm_add_epi32(
_mm_xor_si128( M[4],
_mm_xor_si128( _mm_srli_epi32( xh, 3 ),
_mm_slli_epi32( qt[20], 0 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
dH[ 5] = _mm_add_epi32(
_mm_xor_si128( M[5],
_mm_xor_si128( _mm_slli_epi32( xh, 6 ),
_mm_srli_epi32( qt[21], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
dH[ 6] = _mm_add_epi32(
_mm_xor_si128( M[6],
_mm_xor_si128( _mm_srli_epi32( xh, 4 ),
_mm_slli_epi32( qt[22], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
dH[ 7] = _mm_add_epi32(
_mm_xor_si128( M[7],
_mm_xor_si128( _mm_srli_epi32( xh, 11 ),
_mm_slli_epi32( qt[23], 2 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[4], 9 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
_mm_xor_si128( _mm_slli_epi32( xl, 8 ),
_mm_xor_si128( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[5], 10 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
_mm_xor_si128( _mm_srli_epi32( xl, 6 ),
_mm_xor_si128( qt[16], qt[ 9] ) ) );
dH[10] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[6], 11 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
_mm_xor_si128( _mm_slli_epi32( xl, 6 ),
_mm_xor_si128( qt[17], qt[10] ) ) );
dH[11] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[7], 12 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
_mm_xor_si128( _mm_slli_epi32( xl, 4 ),
_mm_xor_si128( qt[18], qt[11] ) ) );
dH[12] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[0], 13 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
_mm_xor_si128( _mm_srli_epi32( xl, 3 ),
_mm_xor_si128( qt[19], qt[12] ) ) );
dH[13] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[1], 14 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
_mm_xor_si128( _mm_srli_epi32( xl, 4 ),
_mm_xor_si128( qt[20], qt[13] ) ) );
dH[14] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[2], 15 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
_mm_xor_si128( _mm_srli_epi32( xl, 7 ),
_mm_xor_si128( qt[21], qt[14] ) ) );
dH[15] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[3], 16 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
_mm_xor_si128( _mm_srli_epi32( xl, 2 ),
_mm_xor_si128( qt[22], qt[15] ) ) );
}
// BMW512
#define Wb0 \
_mm256_add_epi64( \
@@ -564,6 +765,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
qt[29] = expand2b( qt, M, H, 29 );
qt[30] = expand2b( qt, M, H, 30 );
qt[31] = expand2b( qt, M, H, 31 );
xl = _mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
_mm256_xor_si256( qt[18], qt[19] ) ),
@@ -575,6 +777,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
_mm256_xor_si256( qt[26], qt[27] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
_mm256_xor_si256( qt[30], qt[31] ) )));
dH[ 0] = _mm256_add_epi64(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
@@ -657,137 +860,113 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
_mm256_xor_si256( qt[22], qt[15] ) ) );
}
#endif // 64
// BMW256
//#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_ROTL32, M, Qs, dH)
/*
static void
compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
static const __m128i final_s[16] =
{
#define M(x) sph_dec32le_aligned(data + 4 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
FOLDs;
#undef M
#undef H
#undef dH
}
static const sph_u32 final_s[16] = {
SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
SPH_C32(0xaaaaaaaf)
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
{ 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
{ 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
{ 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
{ 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
{ 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
{ 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
{ 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
{ 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
{ 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
{ 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
{ 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
{ 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
};
static void
bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
{
memcpy(sc->H, iv, sizeof sc->H);
sc->ptr = 0;
#if SPH_64
sc->bit_count = 0;
#else
sc->bit_count_high = 0;
sc->bit_count_low = 0;
#endif
for ( int i = 0; i < 16; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
sc->ptr = 0;
sc->bit_count = 0;
}
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
sph_u32 htmp[16];
sph_u32 *h1, *h2;
#if !SPH_64
sph_u32 tmp;
#endif
__m128i *vdata = (__m128i*)data;
__m128i *buf;
__m128i htmp[16];
__m128i *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
#if SPH_64
sc->bit_count += (sph_u64)len << 3;
#else
tmp = sc->bit_count_low;
sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
if (sc->bit_count_low < tmp)
sc->bit_count_high ++;
sc->bit_count_high += len >> 29;
#endif
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
data = (const unsigned char *)data + clen;
len -= clen;
ptr += clen;
if (ptr == sizeof sc->buf) {
sph_u32 *ht;
compress_small(buf, h1, h2);
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if (h1 != sc->H)
memcpy(sc->H, h1, sizeof sc->H);
sc->bit_count += (sph_u64)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
vdata += ( clen >> 2 );
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m128i *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if ( h1 != sc->H )
memcpy_128( sc->H, h1, 16 );
}
static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
unsigned char *buf, *out;
size_t ptr, u, v;
unsigned z;
sph_u32 h1[16], h2[16], *h;
__m128i *buf;
__m128i h1[16], h2[16], *h;
size_t ptr, u, v;
unsigned z;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ptr ++] = ((ub & -z) | z) & 0xFF;
h = sc->H;
if (ptr > (sizeof sc->buf) - 8) {
memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
compress_small(buf, h, h1);
ptr = 0;
h = h1;
}
memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
#if SPH_64
sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
SPH_T64(sc->bit_count + n));
#else
sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
sc->bit_count_low + n);
sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
SPH_T32(sc->bit_count_high));
#endif
compress_small(buf, h, h2);
for (u = 0; u < 16; u ++)
sph_enc32le_aligned(buf + 4 * u, h2[u]);
compress_small(buf, final_s, h1);
out = dst;
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
sph_enc32le(out + 4 * u, h1[v]);
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ ptr>>2 ] = _mm_set1_epi32( z );
ptr += 4;
h = sc->H;
// assume bit_count fits in 32 bits
if ( ptr > buf_size - 4 )
{
memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
compress_small( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_128( buf + (ptr>>2), (buf_size - 4 - ptr) >> 2 );
buf[ (buf_size - 4) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_m128i( dst, u ) = h1[v];
}
*/
#if SPH_64
// BMW512
static const __m256i final_b[16] =
{
@@ -908,33 +1087,33 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
casti_m256i(dst,u) = h1[v];
}
#endif
// BMW256
void
bmw256_4way_init(void *cc)
{
// bmw32_4way_init(cc, IV256);
bmw32_4way_init(cc, IV256);
}
void
bmw256_4way(void *cc, const void *data, size_t len)
{
// bmw32_4way(cc, data, len);
bmw32_4way(cc, data, len);
}
void
bmw256_4way_close(void *cc, void *dst)
{
// bmw256_4way_addbits_and_close(cc, 0, 0, dst);
bmw256_4way_addbits_and_close(cc, 0, 0, dst);
}
void
bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
// bmw32_4way_close(cc, ub, n, dst, 8);
bmw32_4way_close(cc, ub, n, dst, 8);
}
#if SPH_64
// BMW512
void
bmw512_4way_init(void *cc)
@@ -960,10 +1139,8 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
bmw64_4way_close(cc, ub, n, dst, 8);
}
#endif
#ifdef __cplusplus
}
#endif
#endif
#endif // __AVX2__

View File

@@ -46,94 +46,37 @@ extern "C"{
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BMW-224.
*/
#define SPH_SIZE_bmw224 224
/**
* Output size (in bits) for BMW-256.
*/
#define SPH_SIZE_bmw256 256
#if SPH_64
/**
* Output size (in bits) for BMW-384.
*/
#define SPH_SIZE_bmw384 384
/**
* Output size (in bits) for BMW-512.
*/
#define SPH_SIZE_bmw512 512
#endif
/**
* This structure is a context for BMW-224 and BMW-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
sph_u32 H[16];
#if SPH_64
sph_u64 bit_count;
#else
sph_u32 bit_count_high, bit_count_low;
#endif
#endif
__m128i buf[64];
__m128i H[16];
size_t ptr;
sph_u32 bit_count; // assume bit_count fits in 32 bits
} bmw_4way_small_context;
typedef bmw_4way_small_context bmw256_4way_context;
#if SPH_64
/**
* This structure is a context for BMW-384 and BMW-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
__m256i buf[16];
__m256i H[16];
// unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
// sph_u64 H[16];
sph_u64 bit_count;
#endif
size_t ptr;
sph_u64 bit_count;
} bmw_4way_big_context;
typedef bmw_4way_big_context bmw512_4way_context;
#endif
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_addbits_and_close(
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#if SPH_64
void bmw512_4way_init(void *cc);
void bmw512_4way(void *cc, const void *data, size_t len);
@@ -150,5 +93,3 @@ void bmw512_4way_addbits_and_close(
#endif
#endif
#endif

View File

@@ -23,12 +23,12 @@ void jha_hash_4way( void *out, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
__m256i mask, mask0, mask1;
__m256i* vh = (__m256i*)vhash;
__m256i* vh0 = (__m256i*)vhash0;
__m256i* vh1 = (__m256i*)vhash1;
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
blake512_4way_context ctx_blake;
hashState_groestl ctx_groestl;
@@ -40,127 +40,69 @@ void jha_hash_4way( void *out, const void *input )
keccak512_4way( &ctx_keccak, input, 80 );
keccak512_4way_close( &ctx_keccak, vhash );
// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
// keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
// keccak512_4way_close( &ctx_keccak, vhash );
// Heavy & Light Pair Loop
for ( int round = 0; round < 3; round++ )
{
// select next function based on bit 0 of previous hash.
// Specutively execute both functions and use mask to
// select results from correct function for each lane.
// hash = mask : vhash0 ? vhash1
mask = mm256_negate_64(
_mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
// second version
// mask0 = mask
// mask1 = mm256_not( mask );
// first version
// mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
// _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
// groestl (serial) vs skein
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
(char*)hash0, 512 );
(char*)hash0, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash1,
(char*)hash1, 512 );
(char*)hash1, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash2,
(char*)hash2, 512 );
(char*)hash2, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
// skein
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhash1 );
skein512_4way_close( &ctx_skein, vhashB );
// merge vectored hash
for ( int i = 0; i < 8; i++ )
{
// blend should be faster
vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask );
// second version
// vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
// _mm256_and_si256( vh1[i], mask1 ) );
// first version
/*
vh0[i] = _mm256_maskload_epi64(
vhash0 + i*4, mm256_not( mask ) );
vh1[i] = _mm256_maskload_epi64(
vhash1 + i*4, mask );
vh[i] = _mm256_or_si256( vh0[i], vh1[i] );
*/
}
// blake v jh
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
blake512_4way_init( &ctx_blake );
blake512_4way( &ctx_blake, vhash, 64 );
blake512_4way_close( &ctx_blake, vhash0 );
blake512_4way_close( &ctx_blake, vhashA );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhash1 );
jh512_4way_close( &ctx_jh, vhashB );
// merge hash
for ( int i = 0; i < 8; i++ )
{
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
_mm256_and_si256( vh1[i], mask1 ) );
/*
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_not( mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
*/
}
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
}
mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
// memcpy( output, hash0, 32 );
// memcpy( output+32, hash1, 32 );
// memcpy( output+64, hash2, 32 );
// memcpy( output+96, hash3, 32 );
}
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint64_t htmax[] = {
uint64_t htmax[] = {
0,
0xF,
0xFF,
@@ -168,7 +110,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
@@ -177,17 +119,12 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
0
};
// we need bigendian data...
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate for keccak
// keccak512_4way_init( &jha_kec_mid );
// keccak512_4way( &jha_kec_mid, vdata, 64 );
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
@@ -201,7 +138,6 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( noncep3, n+3 );
jha_hash_4way( hash, vdata );
pdata[19] = n;
if ( ( !(hash[7] & mask) )
@@ -239,11 +175,9 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
}
*hashes_done = n - first_nonce + 1;
return num_found;
}

128
algo/lyra2/lyra2h-4way.c Normal file
View File

@@ -0,0 +1,128 @@
#include "lyra2h-gate.h"
#ifdef LYRA2H_4WAY
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
__thread uint64_t* lyra2h_4way_matrix;
bool lyra2h_4way_thread_init()
{
return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
lyra2h_4way_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( &edata[19], n );
lyra2h_4way_hash( hash, vdata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

25
algo/lyra2/lyra2h-gate.c Normal file
View File

@@ -0,0 +1,25 @@
#include "lyra2h-gate.h"
#include "lyra2.h"
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
// Wire the algo_gate for lyra2h.  Selects the 4-way SIMD implementation
// when it was compiled in (LYRA2H_4WAY), otherwise the scalar path.
// Always returns true.
bool register_lyra2h_algo( algo_gate_t* gate )
{
#ifdef LYRA2H_4WAY
// 4-way path: per-thread matrix allocation plus vectorized scanhash/hash.
gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h_4way;
gate->hash = (void*)&lyra2h_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
#endif
// Advertise which CPU feature levels benefit this algo.
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};

32
algo/lyra2/lyra2h-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef LYRA2H_GATE_H__
#define LYRA2H_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY)
#define LYRA2H_4WAY
#endif
#define LYRA2H_MATRIX_SIZE BLOCK_LEN_INT64 * 16 * 16 * 8
#if defined(LYRA2H_4WAY)
void lyra2h_4way_hash( void *state, const void *input );
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_4way_thread_init();
#endif
void lyra2h_hash( void *state, const void *input );
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_thread_init();
#endif

View File

@@ -1,6 +1,6 @@
#include "lyra2h-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
@@ -8,8 +8,7 @@ __thread uint64_t* lyra2h_matrix;
bool lyra2h_thread_init()
{
const int i = 16 * 16 * 96;
lyra2h_matrix = _mm_malloc( i, 64 );
lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
return lyra2h_matrix;
}
@@ -74,20 +73,3 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2h_algo( algo_gate_t* gate )
{
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};

View File

@@ -106,6 +106,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
}

177
algo/lyra2/lyra2rev2-4way.c Normal file
View File

@@ -0,0 +1,177 @@
#include "lyra2rev2-gate.h"
#include <memory.h>
#ifdef __AVX2__
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
sph_bmw256_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
void init_lyra2rev2_4way_ctx()
{
// blake256_4way_init( &l2v2_4way_ctx.blake );
keccak256_4way_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
sph_bmw256_init( &l2v2_4way_ctx.bmw );
}
// 4-way Lyra2REv2 hash chain: Blake256 -> Keccak256 -> CubeHash256 ->
// Lyra2 -> Skein256 -> CubeHash256 -> BMW256, four nonces per call.
// Blake resumes from a midstate the caller (scanhash) placed in
// l2v2_4way_ctx.blake covering the first 64 header bytes; only the
// final 16 bytes per lane are hashed here.
// Output: 4 x 32-byte digests written contiguously to state.
void lyra2rev2_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash64[4*4] __attribute__ ((aligned (64)));
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
// Work on a copy so the prototype contexts stay pristine.
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
// input is 4x32 interleaved; byte offset 64<<2 == 256 is where the
// last 16 bytes of each lane's 80-byte header begin.
blake256_4way( &ctx.blake, input + (64<<2), 16 );
// blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
// Keccak256 is a 64-bit 4-way implementation: convert lane layout
// from 4x32 to 4x64 in place.
mm256_reinterleave_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
// CubeHash and Lyra2 are scalar per lane: split into four 32-byte
// hashes, resetting the cube context between lanes.
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
// Lyra2REv2 core, in place per lane, using the per-thread matrix.
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
// Back to 4x64 vector form for Skein256.
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
// Second scalar CubeHash pass per lane.
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
// Final scalar BMW256 per lane, reloading the context between lanes.
sph_bmw256( &ctx.bmw, hash0, 32 );
sph_bmw256_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash1, 32 );
sph_bmw256_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash2, 32 );
sph_bmw256_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash3, 32 );
sph_bmw256_close( &ctx.bmw, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
/* Scan a nonce range four lanes at a time with the 4-way Lyra2REv2 hash.
 * Byte-swaps and interleaves the 80-byte header, precomputes the Blake256
 * midstate over the first 64 bytes, then hashes four consecutive nonces
 * per iteration.  Returns the number of lanes that found a share (0 if
 * the range was exhausted or a work restart was requested); per-lane
 * results are reported via work->nfound and work->nonces.
 */
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done )
{
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

   swab32_array( edata, pdata, 20 );
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

   // Midstate over the nonce-independent first 64 bytes; the hash
   // function resumes from this prototype context.
   blake256_4way_init( &l2v2_4way_ctx.blake );
   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );

   do {
      // Lane l tests nonce n+l; its slot is word 19 of lane l,
      // i.e. vdata[19*4 + l] in the 4x32 interleaved layout.
      for ( int lane = 0; lane < 4; lane++ )
      {
         found[lane] = false;
         be32enc( vdata + 76 + lane, n + lane );
      }

      lyra2rev2_4way_hash( hash, vdata );
      pdata[19] = n;

      for ( int lane = 0; lane < 4; lane++ )
      {
         uint32_t *lane_hash = hash + ( lane << 3 );
         if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
         {
            found[lane] = true;
            num_found++;
            nonces[lane] = n + lane;
            work_set_target_ratio( work, lane_hash );
         }
      }
      n += 4;
   } while ( (num_found == 0) && (n < max_nonce-4)
             && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
   return num_found;
}
#endif

View File

@@ -0,0 +1,38 @@
#include "lyra2rev2-gate.h"
__thread uint64_t* l2v2_wholeMatrix;
// Lyra2REv2 share-target scaling: job difficulty is reduced by a fixed
// factor of 256 and by the user-configurable difficulty factor.
void lyra2rev2_set_target( struct work* work, double job_diff )
{
   const double divisor = 256.0 * opt_diff_factor;
   work_set_target( work, job_diff / divisor );
}
/* One-time per-miner-thread allocation of the Lyra2 scratch matrix.
 * Size = BLOCK_LEN_INT64 * nCols(4) int64s per row * 8 bytes * nRows(4).
 * Fix: the byte count was narrowed into an 'int' before the call;
 * keep it in int64_t and convert once at the allocation site.
 * Returns true on success, false if the aligned allocation failed.
 */
bool lyra2rev2_thread_init()
{
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   const int64_t size = ROW_LEN_BYTES * 4; // nRows
   l2v2_wholeMatrix = _mm_malloc( (size_t)size, 64 );
   return l2v2_wholeMatrix != NULL;
}
// Wire the algo_gate for lyra2rev2.  Chooses the 4-way implementation
// when LYRA2REV2_4WAY was compiled in, otherwise the scalar one; in
// both cases initializes the corresponding hash contexts up front.
// Always returns true.
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
init_lyra2rev2_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
// Feature levels that benefit this algo.
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
// Common to both paths: per-thread matrix allocation and target scaling.
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;
};

View File

@@ -0,0 +1,35 @@
#ifndef LYRA2REV2_GATE_H__
#define LYRA2REV2_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(HASH_4WAY)
#define LYRA2REV2_4WAY
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_4way_ctx();
#endif
void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_ctx();
#endif

View File

@@ -1,20 +1,12 @@
#include "lyra2rev2-gate.h"
#include <memory.h>
#include "algo-gate-api.h"
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "lyra2.h"
#include "avxdefs.h"
// This gets allocated when miner_thread starts up and is never freed.
// It's not a leak because the only way to allocate it again is to exit
// the thread and that only occurs when the entire program exits.
__thread uint64_t* l2v2_wholeMatrix;
//#include "lyra2.h"
typedef struct {
cubehashParam cube1;
@@ -106,6 +98,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
if( fulltest(hash, ptarget) )
{
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
@@ -119,30 +112,3 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
return 0;
}
void lyra2rev2_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool lyra2rev2_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
init_lyra2rev2_ctx();
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;
};

View File

@@ -85,8 +85,8 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( &edata[19], n );
lyra2z_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{

View File

@@ -82,41 +82,3 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
/*
//int64_t get_max64_0xffffLL() { return 0xffffLL; };
void lyra2z_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
work->height = sctx->bloc_height;
return false;
}
bool lyra2z_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
lyra2z_wholeMatrix = _mm_malloc( i, 64 );
return lyra2z_wholeMatrix;
}
bool register_lyra2z_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z_set_target;
// gate->prevent_dupes = (void*)&zcoin_get_work_height;
return true;
};
*/

View File

@@ -346,6 +346,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
hash_str,
target_str);
}
work_set_target_ratio( work, hash );
pdata[19] = data[19];
goto out;
}

View File

@@ -132,6 +132,7 @@ int scanhash_nist5(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -172,6 +172,7 @@ int scanhash_zr5( int thr_id, struct work *work,
pdata[0] = tmpdata[0];
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
work_set_target_ratio( work, hash );
if (opt_debug)
applog(LOG_INFO, "found nonce %x", nonce);
return 1;

207
algo/quark/quark-4way.c Normal file
View File

@@ -0,0 +1,207 @@
#include "cpuminer-config.h"
#include "quark-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
jh512_4way_context jh;
skein512_4way_context skein;
keccak512_4way_context keccak;
} quark_4way_ctx_holder;
quark_4way_ctx_holder quark_4way_ctx __attribute__ ((aligned (64)));
// One-time initialization of the shared prototype contexts for the
// 4-way Quark hash.  quark_4way_hash() memcpy's this holder into a
// local copy each call, so these init functions run only once.
void init_quark_4way_ctx()
{
blake512_4way_init( &quark_4way_ctx.blake );
bmw512_4way_init( &quark_4way_ctx.bmw );
init_groestl( &quark_4way_ctx.groestl, 64 );
skein512_4way_init( &quark_4way_ctx.skein );
jh512_4way_init( &quark_4way_ctx.jh );
keccak512_4way_init( &quark_4way_ctx.keccak );
}
// 4-way Quark hash.  Quark's chain contains three data-dependent
// branches: after certain stages, bit 3 of the current hash selects
// which of two algorithms processes that lane next.  Here both
// candidates are always computed (into vhashA / vhashB) and the result
// is selected per lane with a blend mask, since the four lanes may
// branch differently.
// Output: 4 x 32-byte digests deinterleaved into state.
void quark_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
int i;
quark_4way_ctx_holder ctx;
// Local copy of the prototype contexts initialized once at startup.
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Branch 1 mask: all-ones in lanes where bit 3 of the first 64-bit
// hash word is clear (cmpeq with zero), used below to blend in the
// skein result; lanes with bit 3 set keep the groestl result.
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
// Groestl is AES-NI scalar code: run each lane separately.
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
// Candidate A = groestl output, candidate B = skein output.
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
// Unconditional groestl stage, again scalar per lane.
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Branch 2 mask from the jh output: blend bmw (bit 3 clear) against
// blake (bit 3 set).
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// Branch 3 mask from the skein output: blend jh (bit 3 clear)
// against keccak (bit 3 set).
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
// Only the first 256 bits of each lane are needed by the caller.
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
/* Scan a nonce range four lanes at a time with the 4-way Quark hash.
 *
 * The header is byte-swapped and interleaved 4x64; each lane's nonce is
 * the first 32-bit half of 64-bit word 9, i.e. vdata + 73 + 2*lane.
 * Quark's cheap pre-filter requires the top 24 bits of hash word 7 to
 * be zero before the full target test runs.
 *
 * Fix: work_set_target_ratio() now receives the hash of the lane that
 * actually matched (hash+8 / hash+16 / hash+24) instead of always
 * lane 0's hash, matching the other 4-way scanhash functions.
 *
 * Returns the number of matching lanes (0 on restart or exhaustion);
 * per-lane results are reported via work->nfound and work->nonces.
 */
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done)
{
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__((aligned(64)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    uint32_t *nonces = work->nonces;
    bool *found = work->nfound;
    int num_found = 0;
    // 4x64 interleave: lane l's nonce word is at 9*8 + 1 + 2*l.
    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
    uint32_t *noncep1 = vdata + 75;
    uint32_t *noncep2 = vdata + 77;
    uint32_t *noncep3 = vdata + 79;

    swab32_array( endiandata, pdata, 20 );

    uint64_t *edata = (uint64_t*)endiandata;
    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

    do
    {
       found[0] = found[1] = found[2] = found[3] = false;
       be32enc( noncep0, n );
       be32enc( noncep1, n+1 );
       be32enc( noncep2, n+2 );
       be32enc( noncep3, n+3 );

       quark_4way_hash( hash, vdata );
       pdata[19] = n;

       if ( ( hash[7] & 0xFFFFFF00 ) == 0 && fulltest( hash, ptarget ) )
       {
          found[0] = true;
          num_found++;
          nonces[0] = n;
          work_set_target_ratio( work, hash );
       }
       if ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+8, ptarget ) )
       {
          found[1] = true;
          num_found++;
          nonces[1] = n+1;
          work_set_target_ratio( work, hash+8 );    // was lane 0's hash
       }
       if ( ( (hash+16)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+16, ptarget ) )
       {
          found[2] = true;
          num_found++;
          nonces[2] = n+2;
          work_set_target_ratio( work, hash+16 );   // was lane 0's hash
       }
       if ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+24, ptarget ) )
       {
          found[3] = true;
          num_found++;
          nonces[3] = n+3;
          work_set_target_ratio( work, hash+24 );   // was lane 0's hash
       }
       n += 4;
    } while ( ( num_found == 0 ) && ( n < max_nonce )
              && !work_restart[thr_id].restart );

    *hashes_done = n - first_nonce + 1;
    return num_found;
}
#endif

17
algo/quark/quark-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "quark-gate.h"
// Wire the algo_gate for quark.  Chooses the 4-way implementation when
// QUARK_4WAY (AVX2 + AES) was compiled in, otherwise the scalar one;
// initializes the matching contexts up front.  Always returns true.
bool register_quark_algo( algo_gate_t* gate )
{
#if defined (QUARK_4WAY)
init_quark_4way_ctx();
gate->scanhash = (void*)&scanhash_quark_4way;
gate->hash = (void*)&quark_4way_hash;
#else
init_quark_ctx();
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
// Feature levels that benefit this algo.
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

32
algo/quark/quark-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef QUARK_GATE_H__
#define QUARK_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define QUARK_4WAY
#endif
bool register_quark_algo( algo_gate_t* gate );
#if defined(QUARK_4WAY)
void quark_4way_hash( void *state, const void *input );
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_4way_ctx();
#endif
void quark_hash( void *state, const void *input );
int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_ctx();
#endif

View File

@@ -1,5 +1,5 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#include "quark-gate.h"
#include <stdio.h>
#include <string.h>
@@ -47,7 +47,7 @@ void init_quark_ctx()
#endif
}
inline static void quarkhash(void *state, const void *input)
void quark_hash(void *state, const void *input)
{
unsigned char hashbuf[128];
size_t hashptr;
@@ -187,11 +187,12 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
quarkhash(hash64, &endiandata);
quark_hash(hash64, &endiandata);
if ((hash64[7]&0xFFFFFF00)==0)
{
if (fulltest(hash64, ptarget))
{
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
@@ -203,12 +204,3 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_quark_algo( algo_gate_t* gate )
{
init_quark_ctx();
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quarkhash;
return true;
};

View File

@@ -122,6 +122,7 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -134,6 +134,7 @@ int scanhash_qubit(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -754,6 +754,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) {
*hashes_done = n - pdata[19] + 1;
pdata[19] = data[i * 20 + 19];
work_set_target_ratio( work, hash );
return 1;
}
}

View File

@@ -114,7 +114,7 @@ available_implementations() {
return flags;
}
#endif
/*
static int
scrypt_test_mix() {
static const uint8_t expected[16] = {
@@ -145,4 +145,4 @@ scrypt_test_mix() {
return ret;
}
*/

View File

@@ -26,7 +26,7 @@
#include "scrypt-jane-pbkdf2.h"
#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
/*
static int
scrypt_test_hash() {
scrypt_hash_state st;
@@ -45,4 +45,4 @@ scrypt_test_hash() {
scrypt_hash_finish(&st, final);
return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
}
*/

View File

@@ -342,17 +342,6 @@ do { \
do { \
sph_u64 t0, t1, t2; \
__m256i h8; \
/* can LE be assumed? \
dec64le does nothing when SPH_LITTLE endian is set, as it is. \
__m256i m0 = _mm256_dec64le( buf ); \
__m256i m1 = _mm256_dec64le( buf + 8*4 ); \
__m256i m2 = _mm256_dec64le( buf + 16*4 ); \
__m256i m3 = _mm256_dec64le( buf + 24*4 ); \
__m256i m4 = _mm256_dec64le( buf + 32*4 ); \
__m256i m5 = _mm256_dec64le( buf + 40*4 ); \
__m256i m6 = _mm256_dec64le( buf + 48*4 ); \
__m256i m7 = _mm256_dec64le( buf + 56*4 ); \
*/ \
__m256i m0 = buf[0]; \
__m256i m1 = buf[1]; \
__m256i m2 = buf[2]; \

View File

@@ -39,7 +39,9 @@
*/
#ifndef __SKEIN_HASH_4WAY_H__
#define __SKEIN_HASH_4WAY_H__
#define __SKEIN_HASH_4WAY_H__ 1
#ifdef __AVX2__
#ifdef __cplusplus
extern "C"{
@@ -53,14 +55,15 @@ extern "C"{
#define SPH_SIZE_skein256 256
#define SPH_SIZE_skein512 512
#ifdef __AVX2__
typedef struct {
__m256i buf[8] __attribute__ ((aligned (32)));
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} skein512_4way_context;
} sph_skein_4way_big_context;
typedef sph_skein_4way_big_context skein512_4way_context;
typedef sph_skein_4way_big_context skein256_4way_context;
void skein512_4way_init(void *cc);
void skein512_4way(void *cc, const void *data, size_t len);
@@ -68,26 +71,15 @@ void skein512_4way_close(void *cc, void *dst);
//void sph_skein512_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __AVX__
typedef struct {
__m128i buf[8] __attribute__ ((aligned (32)));
__m128i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} skein256_4way_context;
void skein256_4way_init(void *cc);
void skein256_4way(void *cc, const void *data, size_t len);
void skein256_4way_close(void *cc, void *dst);
//void sph_skein256_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
#endif

231
algo/sm3/sm3-hash-4way.c Normal file
View File

@@ -0,0 +1,231 @@
/* ====================================================================
* Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the GmSSL Project.
* (http://gmssl.org/)"
*
* 4. The name "GmSSL Project" must not be used to endorse or promote
* products derived from this software without prior written
* permission. For written permission, please contact
* guanzhi1980@gmail.com.
*
* 5. Products derived from this software may not be called "GmSSL"
* nor may "GmSSL" appear in their names without prior written
* permission of the GmSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the GmSSL Project
* (http://gmssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
#include <string.h>
#include "sm3-hash-4way.h"
#ifdef __AVX__
// Initialize a 4-lane SM3 state: every lane receives the standard
// SM3 initial value; counters start at zero.
void sm3_4way_init( sm3_4way_ctx_t *ctx )
{
   // SM3 IV as specified by the standard, one 32-bit word per digest slot.
   static const uint32_t sm3_iv[8] = {
      0x7380166F, 0x4914B2B9, 0x172442D7, 0xDA8A0600,
      0xA96F30BC, 0x163138AA, 0xE38DEE4D, 0xB0FB0E4E
   };
   for ( int i = 0; i < 8; i++ )
      ctx->digest[i] = _mm_set1_epi32( sm3_iv[i] );
   ctx->nblocks = 0;
   ctx->num = 0;
}
// Absorb len bytes (per lane) into the 4-lane SM3 state.  The buffer
// and data are arrays of __m128i, one 32-bit message word across the
// four lanes per element, so pointer math advances in words (len>>2).
// NOTE(review): partial-block handling copies len>>2 words and adds
// len bytes to ctx->num — correct only when len and ctx->num stay
// multiples of 4; the mining callers use 32/64-byte inputs.  Confirm
// before feeding unaligned lengths.
void sm3_4way( void *cc, const void *data, size_t len )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *block = (__m128i*)ctx->block;
__m128i *vdata = (__m128i*)data;
// Fill a previously started partial block first.
if ( ctx->num )
{
unsigned int left = SM3_BLOCK_SIZE - ctx->num;
if ( len < left )
{
memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
ctx->num += len;
return;
}
else
{
memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
sm3_4way_compress( ctx->digest, block );
ctx->nblocks++;
vdata += left>>2;
len -= left;
}
}
// Compress full 64-byte blocks directly from the input.
while ( len >= SM3_BLOCK_SIZE )
{
sm3_4way_compress( ctx->digest, vdata );
ctx->nblocks++;
vdata += SM3_BLOCK_SIZE>>2;
len -= SM3_BLOCK_SIZE;
}
// Stash any tail bytes for the next call or for close().
ctx->num = len;
if ( len )
memcpy_128( block, vdata, len>>2 );
}
/* Finalize 4-lane SM3: append the 0x80 padding word and the 64-bit
 * big-endian bit length, compress, and byte-swap the digest into dst.
 *
 * Fixes:
 * - the padding word is written at word index ctx->num >> 2 — block[]
 *   holds one 32-bit word per __m128i (16 per block), and the adjacent
 *   memset already starts at (ctx->num >> 2) + 1; indexing by the byte
 *   count could write past the 16-word buffer.
 * - the no-room-for-length branch's memset count is in __m128i words,
 *   so it is (SM3_BLOCK_SIZE >> 2) - (ctx->num >> 2) - 1, not a byte
 *   count.
 * NOTE(review): like sm3_4way(), this assumes ctx->num is a multiple
 * of 4 (word granularity) — confirm for unaligned inputs.
 */
void sm3_4way_close( void *cc, void *dst )
{
   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
   __m128i *hash = (__m128i*)dst;
   __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
   __m128i *block = (__m128i*)ctx->block;
   int i;

   // Padding byte 0x80 goes into the word right after the data.
   block[ctx->num >> 2] = _mm_set1_epi32( 0x80 );

   if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
   {
      // Length fits in this block: zero between padding and length.
      memset_zero_128( block + (ctx->num >> 2) + 1,
                       ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
   }
   else
   {
      // No room for the length: zero out the rest of this block,
      // compress it, then start a fresh block holding only the length.
      memset_zero_128( block + (ctx->num >> 2) + 1,
                       ( SM3_BLOCK_SIZE >> 2 ) - (ctx->num >> 2) - 1 );
      sm3_4way_compress( ctx->digest, block );
      memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
   }

   // Total message length in bits = nblocks*512 + num*8, stored as two
   // big-endian 32-bit words (high word = nblocks >> 23).
   count[0] = mm_byteswap_32(
                _mm_set1_epi32( ctx->nblocks >> 23 ) );
   count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
                                              ( ctx->num << 3 ) ) );
   sm3_4way_compress( ctx->digest, block );

   // SM3 output is big-endian.
   for ( i = 0; i < 8 ; i++ )
      hash[i] = mm_byteswap_32( ctx->digest[i] );
}
// SM3 permutation P0: x ^ (x <<< 9) ^ (x <<< 17), vectorized 4 wide.
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \
mm_rotl_32( x, 17 ) ) )
// SM3 permutation P1: x ^ (x <<< 15) ^ (x <<< 23), used in expansion.
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 15 ), \
mm_rotl_32( x, 23 ) ) )
// Boolean function FF for rounds 0-15: x ^ y ^ z.
#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
// Boolean function FF for rounds 16-63: majority(x, y, z).
#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
_mm_and_si128( x, z ) ), \
_mm_and_si128( y, z ) )
// Boolean function GG for rounds 0-15: same as FF0.
#define GG0(x,y,z) FF0(x,y,z)
// Boolean function GG for rounds 16-63: (x & y) | (~x & z).
#define GG1(x,y,z) _mm_or_si128( _mm_and_si128( x, y ), \
_mm_andnot_si128( x, z ) )
// One SM3 compression over a 64-byte block, 4 lanes in parallel.
// block[] holds 16 big-endian message words, one __m128i per word.
void sm3_4way_compress( __m128i *digest, __m128i *block )
{
__m128i W[68], W1[64];
__m128i A = digest[ 0 ];
__m128i B = digest[ 1 ];
__m128i C = digest[ 2 ];
__m128i D = digest[ 3 ];
__m128i E = digest[ 4 ];
__m128i F = digest[ 5 ];
__m128i G = digest[ 6 ];
__m128i H = digest[ 7 ];
__m128i SS1, SS2, TT1, TT2, T;
int j;
// Message expansion: load W0..W15 (byte-swapped to native order),
// then derive W16..W67 and W'0..W'63 = Wj ^ Wj+4.
for ( j = 0; j < 16; j++ )
W[j] = mm_byteswap_32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
W[ j-9 ] ),
mm_rotl_32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( mm_rotl_32( W[ j-13 ], 7 ),
W[ j-6 ] ) );
for( j = 0; j < 64; j++ )
W1[j] = _mm_xor_si128( W[j], W[j+4] );
// Rounds 0-15 use constant T = 0x79CC4519 (rotated by the round
// number) and the FF0/GG0 boolean functions.
T = _mm_set1_epi32( 0x79CC4519UL );
for( j =0; j < 16; j++ )
{
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
mm_rotl_32( T, j ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rotl_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rotl_32( F, 19 );
F = E;
E = P0( TT2 );
}
// Rounds 16-63 use T = 0x7A879D8A (rotation amount j mod 32) and the
// FF1/GG1 boolean functions.
T = _mm_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
mm_rotl_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rotl_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rotl_32( F, 19 );
F = E;
E = P0( TT2 );
}
// Davies-Meyer style feed-forward: XOR the round output into the state.
digest[0] = _mm_xor_si128( digest[0], A );
digest[1] = _mm_xor_si128( digest[1], B );
digest[2] = _mm_xor_si128( digest[2], C );
digest[3] = _mm_xor_si128( digest[3], D );
digest[4] = _mm_xor_si128( digest[4], E );
digest[5] = _mm_xor_si128( digest[5], F );
digest[6] = _mm_xor_si128( digest[6], G );
digest[7] = _mm_xor_si128( digest[7], H );
}
#endif

89
algo/sm3/sm3-hash-4way.h Normal file
View File

@@ -0,0 +1,89 @@
/* ====================================================================
* Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the GmSSL Project.
* (http://gmssl.org/)"
*
* 4. The name "GmSSL Project" must not be used to endorse or promote
* products derived from this software without prior written
* permission. For written permission, please contact
* guanzhi1980@gmail.com.
*
* 5. Products derived from this software may not be called "GmSSL"
* nor may "GmSSL" appear in their names without prior written
* permission of the GmSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the GmSSL Project
* (http://gmssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
#ifndef SPH_SM3_HASH_4WAY_H
#define SPH_SM3_HASH_4WAY_H
#define SM3_DIGEST_LENGTH 32
#define SM3_BLOCK_SIZE 64
#define SM3_CBLOCK (SM3_BLOCK_SIZE)
#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH)
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "avxdefs.h"
#ifdef __cplusplus
extern "C" {
#endif
// Four-lane SM3 streaming context: lane k of each __m128i element carries
// stream k's data.
typedef struct {
    __m128i block[16] __attribute__ ((aligned (64)));  // buffered message block, one 64-byte block per lane
    __m128i digest[8];                                 // chaining state A..H per lane
    uint32_t nblocks;   // presumably the count of 64-byte blocks processed so far — confirm against sm3_4way()
    uint32_t num;       // presumably bytes currently buffered in 'block' (0..63) — confirm against sm3_4way()
} sm3_4way_ctx_t;
void sm3_4way_init( sm3_4way_ctx_t *ctx );
//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data,
// size_t data_len );
//void sm3_4way_final( sm3_4way_ctx_t *ctx,
// unsigned char digest[SM3_DIGEST_LENGTH] );
void sm3_4way_compress( __m128i *digest, __m128i *block );
void sm3_4way(void *cc, const void *data, size_t len);
void sm3_4way_close(void *cc, void *dst);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -189,7 +189,7 @@ void sm3_compress(uint32_t digest[8], const unsigned char block[64])
for(j =16; j < 64; j++) {
T[j] = 0x7A879D8A;
SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7);
SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j&31)), 7);
SS2 = SS1 ^ ROTATELEFT(A,12);
TT1 = FF1(A,B,C) + D + SS2 + W1[j];
TT2 = GG1(E,F,G) + H + SS1 + W[j];

View File

@@ -3468,9 +3468,10 @@ sph_ ## name ## _close(void *cc, void *dst) \
for (i = 0; i < 8; i ++) \
sph_enc64le((unsigned char *)dst + 8 * i, sc->state[i]); \
}
// sph_ ## name ## _init(cc); \
//}
/*
sph_ ## name ## _init(cc); \
}
*/
MAKE_CLOSE(whirlpool)
MAKE_CLOSE(whirlpool0)
MAKE_CLOSE(whirlpool1)

View File

@@ -22,6 +22,7 @@ void whirlpool_hash( void *state, const void *input );
int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_whirlpool_ctx();
#endif
#endif

View File

@@ -3345,8 +3345,10 @@ do { \
#define READ_STATE MUL8(READ_STATE_W)
#define ROUND0 MUL8(ROUND0_W)
#define UPDATE_STATE MUL8(UPDATE_STATE_W)
//#define BYTE(x, n) \
// _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
/*
#define BYTE(x, n) \
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
*/
#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF)

View File

@@ -162,7 +162,8 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
work_set_target_ratio( work, hash );
return 1;
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );

274
algo/x11/timetravel-4way.c Normal file
View File

@@ -0,0 +1,274 @@
#include "timetravel-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
// One context per hash function in the timetravel-8 suite.
// blake/bmw/skein/jh/keccak are 4-way SIMD contexts; groestl, luffa and
// cubehash are single-stream and are run once per lane in the hash loop.
typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
    jh512_4way_context      jh;
    keccak512_4way_context  keccak;
    hashState_luffa         luffa;
    cubehashParam           cube;
} tt8_4way_ctx_holder;
tt8_4way_ctx_holder tt8_4way_ctx __attribute__ ((aligned (64)));
// Initialise the pristine one-time contexts for all eight timetravel hash
// functions.  timetravel_4way_hash() memcpy's this master copy into a
// per-call working context, so this only needs to run once at registration.
void init_tt8_4way_ctx()
{
   blake512_4way_init( &tt8_4way_ctx.blake );
   bmw512_4way_init( &tt8_4way_ctx.bmw );
   init_groestl( &tt8_4way_ctx.groestl, 64 );
   skein512_4way_init( &tt8_4way_ctx.skein );
   jh512_4way_init( &tt8_4way_ctx.jh );
   keccak512_4way_init( &tt8_4way_ctx.keccak );
   init_luffa( &tt8_4way_ctx.luffa, 512 );
   cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
}
// Compute the timetravel (8-function) chained hash for four interleaved
// inputs at once.
//   input:  4x64-bit interleaved data, 80 bytes per lane (block header).
//   output: four 32-byte results stored back-to-back (lane0 at output+0,
//           lane1 at output+32, lane2 at output+64, lane3 at output+96).
// The order the eight functions are chained in comes from the file-scope
// 'permutation' array, which scanhash_timetravel_4way() derives from the
// block timestamp.
void timetravel_4way_hash(void *output, const void *input)
{
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
   uint64_t hash3[8] __attribute__ ((aligned (64)));
   // Ping-pong scratch buffers holding the interleaved 4-way data.
   uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
   uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
   uint64_t *vhashA, *vhashB;    // current stage's source / destination
   tt8_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   uint32_t dataLen = 64;
   int i;

   // Work on a private copy so the pristine master contexts stay reusable.
   memcpy( &ctx, &tt8_4way_ctx, sizeof(tt8_4way_ctx) );

   for ( i = 0; i < TT8_FUNC_COUNT; i++ )
   {
      if (i == 0)
      {
         // First stage consumes the 80-byte header directly from input.
         dataLen = 80;
         vhashA = (uint64_t*)input;
         vhashB = vhashX;
      }
      else
      {
         // Later stages hash the previous 64-byte digest, alternating
         // between the two scratch buffers.
         dataLen = 64;
         if ( i % 2 == 0 )
         {
            vhashA = vhashY;
            vhashB = vhashX;
         }
         else
         {
            vhashA = vhashX;
            vhashB = vhashY;
         }
      }

      // 4-way algos consume/produce interleaved data directly.  The
      // single-stream algos (groestl, luffa, cubehash) deinterleave into
      // hash0..hash3, run once per lane, then re-interleave.  On the final
      // stage (i == 7) the result is left deinterleaved in hash0..hash3
      // ready for output.
      switch ( permutation[i] )
      {
        case 0:
           blake512_4way( &ctx.blake, vhashA, dataLen );
           blake512_4way_close( &ctx.blake, vhashB );
           if ( i == 7 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 1:
           bmw512_4way( &ctx.bmw, vhashA, dataLen );
           bmw512_4way_close( &ctx.bmw, vhashB );
           if ( i == 7 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 2:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                     (char*)hash0, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                     (char*)hash1, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                     (char*)hash2, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                     (char*)hash3, dataLen<<3 );
           if ( i != 7 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 3:
           skein512_4way( &ctx.skein, vhashA, dataLen );
           skein512_4way_close( &ctx.skein, vhashB );
           if ( i == 7 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 4:
           jh512_4way( &ctx.jh, vhashA, dataLen );
           jh512_4way_close( &ctx.jh, vhashB );
           if ( i == 7 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 5:
           keccak512_4way( &ctx.keccak, vhashA, dataLen );
           keccak512_4way_close( &ctx.keccak, vhashB );
           if ( i == 7 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 6:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           // Luffa keeps no master reinit helper, so restore the pristine
           // context by copying before each extra lane.
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
                                   (const BitSequence *)hash0, dataLen );
           memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
                                   (const BitSequence*)hash1, dataLen );
           memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
                                   (const BitSequence*)hash2, dataLen );
           memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
                                   (const BitSequence*)hash3, dataLen );
           if ( i != 7 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 7:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
                                 (const byte*)hash0, dataLen );
           memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
                                 (const byte*)hash1, dataLen );
           memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
                                 (const byte*)hash2, dataLen );
           memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
                                 (const byte*)hash3, dataLen );
           if ( i != 7 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        default:
           applog(LOG_ERR,"SWERR: timetravel invalid permutation");
        break;
      }
   }

   // Emit the first 32 bytes (256 bits) of each lane's final 512-bit digest.
   // (Arithmetic on void* is a GCC extension used throughout this codebase.)
   memcpy( output,    hash0, 32 );
   memcpy( output+32, hash1, 32 );
   memcpy( output+64, hash2, 32 );
   memcpy( output+96, hash3, 32 );
}
// Scan nonces four at a time with the 4-way timetravel hash.
// Returns the number of lanes whose hash met the target (0 if none found
// before max_nonce or a work restart).  Winning nonces are reported via
// work->nonces[] with the matching work->nfound[] flags set.
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
                              uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   // In 4x64 interleaved layout each lane's nonce (32-bit word 19, the
   // high half of 64-bit word 9) sits at uint32 index (9*4 + lane)*2 + 1.
   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
   uint32_t *noncep1 = vdata + 75;
   uint32_t *noncep2 = vdata + 77;
   uint32_t *noncep3 = vdata + 79;
   const uint32_t Htarg = ptarget[7];
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   int i;

   if ( opt_benchmark )
      ptarget[7] = 0x0cff;

   // Byteswap header words 0..18 only; word 19 (the nonce) is written per
   // lane below.  NOTE(review): endiandata[19] is never initialised here
   // and gets interleaved into vdata, but every lane's nonce slot is then
   // overwritten by the be32enc() calls before hashing — confirm.
   for ( int k = 0; k < 19; k++ )
      be32enc( &endiandata[k], pdata[k] );

   const uint32_t timestamp = endiandata[17];
   if ( timestamp != s_ntime )
   {
      // ntime changed: rebuild the function ordering by stepping the
      // identity permutation (ntime - genesis) mod 8! times.
      const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
                    % TT8_FUNC_COUNT_PERMUTATIONS;
      for ( i = 0; i < TT8_FUNC_COUNT; i++ )
         permutation[i] = i;
      for ( i = 0; i < steps; i++ )
         tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
      s_ntime = timestamp;
   }

   // Replicate the header into all four lanes (640 bits = 80 bytes each).
   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   do
   {
      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep0, n );
      be32enc( noncep1, n+1 );
      be32enc( noncep2, n+2 );
      be32enc( noncep3, n+3 );

      timetravel_4way_hash( hash, vdata );
      pdata[19] = n;

      // Lane results are 8 uint32 apart; cheap high-word pre-check before
      // the full 256-bit fulltest().
      if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
      {
         found[0] = true;
         num_found++;
         nonces[0] = n;
         work_set_target_ratio( work, hash );
      }
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
      {
         found[1] = true;
         num_found++;
         nonces[1] = n+1;
         work_set_target_ratio( work, hash+8 );
      }
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
      {
         found[2] = true;
         num_found++;
         nonces[2] = n+2;
         work_set_target_ratio( work, hash+16 );
      }
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
      {
         found[3] = true;
         num_found++;
         nonces[3] = n+3;
         work_set_target_ratio( work, hash+24 );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );

   *hashes_done = n - first_nonce + 1;
   return num_found;
}
#endif

View File

@@ -0,0 +1,78 @@
#include "timetravel-gate.h"
// Timetravel shares are submitted at 1/256 of the job difficulty,
// further scaled by the user's --diff-factor option.
void tt8_set_target( struct work* work, double job_diff )
{
   double scaled_diff = job_diff / ( 256.0 * opt_diff_factor );
   work_set_target( work, scaled_diff );
}
// Register the timetravel (8-function, Machinecoin) algorithm with the
// algo gate.  Selects the AVX2/AES 4-way implementation when compiled in,
// otherwise the scalar path.  Always returns true.
bool register_timetravel_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL_4WAY
  init_tt8_4way_ctx();
  gate->scanhash      = (void*)&scanhash_timetravel_4way;
  gate->hash          = (void*)&timetravel_4way_hash;
#else
  init_tt8_ctx();
  gate->scanhash      = (void*)&scanhash_timetravel;
  gate->hash          = (void*)&timetravel_hash;
#endif
  gate->set_target    = (void*)&tt8_set_target;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64     = (void*)&get_max64_0xffffLL;
  return true;
}
// Exchange the two ints pointed to by a and b.
// Made 'static inline': a plain 'inline' definition at file scope provides
// no external definition in C99/C11, which can break linking at -O0, and
// internal linkage also prevents clashes with the identically named
// helpers in other gate files.
static inline void tt_swap( int *a, int *b )
{
   int tmp = *a;
   *a = *b;
   *b = tmp;
}
// Reverse the int range [pbegin, pend) in place; empty and single-element
// ranges are no-ops.  The element exchange is done inline with a local
// temporary.  Made 'static inline': a plain 'inline' definition at file
// scope provides no external definition in C99/C11, and the non-static
// 'reverse' here would collide with the identical helper defined in
// timetravel10-gate.c when both are linked.
static inline void reverse( int *pbegin, int *pend )
{
   while ( ( pbegin != pend ) && ( pbegin != --pend ) )
   {
      int tmp = *pbegin;
      *pbegin = *pend;
      *pend   = tmp;
      pbegin++;
   }
}
// Advance the int range [pbegin, pend) to its next lexicographic
// permutation, wrapping around to the fully sorted (first) permutation
// after the last one.  Direct port of the classic std::next_permutation
// algorithm; the original's bool result is discarded (see the
// "return; // true/false" comments).  Empty and single-element ranges
// are left unchanged.
void tt8_next_permutation( int *pbegin, int *pend )
{
   if ( pbegin == pend )
	return;

   int *i = pbegin;
   ++i;
   if ( i == pend )
	return;

   i = pend;
   --i;

   while (1)
   {
	int *j = i;
	--i;

        // Found the rightmost ascent *i < *j: swap *i with the rightmost
        // element greater than it, then reverse the suffix to minimise it.
	if ( *i < *j )
        {
	   int *k = pend;

	   while ( !(*i < *--k) ) /* do nothing */ ;

	   tt_swap( i, k );
	   reverse(j, pend);
		return; // true
	}

        // Whole range is descending: this was the last permutation, so
        // wrap back to the sorted (first) one.
	if ( i == pbegin )
        {
	   reverse(pbegin, pend);
		return; // false
	}
        // else?
   }
}

View File

@@ -0,0 +1,40 @@
#ifndef TIMETRAVEL_GATE_H__
#define TIMETRAVEL_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define TIMETRAVEL_4WAY
#endif
// Machinecoin Genesis Timestamp
#define TT8_FUNC_BASE_TIMESTAMP 1389040865
#define TT8_FUNC_COUNT 8
#define TT8_FUNC_COUNT_PERMUTATIONS 40320
void tt8_next_permutation( int *pbegin, int *pend );
bool register_timetravel_algo( algo_gate_t* gate );
#if defined(TIMETRAVEL_4WAY)
void timetravel_4way_hash( void *state, const void *input );
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt8_4way_ctx();
#endif
void timetravel_hash( void *state, const void *input );
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt8_ctx();
#endif

View File

@@ -1,11 +1,9 @@
#include "algo-gate-api.h"
#include "timetravel-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
@@ -13,75 +11,14 @@
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"
#else
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
// Machinecoin Genesis Timestamp
#define HASH_FUNC_BASE_TIMESTAMP 1389040865
#define HASH_FUNC_COUNT 8
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
inline void tt_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt_swap( pbegin, pend );
pbegin++;
}
}
static void next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
typedef struct {
sph_blake512_context blake;
@@ -101,7 +38,7 @@ typedef struct {
tt_ctx_holder tt_ctx __attribute__ ((aligned (64)));
__thread tt_ctx_holder tt_mid __attribute__ ((aligned (64)));
void init_tt_ctx()
void init_tt8_ctx()
{
sph_blake512_init( &tt_ctx.blake );
sph_bmw512_init( &tt_ctx.bmw );
@@ -119,7 +56,7 @@ void init_tt_ctx()
void timetravel_hash(void *output, const void *input)
{
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t hash[ 16 * TT8_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t *hashA, *hashB;
tt_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
@@ -130,7 +67,7 @@ void timetravel_hash(void *output, const void *input)
memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) );
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
{
if (i == 0)
{
@@ -270,7 +207,7 @@ void timetravel_hash(void *output, const void *input)
}
}
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
memcpy(output, &hash[16 * (TT8_FUNC_COUNT - 1)], 32);
}
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
@@ -296,12 +233,12 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
% TT8_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT );
tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
s_ntime = timestamp;
// do midstate precalc for first function
@@ -359,6 +296,7 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
work_set_target_ratio( work, hash );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
nonce++;
@@ -370,19 +308,4 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
void timetravel_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_tt_ctx();
gate->scanhash = (void*)&scanhash_timetravel;
gate->hash = (void*)&timetravel_hash;
gate->set_target = (void*)&timetravel_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

View File

@@ -0,0 +1,316 @@
#include "timetravel10-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
// One context per hash function in the timetravel10 (BitCore) suite.
// blake/bmw/skein/jh/keccak are 4-way SIMD contexts; groestl, luffa,
// cubehash, shavite and simd are single-stream and run once per lane.
typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
    jh512_4way_context      jh;
    keccak512_4way_context  keccak;
    hashState_luffa         luffa;
    cubehashParam           cube;
    sph_shavite512_context  shavite;
    hashState_sd            simd;
} tt10_4way_ctx_holder;
tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));
// Initialise the pristine one-time contexts for all ten timetravel10 hash
// functions.  timetravel10_4way_hash() memcpy's this master copy into a
// per-call working context, so this only needs to run once at registration.
void init_tt10_4way_ctx()
{
   blake512_4way_init( &tt10_4way_ctx.blake );
   bmw512_4way_init( &tt10_4way_ctx.bmw );
   init_groestl( &tt10_4way_ctx.groestl, 64 );
   skein512_4way_init( &tt10_4way_ctx.skein );
   jh512_4way_init( &tt10_4way_ctx.jh );
   keccak512_4way_init( &tt10_4way_ctx.keccak );
   init_luffa( &tt10_4way_ctx.luffa, 512 );
   cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 );
   sph_shavite512_init( &tt10_4way_ctx.shavite );
   init_sd( &tt10_4way_ctx.simd, 512 );
}
// Compute the timetravel10 (BitCore, 10-function) chained hash for four
// interleaved inputs at once.
//   input:  4x64-bit interleaved data, 80 bytes per lane (block header).
//   output: four 32-byte results stored back-to-back (lane0 at output+0,
//           lane1 at output+32, lane2 at output+64, lane3 at output+96).
// The chaining order of the ten functions comes from the file-scope
// 'permutation' array, recomputed per ntime by scanhash_timetravel10_4way().
void timetravel10_4way_hash(void *output, const void *input)
{
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
   uint64_t hash3[8] __attribute__ ((aligned (64)));
   // Ping-pong scratch buffers holding the interleaved 4-way data.
   uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
   uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
   uint64_t *vhashA, *vhashB;    // current stage's source / destination
   tt10_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   uint32_t dataLen = 64;
   int i;

   // Work on a private copy so the pristine master contexts stay reusable.
   memcpy( &ctx, &tt10_4way_ctx, sizeof(tt10_4way_ctx) );

   for ( i = 0; i < TT10_FUNC_COUNT; i++ )
   {
      if (i == 0)
      {
         // First stage consumes the 80-byte header directly from input.
         dataLen = 80;
         vhashA = (uint64_t*)input;
         vhashB = vhashX;
      }
      else
      {
         // Later stages hash the previous 64-byte digest, alternating
         // between the two scratch buffers.
         dataLen = 64;
         if ( i % 2 == 0 )
         {
            vhashA = vhashY;
            vhashB = vhashX;
         }
         else
         {
            vhashA = vhashX;
            vhashB = vhashY;
         }
      }

      // 4-way algos consume/produce interleaved data directly.  The
      // single-stream algos (groestl, luffa, cubehash, shavite, simd)
      // deinterleave into hash0..hash3, run once per lane, then
      // re-interleave.  On the final stage (i == 9) the result is left
      // deinterleaved in hash0..hash3 ready for output.
      switch ( permutation[i] )
      {
        case 0:
           blake512_4way( &ctx.blake, vhashA, dataLen );
           blake512_4way_close( &ctx.blake, vhashB );
           if ( i == 9 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 1:
           bmw512_4way( &ctx.bmw, vhashA, dataLen );
           bmw512_4way_close( &ctx.bmw, vhashB );
           if ( i == 9 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 2:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                     (char*)hash0, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                     (char*)hash1, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                     (char*)hash2, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                     (char*)hash3, dataLen<<3 );
           if ( i != 9 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 3:
           skein512_4way( &ctx.skein, vhashA, dataLen );
           skein512_4way_close( &ctx.skein, vhashB );
           if ( i == 9 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 4:
           jh512_4way( &ctx.jh, vhashA, dataLen );
           jh512_4way_close( &ctx.jh, vhashB );
           if ( i == 9 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 5:
           keccak512_4way( &ctx.keccak, vhashA, dataLen );
           keccak512_4way_close( &ctx.keccak, vhashB );
           if ( i == 9 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 6:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           // Restore the pristine luffa context by copy before each extra
           // lane (no reinit helper is used here).
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
                                   (const BitSequence *)hash0, dataLen );
           memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
                                   (const BitSequence*)hash1, dataLen );
           memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
                                   (const BitSequence*)hash2, dataLen );
           memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
                                   (const BitSequence*)hash3, dataLen );
           if ( i != 9 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 7:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
                                 (const byte*)hash0, dataLen );
           memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
                                 (const byte*)hash1, dataLen );
           memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
                                 (const byte*)hash2, dataLen );
           memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
                                 (const byte*)hash3, dataLen );
           if ( i != 9 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 8:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           sph_shavite512( &ctx.shavite, hash0, dataLen );
           sph_shavite512_close( &ctx.shavite, hash0 );
           memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
           sph_shavite512( &ctx.shavite, hash1, dataLen );
           sph_shavite512_close( &ctx.shavite, hash1 );
           memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
           sph_shavite512( &ctx.shavite, hash2, dataLen );
           sph_shavite512_close( &ctx.shavite, hash2 );
           memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
           sph_shavite512( &ctx.shavite, hash3, dataLen );
           sph_shavite512_close( &ctx.shavite, hash3 );
           if ( i != 9 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 9:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           // simd (sd) takes its length in bits, hence dataLen<<3.
           update_final_sd( &ctx.simd, (BitSequence *)hash0,
                            (const BitSequence *)hash0, dataLen<<3 );
           memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
           update_final_sd( &ctx.simd, (BitSequence *)hash1,
                            (const BitSequence *)hash1, dataLen<<3 );
           memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
           update_final_sd( &ctx.simd, (BitSequence *)hash2,
                            (const BitSequence *)hash2, dataLen<<3 );
           memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
           update_final_sd( &ctx.simd, (BitSequence *)hash3,
                            (const BitSequence *)hash3, dataLen<<3 );
           if ( i != 9 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        default:
           applog(LOG_ERR,"SWERR: timetravel invalid permutation");
        break;
      }
   }

   // Emit the first 32 bytes (256 bits) of each lane's final 512-bit digest.
   memcpy( output,    hash0, 32 );
   memcpy( output+32, hash1, 32 );
   memcpy( output+64, hash2, 32 );
   memcpy( output+96, hash3, 32 );
}
// Scan nonces four at a time with the 4-way timetravel10 hash.
// Returns the number of lanes whose hash met the target (0 if none found
// before max_nonce or a work restart).  Winning nonces are reported via
// work->nonces[] with the matching work->nfound[] flags set.
int scanhash_timetravel10_4way( int thr_id, struct work *work,
                                uint32_t max_nonce, uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   // In 4x64 interleaved layout each lane's nonce (32-bit word 19, the
   // high half of 64-bit word 9) sits at uint32 index (9*4 + lane)*2 + 1.
   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
   uint32_t *noncep1 = vdata + 75;
   uint32_t *noncep2 = vdata + 77;
   uint32_t *noncep3 = vdata + 79;
   const uint32_t Htarg = ptarget[7];
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   int i;

   if ( opt_benchmark )
      ptarget[7] = 0x0cff;

   // Byteswap header words 0..18 only; word 19 (the nonce) is written per
   // lane below.  NOTE(review): endiandata[19] is never initialised here
   // and gets interleaved into vdata, but every lane's nonce slot is then
   // overwritten by the be32enc() calls before hashing — confirm.
   for ( int k = 0; k < 19; k++ )
      be32enc( &endiandata[k], pdata[k] );

   const uint32_t timestamp = endiandata[17];
   if ( timestamp != s_ntime )
   {
      // ntime changed: rebuild the function ordering by stepping the
      // identity permutation (ntime - genesis) mod the permutation count.
      const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
                    % TT10_FUNC_COUNT_PERMUTATIONS;
      for ( i = 0; i < TT10_FUNC_COUNT; i++ )
         permutation[i] = i;
      for ( i = 0; i < steps; i++ )
         tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
      s_ntime = timestamp;
   }

   // Replicate the header into all four lanes (640 bits = 80 bytes each).
   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   do
   {
      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep0, n );
      be32enc( noncep1, n+1 );
      be32enc( noncep2, n+2 );
      be32enc( noncep3, n+3 );

      timetravel10_4way_hash( hash, vdata );
      pdata[19] = n;

      // Lane results are 8 uint32 apart; cheap high-word pre-check before
      // the full 256-bit fulltest().
      if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
      {
         found[0] = true;
         num_found++;
         nonces[0] = n;
         work_set_target_ratio( work, hash );
      }
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
      {
         found[1] = true;
         num_found++;
         nonces[1] = n+1;
         work_set_target_ratio( work, hash+8 );
      }
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
      {
         found[2] = true;
         num_found++;
         nonces[2] = n+2;
         work_set_target_ratio( work, hash+16 );
      }
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
      {
         found[3] = true;
         num_found++;
         nonces[3] = n+3;
         work_set_target_ratio( work, hash+24 );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );

   *hashes_done = n - first_nonce + 1;
   return num_found;
}
#endif

View File

@@ -0,0 +1,78 @@
#include "timetravel10-gate.h"

// Timetravel10 shares are submitted at 1/256 of the job difficulty,
// further scaled by the user's --diff-factor option.
void tt10_set_target( struct work* work, double job_diff )
{
   double scaled_diff = job_diff / ( 256.0 * opt_diff_factor );
   work_set_target( work, scaled_diff );
}
// Register the timetravel10 (BitCore) algorithm with the algo gate.
// Selects the AVX2/AES 4-way implementation when compiled in, otherwise
// the scalar path.  Always returns true.
bool register_timetravel10_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL10_4WAY
  init_tt10_4way_ctx();
  gate->scanhash      = (void*)&scanhash_timetravel10_4way;
  gate->hash          = (void*)&timetravel10_4way_hash;
#else
  init_tt10_ctx();
  gate->scanhash      = (void*)&scanhash_timetravel10;
  gate->hash          = (void*)&timetravel10_hash;
#endif
  gate->set_target    = (void*)&tt10_set_target;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64     = (void*)&get_max64_0xffffLL;
  return true;
}
// Exchange the two ints pointed to by a and b.
// Made 'static inline': a plain 'inline' definition at file scope provides
// no external definition in C99/C11, which can break linking at -O0, and
// internal linkage keeps this private to the timetravel10 gate.
static inline void tt10_swap( int *a, int *b )
{
   int tmp = *a;
   *a = *b;
   *b = tmp;
}
// Reverse the int range [pbegin, pend) in place; empty and single-element
// ranges are no-ops.  The element exchange is done inline with a local
// temporary.  Made 'static inline': a plain 'inline' definition at file
// scope provides no external definition in C99/C11, and the non-static
// 'reverse' here would collide with the identical helper defined in
// timetravel-gate.c when both are linked.
static inline void reverse( int *pbegin, int *pend )
{
   while ( ( pbegin != pend ) && ( pbegin != --pend ) )
   {
      int tmp = *pbegin;
      *pbegin = *pend;
      *pend   = tmp;
      pbegin++;
   }
}
// Advance the int range [pbegin, pend) to its next lexicographic
// permutation, wrapping around to the fully sorted (first) permutation
// after the last one.  Direct port of the classic std::next_permutation
// algorithm; the original's bool result is discarded (see the
// "return; // true/false" comments).  Empty and single-element ranges
// are left unchanged.
void tt10_next_permutation( int *pbegin, int *pend )
{
   if ( pbegin == pend )
	return;

   int *i = pbegin;
   ++i;
   if ( i == pend )
	return;

   i = pend;
   --i;

   while (1)
   {
	int *j = i;
	--i;

        // Found the rightmost ascent *i < *j: swap *i with the rightmost
        // element greater than it, then reverse the suffix to minimise it.
	if ( *i < *j )
        {
	   int *k = pend;

	   while ( !(*i < *--k) ) /* do nothing */ ;

	   tt10_swap( i, k );
	   reverse(j, pend);
		return; // true
	}

        // Whole range is descending: this was the last permutation, so
        // wrap back to the sorted (first) one.
	if ( i == pbegin )
        {
	   reverse(pbegin, pend);
		return; // false
	}
        // else?
   }
}

View File

@@ -0,0 +1,39 @@
#ifndef TIMETRAVEL10_GATE_H__
#define TIMETRAVEL10_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define TIMETRAVEL10_4WAY
#endif
// BitCore Genesis Timestamp
#define TT10_FUNC_BASE_TIMESTAMP 1492973331U
#define TT10_FUNC_COUNT 10
#define TT10_FUNC_COUNT_PERMUTATIONS 40320
void tt10_next_permutation( int *pbegin, int *pend );
bool register_timetravel10_algo( algo_gate_t* gate );
#if defined(TIMETRAVEL10_4WAY)
void timetravel10_4way_hash( void *state, const void *input );
int scanhash_timetravel10_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done );
void init_tt10_4way_ctx();
#endif
void timetravel10_hash( void *state, const void *input );
int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt10_ctx();
#endif

View File

@@ -1,11 +1,8 @@
#include "algo-gate-api.h"
#include "timetravel10-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
@@ -22,68 +19,8 @@
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
// BitCore Genesis Timestamp
#define HASH_FUNC_BASE_TIMESTAMP 1492973331U
#define HASH_FUNC_COUNT 10
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
inline void tt10_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt10_swap( pbegin, pend );
pbegin++;
}
}
static void next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt10_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
typedef struct {
sph_blake512_context blake;
@@ -125,7 +62,7 @@ void init_tt10_ctx()
void timetravel10_hash(void *output, const void *input)
{
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t hash[ 16 * TT10_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t *hashA, *hashB;
tt10_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
@@ -136,7 +73,7 @@ void timetravel10_hash(void *output, const void *input)
memcpy( &ctx, &tt10_ctx, sizeof(tt10_ctx) );
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
{
if (i == 0)
{
@@ -302,7 +239,7 @@ void timetravel10_hash(void *output, const void *input)
}
}
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
memcpy(output, &hash[16 * (TT10_FUNC_COUNT - 1)], 32);
}
int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
@@ -328,12 +265,12 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
% TT10_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT );
tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
s_ntime = timestamp;
// do midstate precalc for first function
@@ -398,6 +335,7 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
{
work_set_target_ratio( work, hash );
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
@@ -409,20 +347,3 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void timetravel10_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel10_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_tt10_ctx();
gate->scanhash = (void*)&scanhash_timetravel10;
gate->hash = (void*)&timetravel10_hash;
gate->set_target = (void*)&timetravel10_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

View File

@@ -179,6 +179,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
}
@@ -189,14 +190,3 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
/*
bool register_x11_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x11_ctx();
gate->scanhash = (void*)&scanhash_x11;
gate->hash = (void*)&x11_hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};
*/

340
algo/x11/x11evo-4way.c Normal file
View File

@@ -0,0 +1,340 @@
#include "cpuminer-config.h"
#include "x11evo-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <string.h>
#include <stdint.h>
#include <compat/portable_endian.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11evo_4way_ctx_holder;
static x11evo_4way_ctx_holder x11evo_4way_ctx __attribute__ ((aligned (64)));
void init_x11evo_4way_ctx()
{
blake512_4way_init( &x11evo_4way_ctx.blake );
bmw512_4way_init( &x11evo_4way_ctx.bmw );
init_groestl( &x11evo_4way_ctx.groestl, 64 );
skein512_4way_init( &x11evo_4way_ctx.skein );
jh512_4way_init( &x11evo_4way_ctx.jh );
keccak512_4way_init( &x11evo_4way_ctx.keccak );
init_luffa( &x11evo_4way_ctx.luffa, 512 );
cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11evo_4way_ctx.shavite );
init_sd( &x11evo_4way_ctx.simd, 512 );
init_echo( &x11evo_4way_ctx.echo, 512 );
}
static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX;
void x11evo_4way_hash( void *state, const void *input )
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
x11evo_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &x11evo_4way_ctx, sizeof(x11evo_4way_ctx) );
if ( s_seq == -1 )
{
uint32_t *data = (uint32_t*) input;
const uint32_t ntime = data[17];
evo_twisted_code( ntime, hashOrder );
}
int i;
int len = strlen( hashOrder );
for ( i = 0; i < len; i++ )
{
char elem = hashOrder[i];
uint8_t idx;
if ( elem >= 'A' )
idx = elem - 'A' + 10;
else
idx = elem - '0';
// int size = 64;
switch ( idx )
{
case 0:
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 2:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 7:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*) hash3, 64 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 8:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 10:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
}
}
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
//static const uint32_t diff1targ = 0x0000ffff;
int scanhash_x11evo_4way( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
swab32_array( endiandata, pdata, 20 );
int ntime = endiandata[17];
if ( ntime != s_ntime || s_seq == -1 )
{
evo_twisted_code( ntime, hashOrder );
s_ntime = ntime;
}
uint32_t hmask = 0xFFFFFFFF;
if ( Htarg > 0 )
{
if ( Htarg <= 0xF )
hmask = 0xFFFFFFF0;
else if ( Htarg <= 0xFF )
hmask = 0xFFFFFF00;
else if ( Htarg <= 0xFFF )
hmask = 0xFFFF000;
else if ( Htarg <= 0xFFFF )
hmask = 0xFFFF000;
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11evo_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & hmask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & hmask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & hmask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & hmask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

95
algo/x11/x11evo-gate.c Normal file
View File

@@ -0,0 +1,95 @@
#include "x11evo-gate.h"
int s_seq = -1;
static inline int getCurrentAlgoSeq( uint32_t current_time )
{
// change once per day
return (int) (current_time - X11EVO_INITIAL_DATE) / (60 * 60 * 24);
}
// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
{
uint8_t __tmp = *a;
*a = *b;
*b = __tmp;
}
void initPerm( uint8_t n[], uint8_t count )
{
int i;
for ( i = 0; i<count; i++ )
n[i] = i;
}
int nextPerm( uint8_t n[], uint32_t count )
{
uint32_t tail = 0, i = 0, j = 0;
if (unlikely( count <= 1 ))
return 0;
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
tail = i;
if ( tail > 0 )
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
evo_swap( &n[tail - 1], &n[j] );
for ( i = tail, j = count - 1; i<j; i++, j-- )
evo_swap( &n[i], &n[j] );
return ( tail != 0 );
}
void getAlgoString( char *str, uint32_t count )
{
uint8_t algoList[X11EVO_FUNC_COUNT];
char *sptr;
int j;
int k;
initPerm( algoList, X11EVO_FUNC_COUNT );
for ( k = 0; k < count; k++ )
nextPerm( algoList, X11EVO_FUNC_COUNT );
sptr = str;
for ( j = 0; j < X11EVO_FUNC_COUNT; j++ )
{
if ( algoList[j] >= 10 )
sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
else
sprintf( sptr, "%u", algoList[j] );
sptr++;
}
*sptr = 0;
//applog(LOG_DEBUG, "nextPerm %s", str);
}
void evo_twisted_code( uint32_t ntime, char *permstr )
{
int seq = getCurrentAlgoSeq( ntime );
if ( s_seq != seq )
{
getAlgoString( permstr, seq );
s_seq = seq;
}
}
bool register_x11evo_algo( algo_gate_t* gate )
{
#if defined (X11EVO_4WAY)
init_x11evo_4way_ctx();
gate->scanhash = (void*)&scanhash_x11evo_4way;
gate->hash = (void*)&x11evo_4way_hash;
#else
init_x11evo_ctx();
gate->scanhash = (void*)&scanhash_x11evo;
gate->hash = (void*)&x11evo_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

39
algo/x11/x11evo-gate.h Normal file
View File

@@ -0,0 +1,39 @@
#ifndef X11EVO_GATE_H__
#define X11EVO_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X11EVO_4WAY
#endif
#define X11EVO_INITIAL_DATE 1462060800
#define X11EVO_FUNC_COUNT 11
extern int s_seq;
bool register_x11evo_algo( algo_gate_t* gate );
#if defined(X11EVO_4WAY)
void x11evo_4way_hash( void *state, const void *input );
int scanhash_x11evo_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11evo_4way_ctx();
#endif
void x11evo_hash( void *state, const void *input );
int scanhash_x11evo( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11evo_ctx();
void evo_twisted_code( uint32_t ntime, char *permstr );
#endif

View File

@@ -1,5 +1,5 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#include "x11evo-gate.h"
#include <string.h>
#include <stdint.h>
@@ -26,9 +26,6 @@
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#define INITIAL_DATE 1462060800
#define HASH_FUNC_COUNT 11
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
@@ -70,94 +67,10 @@ void init_x11evo_ctx()
sph_shavite512_init( &x11evo_ctx.shavite );
}
/*
uint32_t getCurrentAlgoSeq(uint32_t current_time, uint32_t base_time)
{
return (current_time - base_time) / (60 * 60 * 24);
}
*/
static inline int getCurrentAlgoSeq( uint32_t current_time )
{
// change once per day
return (int) (current_time - INITIAL_DATE) / (60 * 60 * 24);
}
// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
{
uint8_t __tmp = *a;
*a = *b;
*b = __tmp;
}
void initPerm( uint8_t n[], uint8_t count )
{
int i;
for ( i = 0; i<count; i++ )
n[i] = i;
}
int nextPerm( uint8_t n[], uint32_t count )
{
uint32_t tail = 0, i = 0, j = 0;
if (unlikely( count <= 1 ))
return 0;
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
tail = i;
if ( tail > 0 )
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
evo_swap( &n[tail - 1], &n[j] );
for ( i = tail, j = count - 1; i<j; i++, j-- )
evo_swap( &n[i], &n[j] );
return ( tail != 0 );
}
void getAlgoString( char *str, uint32_t count )
{
uint8_t algoList[HASH_FUNC_COUNT];
char *sptr;
int j;
int k;
initPerm( algoList, HASH_FUNC_COUNT );
for ( k = 0; k < count; k++ )
nextPerm( algoList, HASH_FUNC_COUNT );
sptr = str;
for ( j = 0; j < HASH_FUNC_COUNT; j++ )
{
if ( algoList[j] >= 10 )
sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
else
sprintf( sptr, "%u", algoList[j] );
sptr++;
}
*sptr = 0;
//applog(LOG_DEBUG, "nextPerm %s", str);
}
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX;
static int s_seq = -1;
static void evo_twisted_code(uint32_t ntime, char *permstr)
{
int seq = getCurrentAlgoSeq(ntime);
if (s_seq != seq)
{
getAlgoString(permstr, seq);
s_seq = seq;
}
}
static inline void x11evo_hash( void *state, const void *input )
void x11evo_hash( void *state, const void *input )
{
uint32_t hash[16] __attribute__ ((aligned (64)));
x11evo_ctx_holder ctx __attribute__ ((aligned (64)));
@@ -242,10 +155,10 @@ static inline void x11evo_hash( void *state, const void *input )
memcpy( state, hash, 32 );
}
static const uint32_t diff1targ = 0x0000ffff;
//static const uint32_t diff1targ = 0x0000ffff;
int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
unsigned long *hashes_done )
uint64_t *hashes_done )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
@@ -274,19 +187,20 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
else if ( Htarg <= 0xFFF )
hmask = 0xFFFF000;
else if ( Htarg <= 0xFFFF )
hmask = 0xFFFF000;
hmask = 0xFFFF000;
}
do
{
pdata[19] = ++n;
be32enc( &endiandata[19], n );
x11evo_hash( hash64, &endiandata );
x11evo_hash( hash64, endiandata );
if ( ( hash64[7] & hmask ) == 0 )
{
if ( fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
}
@@ -296,13 +210,3 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
bool register_x11evo_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_x11evo;
gate->hash = (void*)&x11evo_hash;
init_x11evo_ctx();
return true;
};

View File

@@ -161,6 +161,7 @@ int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
nonce++;

View File

@@ -116,6 +116,7 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}

View File

@@ -70,6 +70,7 @@ int scanhash_skunk( int thr_id, struct work *work, uint32_t max_nonce,
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
nonce++;

View File

@@ -234,6 +234,7 @@ int scanhash_x13(int thr_id, struct work *work, uint32_t max_nonce,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -17,7 +17,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/sm3/sph_sm3.h"
#include "algo/sm3/sm3-hash-4way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
@@ -33,7 +33,7 @@ typedef struct {
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sm3_ctx_t sm3;
sm3_4way_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
} x13sm3_4way_ctx_holder;
@@ -54,7 +54,7 @@ void init_x13sm3_4way_ctx()
sph_shavite512_init( &x13sm3_4way_ctx.shavite );
init_sd( &x13sm3_4way_ctx.simd, 512 );
init_echo( &x13sm3_4way_ctx.echo, 512 );
sm3_init( &x13sm3_4way_ctx.sm3 );
sm3_4way_init( &x13sm3_4way_ctx.sm3 );
sph_hamsi512_init( &x13sm3_4way_ctx.hamsi );
sph_fugue512_init( &x13sm3_4way_ctx.fugue );
};
@@ -85,14 +85,11 @@ void x13sm3_4way_hash( void *state, const void *input )
// Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
@@ -178,6 +175,8 @@ void x13sm3_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 );
// SM3
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
memset( sm3_vhash, 0, sizeof sm3_vhash );
uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
memset( sm3_hash0, 0, sizeof sm3_hash0 );
uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
@@ -187,17 +186,11 @@ void x13sm3_4way_hash( void *state, const void *input )
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 );
sph_sm3( &ctx.sm3, hash0, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash0 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash1, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash1 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash2, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash2 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash3, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash3 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
mm_deinterleave_4x32( sm3_hash0, sm3_hash1, sm3_hash2, sm3_hash3,
sm3_vhash, 1024 );
// Hamsi
sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 );

View File

@@ -224,6 +224,7 @@ int scanhash_x13sm3( int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -65,6 +65,7 @@ int scanhash_axiom(int thr_id, struct work *work,
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
work_set_target_ratio( work, hash64 );
return true;
}
n++;

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#if defined(__AVX2__) && defined(__AES__)
#define POLYTIMOS_4WAY
#endif

View File

@@ -233,6 +233,7 @@ int scanhash_x14(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -47,7 +47,6 @@ void init_x15_4way_ctx()
{
blake512_4way_init( &x15_4way_ctx.blake );
bmw512_4way_init( &x15_4way_ctx.bmw );
sph_bmw512_init( &x15_4way_ctx.bmw );
init_groestl( &x15_4way_ctx.groestl, 64 );
skein512_4way_init( &x15_4way_ctx.skein );
jh512_4way_init( &x15_4way_ctx.jh );

View File

@@ -245,6 +245,7 @@ int scanhash_x15(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -266,6 +266,7 @@ int scanhash_x17(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
@@ -281,13 +282,3 @@ int scanhash_x17(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
/*
bool register_x17_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x17_ctx();
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17hash;
return true;
};
*/

View File

@@ -35,10 +35,18 @@
#define mm_one_64 _mm_set1_epi64x( 1ULL )
#define mm_one_32 _mm_set1_epi32( 1UL )
#define mm_one_16 _mm_set1_epi16( 1U )
#define mm_one_8 _mm_set1_epi8( 1U )
// Constant minus 1
#define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
// Lane index, useful for byte rotate using shuffle
#define mm_lanex_64 _mm_set_epi64( 1ULL, 0ULL );
#define mm_lanex_32 _mm_set_epi32( 3UL, 2UL, 1UL, 0UL );
#define mm_lanex_16 _mm_set_epi16( 7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U );
#define mm_lanex_8 _mm_set_epi8( 15U, 14U, 13U, 12U, 11U, 10U , 9U, 8U, \
7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U );
//
// Basic operations without equivalent SIMD intrinsic
@@ -327,6 +335,16 @@ inline __m128i mm_byteswap_16( __m128i x )
// Constant minus 1
#define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
// Lane index, useful for rotate using permutevar
#define mm256_lane_64 _mm_set_epi64x( 3ULL, 2ULL, 1ULL, 0ULL );
#define mm256_lane_32 _mm_set_epi32( 7UL, 6UL, 5UL, 4UL, 3UL, 2UL, 1UL, 0UL );
#define mm256_lane_16 _mm_set_epi16( 15U, 14U, 13U, 12U, 11U, 10U , 9U, 8U, \
7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U );
#define mm256_lane_8 _mm_set_epi8( 31U, 30U, 29U, 28U, 27U, 26U, 25U, 24U, \
23U, 22U, 21U, 20U, 19U, 18U, 17U, 16U, \
15U, 14U, 13U, 12U, 11U, 10U , 9U, 8U, \
7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U );
//
// Basic operations without SIMD equivalent
@@ -1109,7 +1127,7 @@ inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
}
// Can't do it in place
inline void mm256_reinterleave_4x64x( void *dst, void *src, int bit_len )
inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len )
{
__m256i* d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
@@ -1146,7 +1164,8 @@ inline void mm256_reinterleave_4x64x( void *dst, void *src, int bit_len )
// likely of no use.
// convert 4x32 byte (128 bit) vectors to 4x64 (256 bit) vectors for AVX2
// bit_len must be multiple of 64
inline void mm256_reinterleave_4x64( uint64_t *dst, uint32_t *src,
// broken
inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,
int bit_len )
{
uint32_t *d = (uint32_t*)dst;
@@ -1200,6 +1219,7 @@ inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
// bit_len == 1024
}
// not used
inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len )
{
uint32_t *d = (uint32_t*)dst;

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.9.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.10.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.7.9'
PACKAGE_STRING='cpuminer-opt 3.7.9'
PACKAGE_VERSION='3.7.10'
PACKAGE_STRING='cpuminer-opt 3.7.10'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.7.9 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.7.10 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.7.9:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.7.10:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.7.9
cpuminer-opt configure 3.7.10
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.7.9, which was
It was created by cpuminer-opt $as_me 3.7.10, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.7.9'
VERSION='3.7.10'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.7.9, which was
This file was extended by cpuminer-opt $as_me 3.7.10, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.7.9
cpuminer-opt config.status 3.7.10
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.7.9])
AC_INIT([cpuminer-opt], [3.7.10])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -346,6 +346,7 @@ static bool work_decode( const json_t *val, struct work *work )
work->targetdiff = target_to_diff( work->target );
// for api stats, on longpoll pools
stratum_diff = work->targetdiff;
work->sharediff = 0;
algo_gate.display_extra_data( work, &net_blocks );
return true;
}
@@ -755,6 +756,7 @@ static int share_result( int result, struct work *work, const char *reason )
uint32_t total_submits;
float rate;
char rate_s[8] = {0};
double sharediff = work ? work->sharediff : stratum.sharediff;
int i;
pthread_mutex_lock(&stats_lock);
@@ -814,6 +816,8 @@ static int share_result( int result, struct work *work, const char *reason )
sprintf(hr, "%.2f", hashrate );
}
if ( sharediff == 0 )
{
#if ((defined(_WIN64) || defined(__WINDOWS__)))
applog( LOG_NOTICE, "%s %lu/%lu (%s%%), %s %sH, %s %sH/s",
sres, ( result ? accepted_count : rejected_count ),
@@ -824,6 +828,20 @@ static int share_result( int result, struct work *work, const char *reason )
total_submits, rate_s, hc, hc_units, hr, hr_units,
(uint32_t)cpu_temp(0) );
#endif
}
else
{
#if ((defined(_WIN64) || defined(__WINDOWS__)))
applog( LOG_NOTICE, "%s %lu/%lu (%s%%), diff %.3g, %s %sH/s",
sres, ( result ? accepted_count : rejected_count ),
total_submits, rate_s, sharediff, hr, hr_units );
#else
applog( LOG_NOTICE, "%s %lu/%lu (%s%%), diff %.3g, %s %sH/s, %dC",
sres, ( result ? accepted_count : rejected_count ),
total_submits, rate_s, sharediff, hr, hr_units,
(uint32_t)cpu_temp(0) );
#endif
}
if (reason)
{
@@ -1026,6 +1044,7 @@ static bool submit_upstream_work( CURL *curl, struct work *work )
}
if ( have_stratum )
{
stratum.sharediff = work->sharediff;
algo_gate.build_stratum_request( req, work, &stratum );
if ( unlikely( !stratum_send_line( &stratum, req ) ) )
{
@@ -1569,7 +1588,7 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr, bool clean_job )
{
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) ) )
@@ -2984,25 +3003,22 @@ bool check_cpu_capability ()
}
if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
{
if ( sw_has_4way && algo_has_4way )
printf( "A CPU with AES and AVX2 is required to use 4way!\n" );
else if ( algo_has_avx2 )
printf( "A CPU with AES and AVX2 is required!\n" );
printf( "The SW build requires a CPU with AES and AVX2!\n" );
return false;
}
if ( sw_has_avx && !( cpu_has_avx && cpu_has_aes ) )
if ( sw_has_avx && !cpu_has_avx )
{
printf( "A CPU with AES and AVX2 is required!\n" );
printf( "The SW build requires a CPU with AVX!\n" );
return false;
}
if ( sw_has_aes && algo_has_aes && !cpu_has_aes )
if ( sw_has_aes && !cpu_has_aes )
{
printf( "A CPU with AES is required!\n" );
printf( "The SW build requires a CPU with AES!\n" );
return false;
}
if ( sw_has_sha && algo_has_sha && !cpu_has_sha )
if ( sw_has_sha && !cpu_has_sha )
{
printf( "A CPU with SHA is required!\n" );
printf( "The SW build requires a CPU with SHA!\n" );
return false;
}
@@ -3187,6 +3203,9 @@ int main(int argc, char *argv[])
}
#endif
if ( num_cpus != opt_n_threads )
applog( LOG_INFO,"%u CPU cores available, %u miner threads selected.",
num_cpus, opt_n_threads );
if ( opt_affinity != -1 )
{
if ( num_cpus > 64 )

View File

@@ -736,7 +736,7 @@ Options:\n\
whirlpool\n\
whirlpoolx\n\
x11 Dash\n\
x11evo Revolvercoin\n\
x11evo Revolvercoin (XRE)\n\
x11gost sib (SibCoin)\n\
x13 X13\n\
x13sm3 hsr (Hshare)\n\