v3.9.4

2025-09-17 23:44:27 +00:00 · 2019-06-18 13:15:45 -04:00
parent 71d6b97ee8
commit d6e8d7a46e
75 changed files with 1790 additions and 781 deletions
--- a/3
+++ b/3
@@ -42,9 +42,6 @@ openssl 1.1.0e or higher. Add one of the following, depending on the
 compiler version, to CFLAGS:
 "-march=native" or "-march=znver1" or "-msha".
 Due to poor AVX2 performance on Ryzen users should add -DRYZEN_ to CFLAGS
 to override multiway AVX2 on algos with sha256, and use SHA instead.
 Additional instructions for static compilalation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
 Static builds should only considered in a homogeneous HW and SW environment.
--- a/Makefile.am
+++ b/Makefile.am
@@ -131,6 +131,7 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2h-4way.c \
  algo/lyra2/allium-4way.c \
  algo/lyra2/allium.c \
  algo/lyra2/phi2-4way.c \
  algo/lyra2/phi2.c \
  algo/m7m.c \
  algo/neoscrypt/neoscrypt.c \
@@ -147,6 +148,9 @@ cpuminer_SOURCES = \
  algo/quark/anime-gate.c \
  algo/quark/anime.c \
  algo/quark/anime-4way.c \
  algo/quark/hmq1725-gate.c \
  algo/quark/hmq1725-4way.c \
  algo/quark/hmq1725.c \
  algo/qubit/qubit-gate.c \
  algo/qubit/qubit.c \
  algo/qubit/qubit-2way.c \
@@ -257,7 +261,6 @@ cpuminer_SOURCES = \
  algo/x17/xevan-gate.c \
  algo/x17/xevan.c \
  algo/x17/xevan-4way.c \
  algo/x17/hmq1725.c \
  algo/x17/sonoa-gate.c \
  algo/x17/sonoa-4way.c \
  algo/x17/sonoa.c \
--- a/README.txt
+++ b/README.txt
@@ -29,7 +29,7 @@ cpuminer-sse2.exe      "-msse2"                  Core2, Nehalem
 cpuminer-aes-sse42.exe "-march=westmere"         Westmere
 cpuminer-avx.exe       "-march=corei7-avx"       Sandy-Ivybridge
 cpuminer-avx2.exe      "-march=core-avx2"        Haswell, Sky-Kaby-Coffeelake
-cpuminer-zen           "-march=znver1 -DRYZEN_"  Ryzen
+cpuminer-zen           "-march=znver1"           AMD Ryzen, Threadripper
 If you like this software feel free to donate:
--- a/10
+++ b/10
@@ -38,9 +38,17 @@ supported.
 Change Log
 ----------
 v3.9.4
 Faster AVX2 for lyra2v3, quark, anime.
 Fixed skein AVX2 regression (invalid shares since v3.9.0) and faster.
 Faster skein2 with 4way AVX2 enabled.
 Automatic SHA override on Ryzen CPUs, no need for -DRYZEN compile flag.
 Ongoing restructuring.
 v3.9.3.1
-Skippped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
+Skipped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
 Fixed x16r algo 25% invalid share reject rate. The bug may have also
 affected other algos.
--- a/algo/argon2/argon2d/argon2d/core.c
+++ b/algo/argon2/argon2d/argon2d/core.c
@@ -112,7 +112,7 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
 void free_memory(const argon2_context *context, uint8_t *memory,
                 size_t num, size_t size) {
    size_t memory_size = num*size;
-    clear_internal_memory(memory, memory_size);
+//    clear_internal_memory(memory, memory_size);
    if (context->free_cbk) {
        (context->free_cbk)(memory, memory_size);
    } else {
@@ -137,7 +137,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
 int FLAG_clear_internal_memory = 0;
 void clear_internal_memory(void *v, size_t n) {
  if (FLAG_clear_internal_memory && v) {
-    secure_wipe_memory(v, n);
+//    secure_wipe_memory(v, n);
  }
 }
@@ -559,7 +559,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                       context->pwdlen);
        if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
-            secure_wipe_memory(context->pwd, context->pwdlen);
+//            secure_wipe_memory(context->pwd, context->pwdlen);
            context->pwdlen = 0;
        }
    }
@@ -580,7 +580,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                       context->secretlen);
        if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
-            secure_wipe_memory(context->secret, context->secretlen);
+//            secure_wipe_memory(context->secret, context->secretlen);
            context->secretlen = 0;
        }
    }
--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -537,6 +537,8 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
      }
   }
   sc->ptr = ptr;
   if ( h1 != sc->H )
        memcpy_128( sc->H, h1, 16 );
 }
@@ -571,6 +573,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
   for ( u = 0; u < 16; u ++ )
      buf[u] = h2[u];
   compress_small( buf, (__m128i*)final_s, h1 );
   for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
@@ -1041,22 +1044,22 @@ static const __m256i final_s8[16] =
 void bmw256_8way_init( bmw256_8way_context *ctx )
 {
-   ctx->H[ 0] = _mm256_set1_epi64x( IV256[ 0] );
+   ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] );
-   ctx->H[ 1] = _mm256_set1_epi64x( IV256[ 1] );
+   ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] );
-   ctx->H[ 2] = _mm256_set1_epi64x( IV256[ 2] );
+   ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] );
-   ctx->H[ 3] = _mm256_set1_epi64x( IV256[ 3] );
+   ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] );
-   ctx->H[ 4] = _mm256_set1_epi64x( IV256[ 4] );
+   ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] );
-   ctx->H[ 5] = _mm256_set1_epi64x( IV256[ 5] );
+   ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] );
-   ctx->H[ 6] = _mm256_set1_epi64x( IV256[ 6] );
+   ctx->H[ 6] = _mm256_set1_epi32( IV256[ 6] );
-   ctx->H[ 7] = _mm256_set1_epi64x( IV256[ 7] );
+   ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] );
-   ctx->H[ 8] = _mm256_set1_epi64x( IV256[ 8] );
+   ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] );
-   ctx->H[ 9] = _mm256_set1_epi64x( IV256[ 9] );
+   ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] );
-   ctx->H[10] = _mm256_set1_epi64x( IV256[10] );
+   ctx->H[10] = _mm256_set1_epi32( IV256[10] );
-   ctx->H[11] = _mm256_set1_epi64x( IV256[11] );
+   ctx->H[11] = _mm256_set1_epi32( IV256[11] );
-   ctx->H[12] = _mm256_set1_epi64x( IV256[12] );
+   ctx->H[12] = _mm256_set1_epi32( IV256[12] );
-   ctx->H[13] = _mm256_set1_epi64x( IV256[13] );
+   ctx->H[13] = _mm256_set1_epi32( IV256[13] );
-   ctx->H[14] = _mm256_set1_epi64x( IV256[14] );
+   ctx->H[14] = _mm256_set1_epi32( IV256[14] );
-   ctx->H[15] = _mm256_set1_epi64x( IV256[15] );
+   ctx->H[15] = _mm256_set1_epi32( IV256[15] );
   ctx->ptr       = 0;
   ctx->bit_count = 0;
@@ -1076,14 +1079,15 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
   ptr = ctx->ptr;
   h1 = ctx->H;
   h2 = htmp;
   while ( len > 0 )
   {
      size_t clen;
      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
-      memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
+      memcpy_256( buf + (ptr>>2), vdata, clen >> 2 );
-      vdata = vdata + (clen>>3);
+      vdata = vdata + (clen>>2);
      len -= clen;
      ptr += clen;
      if ( ptr == buf_size )
@@ -1097,6 +1101,7 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
      }
   }
   ctx->ptr = ptr;
   if ( h1 != ctx->H )
        memcpy_256( ctx->H, h1, 16 );
 }
@@ -1106,24 +1111,26 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
   __m256i *buf;
   __m256i h1[16], h2[16], *h;
   size_t ptr, u, v;
 //   unsigned z;
   const int buf_size = 64;  // bytes of one lane, compatible with len
   buf = ctx->buf;
   ptr = ctx->ptr;
-   buf[ ptr>>3 ] = _mm256_set1_epi32( 0x80 );
+   buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
-   ptr += 8;
+   ptr += 4;
   h = ctx->H;
-   if (  ptr > (buf_size - 8) )
+   if (  ptr > (buf_size - 4) )
   {
-      memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+      memset_zero_256( buf + (ptr>>2), (buf_size - ptr) >> 2 );
      compress_small_8way( buf, h, h1 );
      ptr = 0;
      h = h1;
   }
-   memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
+   memset_zero_256( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
-   buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( ctx->bit_count );
+   buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count );
   buf[ (buf_size - 4) >> 2 ] = m256_zero;
   compress_small_8way( buf, h, h2 );
   for ( u = 0; u < 16; u ++ )
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -47,7 +47,9 @@ bool lyra2rev3_thread_init()
   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
   l2v3_wholeMatrix = _mm_malloc( size, 64 );
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
   init_lyra2rev3_8way_ctx();;
 #elif defined (LYRA2REV3_4WAY)
   init_lyra2rev3_4way_ctx();;
 #else
   init_lyra2rev3_ctx();
@@ -57,7 +59,10 @@ bool lyra2rev3_thread_init()
 bool register_lyra2rev3_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_8way;
  gate->hash      = (void*)&lyra2rev3_8way_hash;
 #elif defined (LYRA2REV3_4WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_4way;
  gate->hash      = (void*)&lyra2rev3_4way_hash;
 #else
@@ -203,13 +208,18 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 bool register_phi2_algo( algo_gate_t* gate )
 {
-   init_phi2_ctx();
+//   init_phi2_ctx();
   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
   gate->get_work_data_size = (void*)&phi2_get_work_data_size;
   gate->decode_extra_data  = (void*)&phi2_decode_extra_data;
   gate->build_extraheader  = (void*)&phi2_build_extraheader;
   gate->set_target         = (void*)&alt_set_target; 
   gate->get_max64          = (void*)&get_max64_0xffffLL;
 #if defined(PHI2_4WAY)
   gate->scanhash           = (void*)&scanhash_phi2_4way;
 #else
   init_phi2_ctx();
   gate->scanhash           = (void*)&scanhash_phi2;
 #endif
   return true;
 }
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -5,7 +5,9 @@
 #include <stdint.h>
 #include "lyra2.h"
-//#if defined(__AVX2__)
+#if defined(__AVX2__)
  #define LYRA2REV3_8WAY
 #endif
 #if defined(__SSE2__)
  #define LYRA2REV3_4WAY
@@ -14,8 +16,14 @@
 extern __thread uint64_t* l2v3_wholeMatrix;
 bool register_lyra2rev3_algo( algo_gate_t* gate );
 #if defined(LYRA2REV3_8WAY)
-#if defined(LYRA2REV3_4WAY)
+void lyra2rev3_8way_hash( void *state, const void *input );
 int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev3_8way_ctx();
 #elif defined(LYRA2REV3_4WAY)
 void lyra2rev3_4way_hash( void *state, const void *input );
 int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -142,15 +150,29 @@ bool init_allium_ctx();
 /////////////////////////////////////////
 #if defined(__AVX2__) && defined(__AES__)
 //  #define PHI2_4WAY
 #endif
 bool phi2_has_roots;
 bool register_phi2_algo( algo_gate_t* gate );
 #if defined(PHI2_4WAY)
 void phi2_hash_4way( void *state, const void *input );
 int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 //void init_phi2_ctx();
 #else
 void phi2_hash( void *state, const void *input );
 int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 void init_phi2_ctx();
 #endif
 #endif  // LYRA2_GATE_H__
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -1,12 +1,138 @@
 #include "lyra2-gate.h"
 #include <memory.h>
 #if defined (LYRA2REV3_4WAY)	
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 
 #if defined (LYRA2REV3_8WAY)
 typedef struct {
   blake256_8way_context     blake;
   cubehashParam             cube;
   bmw256_8way_context       bmw;
 } lyra2v3_8way_ctx_holder;
 static lyra2v3_8way_ctx_holder l2v3_8way_ctx;
 bool init_lyra2rev3_8way_ctx()
 {
   blake256_8way_init( &l2v3_8way_ctx.blake );
   cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
   bmw256_8way_init( &l2v3_8way_ctx.bmw );
   return true;
 }
 void lyra2rev3_8way_hash( void *state, const void *input )
 {
   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (32)));
   uint32_t hash2[8] __attribute__ ((aligned (32)));
   uint32_t hash3[8] __attribute__ ((aligned (32)));
   uint32_t hash4[8] __attribute__ ((aligned (32)));
   uint32_t hash5[8] __attribute__ ((aligned (32)));
   uint32_t hash6[8] __attribute__ ((aligned (32)));
   uint32_t hash7[8] __attribute__ ((aligned (32)));
   lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
   blake256_8way( &ctx.blake, input, 80 );
   blake256_8way_close( &ctx.blake, vhash );
   mm256_dintrlv_8x32( hash0, hash1, hash2, hash3,
                       hash4, hash5, hash6, hash7, vhash, 256 );
   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash4, (const byte*) hash4, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash5, (const byte*) hash5, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash6, (const byte*) hash6, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash7, (const byte*) hash7, 32 );
   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
   mm256_intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
                             hash4, hash5, hash6, hash7, 256 );
   bmw256_8way( &ctx.bmw, vhash, 32 );
   bmw256_8way_close( &ctx.bmw, state );
   }
 int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t lane_hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
   mm256_bswap_intrlv80_8x32( vdata, pdata );
   do
   {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
                                                  n+3, n+2, n+1, n ) );
      lyra2rev3_8way_hash( hash, vdata );
      pdata[19] = n;
      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
      {
         mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
              pdata[19] = n + lane;
              submit_solution( work, lane_hash, mythr, lane );
         }
      }
      n += 8;
   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
 #endif
 #if defined (LYRA2REV3_4WAY)  
 typedef struct {
   blake256_4way_context     blake;
   cubehashParam             cube;
--- a/algo/lyra2/phi2-4way.c
+++ b/algo/lyra2/phi2-4way.c
@@ -0,0 +1,233 @@
 /**
 * Phi-2 algo Implementation
 */
 #include "lyra2-gate.h"
 #if defined(PHI2_4WAY)
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/echo/aes_ni/hash_api.h"
 typedef struct {
     cubehashParam           cube;
     jh512_4way_context      jh;
     hashState_echo          echo;
 //     hashState_echo          echo2;
     sph_gost512_context     gost;
     skein512_4way_context   skein;
 } phi2_ctx_holder;
 /*
 phi2_ctx_holder phi2_ctx;
 void init_phi2_ctx()
 {
   cubehashInit( &phi2_ctx.cube, 512, 16, 32 );
   sph_jh512_init(&phi2_ctx.jh);
   init_echo( &phi2_ctx.echo1, 512 );
   init_echo( &phi2_ctx.echo2, 512 );
   sph_gost512_init(&phi2_ctx.gost);
   sph_skein512_init(&phi2_ctx.skein);
 };
 */
 void phi2_hash_4way( void *state, const void *input )
 {
   uint32_t hash[4][16] __attribute__ ((aligned (64)));
   uint32_t hashA[4][16] __attribute__ ((aligned (64)));
   uint32_t hashB[4][16] __attribute__ ((aligned (64)));
   uint32_t vhash[4*16] __attribute__ ((aligned (64)));
 //   unsigned char _ALIGN(128) hash[64];
 //	unsigned char _ALIGN(128) hashA[64];
 //	unsigned char _ALIGN(128) hashB[64];
   phi2_ctx_holder ctx __attribute__ ((aligned (64)));
 //  memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[0], (const byte*)input,
                        phi2_has_roots ? 144 : 80 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[1], (const byte*)input+144,
                        phi2_has_roots ? 144 : 80 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[2], (const byte*)input+288,
                        phi2_has_roots ? 144 : 80 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[3], (const byte*)input+432,
                        phi2_has_roots ? 144 : 80 );
 	LYRA2RE( &hashA[0][0], 32, &hashB[0][0], 32, &hashB[0][0], 32, 1, 8, 8 );
 	LYRA2RE( &hashA[0][8], 32, &hashB[0][8], 32, &hashB[0][8], 32, 1, 8, 8 );
   LYRA2RE( &hashA[1][0], 32, &hashB[1][0], 32, &hashB[1][0], 32, 1, 8, 8 );
   LYRA2RE( &hashA[1][8], 32, &hashB[1][8], 32, &hashB[1][8], 32, 1, 8, 8 );
   LYRA2RE( &hashA[2][0], 32, &hashB[2][0], 32, &hashB[2][0], 32, 1, 8, 8 );
   LYRA2RE( &hashA[2][8], 32, &hashB[2][8], 32, &hashB[2][8], 32, 1, 8, 8 );
   LYRA2RE( &hashA[3][0], 32, &hashB[3][0], 32, &hashB[3][0], 32, 1, 8, 8 );
   LYRA2RE( &hashA[3][8], 32, &hashB[3][8], 32, &hashB[3][8], 32, 1, 8, 8 );
   mm256_intrlv_4x64( vhash, hashA[0], hashA[1], hashA[2], hashA[3], 512 );
   jh512_4way_init( &ctx.jh );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );
   mm256_dintrlv_4x64( hash[0], hash[1], hash[2], hash[3], vhash, 512 );
   if ( hash[0][0] & 1 )
  	{
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[0], 64 );
 	   sph_gost512_close( &ctx.gost, (void*)hash[0] );
 	}
  	else
  	{
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
                          (const BitSequence *)hash[0], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
                          (const BitSequence *)hash[0], 512 );
 	}
   if ( hash[1][0] & 1 )
   {
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[1], 64 );
      sph_gost512_close( &ctx.gost, (void*)hash[1] );
   }
   else
   {
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
                          (const BitSequence *)hash[1], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
                          (const BitSequence *)hash[1], 512 );
   }
   if ( hash[2][0] & 1 )
   {
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[2], 64 );
      sph_gost512_close( &ctx.gost, (void*)hash[2] );
   }
   else
   {
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
                          (const BitSequence *)hash[2], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
                          (const BitSequence *)hash[2], 512 );
   }
   if ( hash[3][0] & 1 )
   {
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[3], 64 );
      sph_gost512_close( &ctx.gost, (void*)hash[3] );
   }
   else
   {
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
                          (const BitSequence *)hash[3], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
                          (const BitSequence *)hash[3], 512 );
   }
   mm256_intrlv_4x64( vhash, hash[0], hash[1], hash[2], hash[3], 512 );
   skein512_4way_init( &ctx.skein );
 	skein512_4way( &ctx.skein, vhash, 64 );
 	skein512_4way_close( &ctx.skein, vhash );
   for (int i=0; i<4; i++)
   {
      ( (uint64_t*)vhash    )[i] ^= ( (uint64_t*)vhash    )[i+4];
      ( (uint64_t*)vhash+ 8 )[i] ^= ( (uint64_t*)vhash+ 8 )[i+4];
      ( (uint64_t*)vhash+16 )[i] ^= ( (uint64_t*)vhash+16 )[i+4];
      ( (uint64_t*)vhash+24 )[i] ^= ( (uint64_t*)vhash+24 )[i+4];
   }
 //   for ( int i = 0; i < 4; i++ )
 //      casti_m256i( vhash, i ) = _mm256_xor_si256( casti_m256i( vhash, i   ),
 //                                                  casti_m256i( vhash, i+4 ) );
 	memcpy( state, vhash, 128 );
 }
 int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce,
 	                     uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(128) hash[8];
   uint32_t _ALIGN(128) edata[36];
   uint32_t vdata[4][36] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[25]);
   uint32_t lane_hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   if(opt_benchmark){
   	ptarget[7] = 0x00ff;
   }
 // Data is not interleaved, but hash is.
 // any non-zero data at index 20 or above sets roots true.
 // Split up the operations, bswap first, then set roots.
   phi2_has_roots = false;
   for ( int i=0; i < 36; i++ )
   {
   be32enc(&edata[i], pdata[i]);
   if (i >= 20 && pdata[i]) phi2_has_roots = true;
   }
 /*
   casti_m256i( vdata[0], 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );   
   casti_m256i( vdata[0], 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m256i( vdata[0], 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) );
   casti_m256i( vdata[0], 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) );
   casti_m128i( vdata[0], 8 ) = mm128_bswap_32( casti_m128i( pdata, 8 ) );
   phi2_has_roots = mm128_anybits1( casti_m128i( vdata[0], 5 ) ) ||
                    mm128_anybits1( casti_m128i( vdata[0], 6 ) ) ||
                    mm128_anybits1( casti_m128i( vdata[0], 7 ) ) ||
                    mm128_anybits1( casti_m128i( vdata[0], 8 ) );
 */   
   memcpy( vdata[0], edata, 144 );
   memcpy( vdata[1], edata, 144 );
   memcpy( vdata[2], edata, 144 );
   memcpy( vdata[3], edata, 144 );
   do {
      be32enc( &vdata[0][19], n );
      be32enc( &vdata[1][19], n+1 );
      be32enc( &vdata[2][19], n+2 );
      be32enc( &vdata[3][19], n+3 );
      phi2_hash_4way( hash, vdata );
      for ( int lane = 0; lane < 4; lane++ ) if (  hash7[ lane<<1 ] < Htarg )
      {
          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          {
              pdata[19] = n + lane;
              submit_solution( work, lane_hash, mythr, lane );
          }
       }
       n += 4;
    } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
    *hashes_done = n - first_nonce + 1;
    return 0;
 }
 #endif  // PHI2_4WAY
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -144,8 +144,8 @@ void init_m7m_ctx()
 #define NM7M 5
 #define SW_DIVS 5
 #define M7_MIDSTATE_LEN 76
-int scanhash_m7m_hash( int thr_id, struct work* work,
+int scanhash_m7m_hash( int thr_id, struct work* work, uint64_t max_nonce,
-                       uint64_t max_nonce, unsigned long *hashes_done )
+                       unsigned long *hashes_done, struct thr_info *mythr )
 {
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
@@ -154,6 +154,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
    uint32_t hash[8] __attribute__((aligned(64)));
    uint8_t bhash[7][64] __attribute__((aligned(64)));
    uint32_t n = pdata[19] - 1;
    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    uint32_t usw_, mpzscale;
    const uint32_t first_nonce = pdata[19];
    char data_str[161], hash_str[65], target_str[65];
--- a/algo/pluck.c
+++ b/algo/pluck.c
@@ -445,7 +445,7 @@ void pluck_hash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const i
 }
 int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce,
-        uint64_t *hashes_done  )
+        uint64_t *hashes_done, struct thr_info *mythr  )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -454,6 +454,8 @@ int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce,
 	const uint32_t first_nonce = pdata[19];
 	volatile uint8_t *restart = &(work_restart[thr_id].restart);
 	uint32_t n = first_nonce;
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0ffff;
--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -48,8 +48,8 @@ void anime_4way_hash( void *state, const void *input )
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
    const uint32_t mask = 8;
    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
    int i;
    anime_4way_ctx_holder ctx;
    memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
@@ -62,27 +62,44 @@ void anime_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
-    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                            (char*)hash0, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                            (char*)hash1, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                            (char*)hash2, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                            (char*)hash3, 512 );
    mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
-    skein512_4way( &ctx.skein, vhash, 64 );
+    if ( hash0[0] & mask )
-    skein512_4way_close( &ctx.skein, vhashB );
+    {
       update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                               (char*)hash0, 512 );
    }
    if ( hash1[0] & mask )
    {
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                               (char*)hash1, 512 );
    }
    if ( hash2[0] & mask )
    {
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                               (char*)hash2, 512 );
    }
    if ( hash3[0] & mask )
    {
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                               (char*)hash3, 512 );
    }
-    for ( i = 0; i < 8; i++ )
+    mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
    if ( mm256_anybits0( vh_mask ) )
    {
       skein512_4way( &ctx.skein, vhash, 64 );
       skein512_4way_close( &ctx.skein, vhashB );
    }
    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
    reinit_groestl( &ctx.groestl );
@@ -91,7 +108,8 @@ void anime_4way_hash( void *state, const void *input )
    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-    mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
    mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
    jh512_4way( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );
@@ -99,16 +117,20 @@ void anime_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
    if ( mm256_anybits1( vh_mask ) )
    {
       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
       blake512_4way_close( &ctx.blake, vhashA );
-
+    }
    if ( mm256_anybits0( vh_mask ) )
    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way( &ctx.bmw, vhash, 64 );
       bmw512_4way_close( &ctx.bmw, vhashB );
    }
-    for ( i = 0; i < 8; i++ )
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    keccak512_4way( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );
@@ -120,33 +142,35 @@ void anime_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
-    keccak512_4way_init( &ctx.keccak );
+    if ( mm256_anybits1( vh_mask ) )
-    keccak512_4way( &ctx.keccak, vhash, 64 );
+    {
-    keccak512_4way_close( &ctx.keccak, vhashA );
+       keccak512_4way_init( &ctx.keccak );
       keccak512_4way( &ctx.keccak, vhash, 64 );
       keccak512_4way_close( &ctx.keccak, vhashA );
    }
    if ( mm256_anybits0( vh_mask ) )
    {
       jh512_4way_init( &ctx.jh );
       jh512_4way( &ctx.jh, vhash, 64 );
       jh512_4way_close( &ctx.jh, vhashB );
    }
-    jh512_4way_init( &ctx.jh );
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
    jh512_4way( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhashB );
-    for ( i = 0; i < 8; i++ )
+    mm256_dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );
       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
 }
 int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__((aligned(64)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
-    uint32_t *nonces = work->nonces;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int num_found = 0;
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    uint32_t *noncep = vdata + 73;   // 9*8 + 1
    const uint32_t Htarg = ptarget[7];
    uint64_t htmax[] = {
                0,
@@ -165,10 +189,7 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
                0
        };
-    swab32_array( endiandata, pdata, 20 );
+    mm256_bswap_intrlv80_4x64( vdata, pdata );
    uint64_t *edata = (uint64_t*)endiandata;
    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
    for (int m=0; m < 6; m++)
       if (Htarg <= htmax[m])
@@ -177,30 +198,26 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
          do
          {
-             be32enc( noncep,   n   );
+             *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-             be32enc( noncep+2, n+1 );
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
             be32enc( noncep+4, n+2 );
             be32enc( noncep+6, n+3 );
             anime_4way_hash( hash, vdata );
             pdata[19] = n;
             for ( int i = 0; i < 4; i++ )
             if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
-                && fulltest( hash+(i<<3), ptarget ) )
+                && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
             {
                pdata[19] = n+i;
-                nonces[ num_found++ ] = n+i;
+                submit_solution( work, hash+(i<<3), mythr, i );
                work_set_target_ratio( work, hash+(i<<3) );
             }
             n += 4;
-          } while ( ( num_found == 0 ) && ( n < max_nonce )
+          } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
              && !work_restart[thr_id].restart );
          break;
       }
    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }
 #endif
--- a/algo/quark/anime-gate.h
+++ b/algo/quark/anime-gate.h
@@ -13,19 +13,15 @@ bool register_anime_algo( algo_gate_t* gate );
 #if defined(ANIME_4WAY)
 void anime_4way_hash( void *state, const void *input );
 int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_anime_4way_ctx();
 #endif
 void anime_hash( void *state, const void *input );
 int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_anime_ctx();
 #endif
--- a/algo/quark/anime.c
+++ b/algo/quark/anime.c
@@ -46,20 +46,6 @@ void init_anime_ctx()
 void anime_hash( void *state, const void *input )
 {
    unsigned char hash[128] __attribute__ ((aligned (32)));
 /*
    uint64_t hash0[8] __attribute__ ((aligned (64)));
    uint64_t hash1[8] __attribute__ ((aligned (64)));
    uint64_t hash2[8] __attribute__ ((aligned (64)));
    uint64_t hash3[8] __attribute__ ((aligned (64)));
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
    __m256i* vh  = (__m256i*)vhash;
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
    __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
 */
    uint32_t mask = 8;
    anime_ctx_holder ctx;
    memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) );
@@ -134,7 +120,7 @@ void anime_hash( void *state, const void *input )
 }
 int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+                         uint64_t *hashes_done, struct thr_info *mythr)
 {
    uint32_t hash[8] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__((aligned(64)));
@@ -142,6 +128,7 @@ int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    const uint32_t Htarg = ptarget[7];
    uint64_t htmax[] = {
                0,
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -0,0 +1,618 @@
 #include "hmq1725-gate.h"
 #if defined(HMQ1725_4WAY)
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/haval-hash-4way.h"
 #include "algo/sha/sha2-hash-4way.h"
 union _hmq1725_4way_context_overlay
 {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
    jh512_4way_context      jh;
    keccak512_4way_context  keccak;
    hashState_luffa         luffa;
    cubehashParam           cube;
    sph_shavite512_context  shavite;
    hashState_sd            simd;
    hashState_echo          echo;
    hamsi512_4way_context   hamsi;
    sph_fugue512_context    fugue;
    shabal512_4way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
    haval256_5_4way_context haval;
 };
 typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
 extern void hmq1725_4way_hash(void *state, const void *input)
 {
 // why so big? only really need 8, haval thing uses 16.
     uint32_t hash0 [32]    __attribute__ ((aligned (64)));
     uint32_t hash1 [32]    __attribute__ ((aligned (64)));
     uint32_t hash2 [32]    __attribute__ ((aligned (64)));
     uint32_t hash3 [32]    __attribute__ ((aligned (64)));
     uint32_t vhash [32<<2] __attribute__ ((aligned (64)));
     uint32_t vhashA[32<<2] __attribute__ ((aligned (64)));
     uint32_t vhashB[32<<2] __attribute__ ((aligned (64)));
     hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
     __m256i vh_mask;     
     const __m256i vmask = _mm256_set1_epi64x( 24 );
     const uint32_t mask = 24;
     __m256i* vh  = (__m256i*)vhash;
     __m256i* vhA = (__m256i*)vhashA;
     __m256i* vhB = (__m256i*)vhashB;
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, input, 80 );
     bmw512_4way_close( &ctx.bmw, vhash );
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash0 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash1 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash2 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );
 // first fork, A is groestl serial, B is skein parallel.
     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                   m256_zero );
 // A
 //     if ( hash0[0] & mask )
 //     {
       init_groestl( &ctx.groestl, 64 );
       update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                               (char*)hash0, 512 );
 //     }
 //     if ( hash1[0] & mask )
 //     {
       init_groestl( &ctx.groestl, 64 );
       update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                               (char*)hash1, 512 );
 //     }
 //     if ( hash2[0] & mask )
 //     {
       init_groestl( &ctx.groestl, 64 );
       update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                               (char*)hash2, 512 );
 //     }
 //     if ( hash3[0] & mask )
 //     {
       init_groestl( &ctx.groestl, 64 );
       update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                               (char*)hash3, 512 );
 //     }
     mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
 // B
 //     if ( mm256_any_clr_256( vh_mask ) )
 //     {
       skein512_4way_init( &ctx.skein );
       skein512_4way( &ctx.skein, vhash, 64 );
       skein512_4way_close( &ctx.skein, vhashB );
 //     }
     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
     jh512_4way_init( &ctx.jh );
     jh512_4way( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );
     keccak512_4way_init( &ctx.keccak );
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
 // second fork, A = blake parallel, B= bmw parallel.
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                   m256_zero );
 //     if ( mm256_any_set_256( vh_mask ) )
 //     {
       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
       blake512_4way_close( &ctx.blake, vhashA );
 //     }
 //     if ( mm256_any_clr_256( vh_mask ) )
 //     {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way( &ctx.bmw, vhash, 64 );
       bmw512_4way_close( &ctx.bmw, vhashB );
 //     }
     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     init_luffa( &ctx.luffa, 512 );
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
                                   (const BitSequence*)hash0, 64 );
     init_luffa( &ctx.luffa, 512 );
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
                                   (const BitSequence*)hash1, 64 );
     init_luffa( &ctx.luffa, 512 );
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
                                   (const BitSequence*)hash2, 64 );
     init_luffa( &ctx.luffa, 512 );
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
                                   (const BitSequence*)hash3, 64 );
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash0,
                                (const BitSequence *)hash0, 64 );
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash1,
                                (const BitSequence *)hash1, 64 );
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash2,
                                (const BitSequence *)hash2, 64 );
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash3,
                                (const BitSequence *)hash3, 64 );
     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 // A= keccak parallel, B= jh parallel
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                  m256_zero );
 //     if ( mm256_any_set_256( vh_mask ) )
 //     {
        keccak512_4way_init( &ctx.keccak );
        keccak512_4way( &ctx.keccak, vhash, 64 );
        keccak512_4way_close( &ctx.keccak, vhashA );
 //     }
 //     if ( mm256_any_clr_256( vh_mask ) )
 //     {
        jh512_4way_init( &ctx.jh );
        jh512_4way( &ctx.jh, vhash, 64 );
        jh512_4way_close( &ctx.jh, vhashB );
 //     }
     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_shavite512_init( &ctx.shavite );
     sph_shavite512 ( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     sph_shavite512_init( &ctx.shavite );
     sph_shavite512 ( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     sph_shavite512_init( &ctx.shavite );
     sph_shavite512 ( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     sph_shavite512_init( &ctx.shavite );
     sph_shavite512 ( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     init_sd( &ctx.simd, 512 );
     update_final_sd( &ctx.simd, (BitSequence *)hash0,
                           (const BitSequence *)hash0, 512 );
     init_sd( &ctx.simd, 512 );
     update_final_sd( &ctx.simd, (BitSequence *)hash1,
                           (const BitSequence *)hash1, 512 );
     init_sd( &ctx.simd, 512 );
     update_final_sd( &ctx.simd, (BitSequence *)hash2,
                           (const BitSequence *)hash2, 512 );
     init_sd( &ctx.simd, 512 );
     update_final_sd( &ctx.simd, (BitSequence *)hash3,
                           (const BitSequence *)hash3, 512 );
 // A is whirlpool serial, B is haval parallel.
     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                   m256_zero );
     // A
 //     if ( hash0[0] & mask )
 //     {
        sph_whirlpool_init( &ctx.whirlpool );
        sph_whirlpool( &ctx.whirlpool, hash0, 64 );
        sph_whirlpool_close( &ctx.whirlpool, hash0 );
 //     }
 //     if ( hash1[0] & mask )
 //     {
        sph_whirlpool_init( &ctx.whirlpool );
        sph_whirlpool( &ctx.whirlpool, hash1, 64 );
        sph_whirlpool_close( &ctx.whirlpool, hash1 );
 //     }
 //     if ( hash2[0] & mask )
 //     {
        sph_whirlpool_init( &ctx.whirlpool );
        sph_whirlpool( &ctx.whirlpool, hash2, 64 );
        sph_whirlpool_close( &ctx.whirlpool, hash2 );
 //     }
 //     if ( hash3[0] & mask )
 //     {
        sph_whirlpool_init( &ctx.whirlpool );
        sph_whirlpool( &ctx.whirlpool, hash3, 64 );
        sph_whirlpool_close( &ctx.whirlpool, hash3 );
 //     }
     mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
 // B
 //     if ( mm256_any_clr_256( vh_mask ) )
 //     {
        haval256_5_4way_init( &ctx.haval );
        haval256_5_4way( &ctx.haval, vhash, 64 );
        haval256_5_4way_close( &ctx.haval, vhashB );
        memset( &vhashB[8<<2], 0, 32<<2);
 //     }
     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                             (const BitSequence *)hash0, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                             (const BitSequence *)hash1, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                             (const BitSequence *)hash2, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                             (const BitSequence *)hash3, 512 );
     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     blake512_4way_init( &ctx.blake );
     blake512_4way( &ctx.blake, vhash, 64 );
     blake512_4way_close( &ctx.blake, vhash );
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 // shavite & luffa, both serial, select individually.
   if ( hash0[0] & mask )
   {
      sph_shavite512_init( &ctx.shavite );
      sph_shavite512( &ctx.shavite, hash0, 64 ); //
      sph_shavite512_close( &ctx.shavite, hash0 ); //8
   }
   else
   {
      init_luffa( &ctx.luffa, 512 );
      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash0,
                                    (const BitSequence *)hash0, 64 );
   }
   if ( hash1[0] & mask )
   {
      sph_shavite512_init( &ctx.shavite );
      sph_shavite512( &ctx.shavite, hash1, 64 ); //
      sph_shavite512_close( &ctx.shavite, hash1 ); //8
   }
   else
   {
      init_luffa( &ctx.luffa, 512 );
      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash1,
                                    (const BitSequence *)hash1, 64 );
   }
   if ( hash2[0] & mask )
   {
      sph_shavite512_init( &ctx.shavite );
      sph_shavite512( &ctx.shavite, hash2, 64 ); //
      sph_shavite512_close( &ctx.shavite, hash2 ); //8
   }
   else
   {
      init_luffa( &ctx.luffa, 512 );
      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash2,
                                    (const BitSequence *)hash2, 64 );
   }
   if ( hash3[0] & mask )
   {
      sph_shavite512_init( &ctx.shavite );
      sph_shavite512( &ctx.shavite, hash3, 64 ); //
      sph_shavite512_close( &ctx.shavite, hash3 ); //8
   }
   else
   {
      init_luffa( &ctx.luffa, 512 );
      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash3,
                                    (const BitSequence *)hash3, 64 );
   }
   mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
   hamsi512_4way_init( &ctx.hamsi );
   hamsi512_4way( &ctx.hamsi, vhash, 64 );
   hamsi512_4way_close( &ctx.hamsi, vhash );
   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
   sph_fugue512_init( &ctx.fugue );
   sph_fugue512( &ctx.fugue, hash0, 64 );
   sph_fugue512_close( &ctx.fugue, hash0 );
   sph_fugue512_init( &ctx.fugue );
   sph_fugue512( &ctx.fugue, hash1, 64 );
   sph_fugue512_close( &ctx.fugue, hash1 );
   sph_fugue512_init( &ctx.fugue );
   sph_fugue512( &ctx.fugue, hash2, 64 );
   sph_fugue512_close( &ctx.fugue, hash2 );
   sph_fugue512_init( &ctx.fugue );
   sph_fugue512( &ctx.fugue, hash3, 64 );
   sph_fugue512_close( &ctx.fugue, hash3 );
 //  A echo, B sd both serial
   if ( hash0[0] & mask ) //4
   {
       init_echo( &ctx.echo, 512 );
       update_final_echo( &ctx.echo, (BitSequence *)hash0,
                               (const BitSequence *)hash0, 512 );
   }
   else
   {
       init_sd( &ctx.simd, 512 );
       update_final_sd( &ctx.simd, (BitSequence *)hash0,
                             (const BitSequence *)hash0, 512 );
   }
   if ( hash1[0] & mask ) //4
   {
       init_echo( &ctx.echo, 512 );
       update_final_echo( &ctx.echo, (BitSequence *)hash1,
                               (const BitSequence *)hash1, 512 );
   }
   else
   {
       init_sd( &ctx.simd, 512 );
       update_final_sd( &ctx.simd, (BitSequence *)hash1,
                             (const BitSequence *)hash1, 512 );
   }
   if ( hash2[0] & mask ) //4
   {
       init_echo( &ctx.echo, 512 );
       update_final_echo( &ctx.echo, (BitSequence *)hash2,
                               (const BitSequence *)hash2, 512 );
   }
   else
   {
       init_sd( &ctx.simd, 512 );
       update_final_sd( &ctx.simd, (BitSequence *)hash2,
                             (const BitSequence *)hash2, 512 );
   }
   if ( hash3[0] & mask ) //4
   {
       init_echo( &ctx.echo, 512 );
       update_final_echo( &ctx.echo, (BitSequence *)hash3,
                               (const BitSequence *)hash3, 512 );
   }
   else
   {
       init_sd( &ctx.simd, 512 );
       update_final_sd( &ctx.simd, (BitSequence *)hash3,
                             (const BitSequence *)hash3, 512 );
   }
   mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
   shabal512_4way_init( &ctx.shabal );
   shabal512_4way( &ctx.shabal, vhash, 64 );
   shabal512_4way_close( &ctx.shabal, vhash );
   mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
   sph_whirlpool_init( &ctx.whirlpool );
   sph_whirlpool( &ctx.whirlpool, hash0, 64 );
   sph_whirlpool_close( &ctx.whirlpool, hash0 );
   sph_whirlpool_init( &ctx.whirlpool );
   sph_whirlpool( &ctx.whirlpool, hash1, 64 );
   sph_whirlpool_close( &ctx.whirlpool, hash1 );
   sph_whirlpool_init( &ctx.whirlpool );
   sph_whirlpool( &ctx.whirlpool, hash2, 64 );
   sph_whirlpool_close( &ctx.whirlpool, hash2 );
   sph_whirlpool_init( &ctx.whirlpool );
   sph_whirlpool( &ctx.whirlpool, hash3, 64 );
   sph_whirlpool_close( &ctx.whirlpool, hash3 );
 // A = fugue serial, B = sha512 prarallel
   mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                 m256_zero );
 //   if ( hash0[0] & mask ) 
 //   {
      sph_fugue512_init( &ctx.fugue );
      sph_fugue512( &ctx.fugue, hash0, 64 );
      sph_fugue512_close( &ctx.fugue, hash0 );
 //   }
 //   if ( hash1[0] & mask ) 
 //   {
      sph_fugue512_init( &ctx.fugue );
      sph_fugue512( &ctx.fugue, hash1, 64 );
      sph_fugue512_close( &ctx.fugue, hash1 );
 //   }
 //   if ( hash2[0] & mask ) 
 //   {
      sph_fugue512_init( &ctx.fugue );
      sph_fugue512( &ctx.fugue, hash2, 64 );
      sph_fugue512_close( &ctx.fugue, hash2 );
 //   }
 //   if ( hash3[0] & mask ) 
 //   {
      sph_fugue512_init( &ctx.fugue );
      sph_fugue512( &ctx.fugue, hash3, 64 );
      sph_fugue512_close( &ctx.fugue, hash3 );
 //   }
   mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
 //   if ( mm256_any_clr_256( vh_mask ) )
 //   {
      sha512_4way_init( &ctx.sha512 );
      sha512_4way( &ctx.sha512, vhash, 64 );
      sha512_4way_close( &ctx.sha512, vhashB );
 //   }
   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
   init_groestl( &ctx.groestl, 64 );
   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
   init_groestl( &ctx.groestl, 64 );
   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
   init_groestl( &ctx.groestl, 64 );
   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
   init_groestl( &ctx.groestl, 64 );
   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
   mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
   sha512_4way_init( &ctx.sha512 ); 
   sha512_4way( &ctx.sha512, vhash, 64 );
   sha512_4way_close( &ctx.sha512, vhash ); 
 // A = haval parallel, B = Whirlpool serial
   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                 m256_zero );
   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 //   if ( mm256_any_set_256( vh_mask ) ) //4
 //   {
      haval256_5_4way_init( &ctx.haval );
      haval256_5_4way( &ctx.haval, vhash, 64 );
      haval256_5_4way_close( &ctx.haval, vhashA );
      memset( &vhashA[8<<2], 0, 32<<2 );
 //   }
 //   if ( !( hash0[0] & mask ) )
 //   {
      sph_whirlpool_init( &ctx.whirlpool );
      sph_whirlpool( &ctx.whirlpool, hash0, 64 );
      sph_whirlpool_close( &ctx.whirlpool, hash0 );
 //   }
 //   if ( !( hash2[0] & mask ) )
 //   {
      sph_whirlpool_init( &ctx.whirlpool );
      sph_whirlpool( &ctx.whirlpool, hash1, 64 );
      sph_whirlpool_close( &ctx.whirlpool, hash1 );
 //   }
 //   if ( !( hash2[0] & mask ) )
 //   {
      sph_whirlpool_init( &ctx.whirlpool );
      sph_whirlpool( &ctx.whirlpool, hash2, 64 );
      sph_whirlpool_close( &ctx.whirlpool, hash2 );
 //   }
 //   if ( !( hash3[0] & mask ) )
 //   {
      sph_whirlpool_init( &ctx.whirlpool );
      sph_whirlpool( &ctx.whirlpool, hash3, 64 );
      sph_whirlpool_close( &ctx.whirlpool, hash3 );
 //   }
   mm256_intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 );
   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
   bmw512_4way_init( &ctx.bmw );
   bmw512_4way( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );
 	memcpy(state, vhash, 32<<2 );
 }
 int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
 //   uint32_t *hash7 = &(hash[7<<2]);
 //   uint32_t lane_hash[8];
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19] - 1;
   const uint32_t first_nonce = pdata[19];
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   const uint32_t Htarg = ptarget[7];
   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000  };
   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                        0xFFFFF000, 0xFFFF0000,          0  };
   mm256_bswap_intrlv80_4x64( vdata, pdata );
   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
      uint32_t mask = masks[ m ];
      do
      {
         *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
         hmq1725_4way_hash( hash, vdata );
         for ( int i = 0; i < 4; i++ )
         if ( ( (hash+(i<<3))[7] & mask ) == 0 )
         {
            if ( fulltest( (hash+(i<<3)), ptarget ) && !opt_benchmark )
            {
               pdata[19] = n + i;
               submit_solution( work, (hash+(i<<3)), mythr, i );
            }
         }
 	      n += 4;
      } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );	
 	   break;
 	}
 	*hashes_done = n - first_nonce + 1;
 	return 0;
 }
 #endif // HMQ1725_4WAY
--- a/algo/quark/hmq1725-gate.c
+++ b/algo/quark/hmq1725-gate.c
@@ -0,0 +1,17 @@
 #include "hmq1725-gate.h"
 bool register_hmq1725_algo( algo_gate_t* gate )
 {
 #if defined(HMQ1725_4WAY)
  gate->scanhash  = (void*)&scanhash_hmq1725_4way;
  gate->hash      = (void*)&hmq1725_4way_hash;
 #else
  init_hmq1725_ctx();
  gate->scanhash  = (void*)&scanhash_hmq1725;
  gate->hash      = (void*)&hmq1725hash;
 #endif
  gate->set_target       = (void*)&scrypt_set_target;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  return true;
 };
--- a/algo/quark/hmq1725-gate.h
+++ b/algo/quark/hmq1725-gate.h
@@ -0,0 +1,28 @@
 #ifndef HMQ1725_GATE_H__
 #define HMQ1725_GATE_H__ 1
 #include "algo-gate-api.h"
 #include <stdint.h>
 #if defined(__AVX2__) && defined(__AES__)
 //  #define HMQ1725_4WAY
 #endif
 bool register_hmq1725_algo( algo_gate_t* gate );
 #if defined(HMQ1725_4WAY)
 void hmq1725_4way_hash( void *state, const void *input );
 int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
 #else
 void hmq1725hash( void *state, const void *input );
 int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
 void init_hmq1725_ctx();
 #endif
 #endif  // HMQ1725_GATE_H__
--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "hmq1725-gate.h"
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/sph_blake.h"
@@ -298,10 +298,11 @@ extern void hmq1725hash(void *state, const void *input)
 	memcpy(state, hashA, 32);
 }
-int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
+int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[32] __attribute__((aligned(64)));
+//        uint32_t endiandata[32] __attribute__((aligned(64)));
        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(64)));
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -311,7 +312,8 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
 	//const uint32_t Htarg = ptarget[7];
 	//we need bigendian data...
-        for (int k = 0; k < 32; k++)
+//        for (int k = 0; k < 32; k++)
        for (int k = 0; k < 20; k++)
                be32enc(&endiandata[k], pdata[k]);
        hmq_bmw512_midstate( endiandata );
@@ -407,14 +409,14 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
-
+/*
 bool register_hmq1725_algo( algo_gate_t* gate )
 {
  init_hmq1725_ctx();
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->set_target       = (void*)&scrypt_set_target;
  gate->scanhash         = (void*)&scanhash_hmq1725;
  gate->hash             = (void*)&hmq1725hash;
  return true;
 };
-
+*/
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -48,9 +48,10 @@ void quark_4way_hash( void *state, const void *input )
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
    __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
    int i;
    quark_4way_ctx_holder ctx;
    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
    const uint32_t mask = 8;
    memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
    blake512_4way( &ctx.blake, input, 80 );
@@ -62,27 +63,44 @@ void quark_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
-       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    if ( hash0[0] & mask )
    {
       update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                               (char*)hash0, 512 );
    }
    if ( hash1[0] & mask )
    {
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                               (char*)hash1, 512 );
    }
    if ( hash2[0] & mask )
    {   
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                               (char*)hash2, 512 );
    }
    if ( hash3[0] & mask )
    {   
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                               (char*)hash3, 512 );
-       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+    }
    mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
    if ( mm256_anybits0( vh_mask ) )   
    {
       skein512_4way( &ctx.skein, vhash, 64 );
       skein512_4way_close( &ctx.skein, vhashB );
    }
-    for ( i = 0; i < 8; i++ )
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
    reinit_groestl( &ctx.groestl );
@@ -91,7 +109,8 @@ void quark_4way_hash( void *state, const void *input )
    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-    mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
    mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
    jh512_4way( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );
@@ -99,16 +118,21 @@ void quark_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
    if ( mm256_anybits1( vh_mask ) )
    {
       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
       blake512_4way_close( &ctx.blake, vhashA );
    }
    if ( mm256_anybits0( vh_mask ) )
    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way( &ctx.bmw, vhash, 64 );
       bmw512_4way_close( &ctx.bmw, vhashB );
    }
-    for ( i = 0; i < 8; i++ )
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    keccak512_4way( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );
@@ -120,63 +144,65 @@ void quark_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
    if ( mm256_anybits1( vh_mask ) )
    {
       keccak512_4way_init( &ctx.keccak );
       keccak512_4way( &ctx.keccak, vhash, 64 );
       keccak512_4way_close( &ctx.keccak, vhashA );
    }
    if ( mm256_anybits0( vh_mask ) )
    {
       jh512_4way_init( &ctx.jh );
       jh512_4way( &ctx.jh, vhash, 64 );
       jh512_4way_close( &ctx.jh, vhashB );
    }
-    for ( i = 0; i < 8; i++ )
+    // Final blend, directly to state, only need 32 bytes.
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    casti_m256i( state, 0 ) = _mm256_blendv_epi8( vhA[0], vhB[0], vh_mask );
-
+    casti_m256i( state, 1 ) = _mm256_blendv_epi8( vhA[1], vhB[1], vh_mask );
-    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
+    casti_m256i( state, 2 ) = _mm256_blendv_epi8( vhA[2], vhB[2], vh_mask );
    casti_m256i( state, 3 ) = _mm256_blendv_epi8( vhA[3], vhB[3], vh_mask );
 }
 int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-    uint32_t endiandata[20] __attribute__((aligned(64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
    uint32_t *hash7 = &(hash[25]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
-    uint32_t *nonces = work->nonces;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int num_found = 0;
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    uint32_t *noncep = vdata + 73;   // 9*8 + 1
    swab32_array( endiandata, pdata, 20 );
    uint64_t *edata = (uint64_t*)endiandata;
    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
    mm256_bswap_intrlv80_4x64( vdata, pdata );
    do
    {
-       be32enc( noncep,   n   );
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-       be32enc( noncep+2, n+1 );
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
       be32enc( noncep+4, n+2 );
       be32enc( noncep+6, n+3 );
       quark_4way_hash( hash, vdata );
       pdata[19] = n;
       for ( int i = 0; i < 4; i++ )
-       if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
+       if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
            && fulltest( hash+(i<<3), ptarget ) )
       {
-          pdata[19] = n+i;
+          mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
-          nonces[ num_found++ ] = n+i;
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark  )
-          work_set_target_ratio( work, hash+(i<<3) );
+          {
            pdata[19] = n+i;
            submit_solution( work, lane_hash, mythr, i );
          }
       }
       n += 4;
-    } while ( ( num_found == 0 ) && ( n < max_nonce )
+    } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
              && !work_restart[thr_id].restart );
    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }
 #endif
--- a/algo/quark/quark-gate.h
+++ b/algo/quark/quark-gate.h
@@ -13,19 +13,15 @@ bool register_quark_algo( algo_gate_t* gate );
 #if defined(QUARK_4WAY)
 void quark_4way_hash( void *state, const void *input );
 int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_quark_4way_ctx();
 #endif
 void quark_hash( void *state, const void *input );
 int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_quark_ctx();
 #endif
--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -173,16 +173,17 @@ void quark_hash(void *state, const void *input)
 }
 int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done)
+                    uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
-        uint32_t hash64[8] __attribute__((aligned(32)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
-        uint32_t *pdata = work->data;
+   uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
-        swab32_array( endiandata, pdata, 20 );
+   swab32_array( endiandata, pdata, 20 );
 	do {
 		pdata[19] = ++n;
--- a/algo/qubit/deep-2way.c
+++ b/algo/qubit/deep-2way.c
@@ -64,7 +64,7 @@ void deep_2way_hash( void *output, const void *input )
 }
 int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
-                         uint64_t *hashes_done )
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -73,17 +73,17 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
     int num_found = 0;
     uint32_t *noncep = vdata + 32+3;   // 4*8 + 3
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };
-     // big endian encode 0..18 uint32_t, 64 bits at a time
+     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
-     swab32_array( endiandata, pdata, 20 );
+     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
     casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
     uint64_t *edata = (uint64_t*)endiandata;
     mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -102,23 +102,24 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
            deep_2way_hash( hash, vdata );
            pdata[19] = n;
-            if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
+            if ( !( hash[7] & mask ) )
            if ( fulltest( hash, ptarget) && !opt_benchmark )
            {
-               nonces[ num_found++ ] = n;
+                pdata[19] = n;
-               work_set_target_ratio( work, hash );
+                submit_solution( work, hash, mythr, 0 );
            }
-            if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
+            if ( !( (hash+8)[7] & mask ) )
            if ( fulltest( hash+8, ptarget) && !opt_benchmark )
            {
-               nonces[ num_found++ ] = n+1;
+               pdata[19] = n+1;
-               work_set_target_ratio( work, hash+8 );
+               submit_solution( work, hash+8, mythr, 1 );
            }
            n += 2;
-         } while ( ( num_found == 0 ) && ( n < max_nonce )
+         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
                   && !work_restart[thr_id].restart );
         break;
     }
     *hashes_done = n - first_nonce + 1;
-     return num_found;
+     return 0;
 }
 #endif
--- a/algo/qubit/deep-gate.h
+++ b/algo/qubit/deep-gate.h
@@ -13,19 +13,15 @@ bool register_deep_algo( algo_gate_t* gate );
 #if defined(DEEP_2WAY)
 void deep_2way_hash( void *state, const void *input );
 int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_deep_2way_ctx();
 #endif
 void deep_hash( void *state, const void *input );
 int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_deep_ctx();
 #endif
--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -72,14 +72,15 @@ void deep_hash(void *output, const void *input)
 }
 int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done)
+                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
-        uint32_t hash64[8] __attribute__((aligned(32)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
-        uint32_t *pdata = work->data;
+   uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];
 	uint64_t htmax[] = { 0, 0xF, 0xFF,  0xFFF, 0xFFFF, 0x10000000 };
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -70,7 +70,7 @@ void qubit_2way_hash( void *output, const void *input )
 }
 int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
-                         uint64_t *hashes_done )
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -79,17 +79,17 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
     int num_found = 0;
     uint32_t *noncep = vdata + 32+3;   // 4*8 + 3
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };
-     // big endian encode 0..18 uint32_t, 64 bits at a time
+     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
-     swab32_array( endiandata, pdata, 20 );
+     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
     casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
     uint64_t *edata = (uint64_t*)endiandata;
     mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -107,25 +107,24 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
            qubit_2way_hash( hash, vdata );
            pdata[19] = n;
-
+            if ( !( hash[7] & mask ) )
-            if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
+            if ( fulltest( hash, ptarget) && !opt_benchmark )
            {
-               nonces[ num_found++ ] = n;
+                pdata[19] = n;
-               work_set_target_ratio( work, hash );
+                submit_solution( work, hash, mythr, 0 );
            }
-            if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
+            if ( !( (hash+8)[7] & mask ) )
            if ( fulltest( hash+8, ptarget) && !opt_benchmark )
            {
               pdata[19] = n+1;
-               nonces[ num_found++ ] = n+1;
+               submit_solution( work, hash+8, mythr, 1 );
               work_set_target_ratio( work, hash+8 );
            }
            n += 2;
-         } while ( ( num_found == 0 ) && ( n < max_nonce )
+         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
                   && !work_restart[thr_id].restart );
         break;
     }
     *hashes_done = n - first_nonce + 1;
-     return num_found;
+     return 0;
 }
 #endif
--- a/algo/qubit/qubit-gate.h
+++ b/algo/qubit/qubit-gate.h
@@ -13,19 +13,15 @@ bool register_qubit_algo( algo_gate_t* gate );
 #if defined(QUBIT_2WAY)
 void qubit_2way_hash( void *state, const void *input );
 int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_qubit_2way_ctx();
 #endif
 void qubit_hash( void *state, const void *input );
 int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_qubit_ctx();
 #endif
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -83,15 +83,16 @@ void qubit_hash(void *output, const void *input)
        memcpy(output, hash, 32);
 }
-int scanhash_qubit(int thr_id, struct work *work,
+int scanhash_qubit( int thr_id, struct work *work,	uint32_t max_nonce,
-		uint32_t max_nonce, uint64_t *hashes_done)
+                    uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
-        uint32_t hash64[8] __attribute__((aligned(64)));
+   uint32_t hash64[8] __attribute__((aligned(64)));
-        uint32_t *pdata = work->data;
+   uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];
 	uint64_t htmax[] = { 0, 0xF, 0xFF,  0xFFF, 0xFFFF, 0x10000000 };
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -40,9 +40,9 @@ void lbry_8way_hash( void* output, const void* input )
   sha256_8way_close( &ctx_sha256, vhashA );
   // reinterleave to do sha512 4-way 64 bit twice.
-   mm256_deinterleave_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
+   mm256_dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
-   mm256_interleave_4x64( vhashA, h0, h1, h2, h3, 256 );
+   mm256_intrlv_4x64( vhashA, h0, h1, h2, h3, 256 );
-   mm256_interleave_4x64( vhashB, h4, h5, h6, h7, 256 );
+   mm256_intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );
   sha512_4way_init( &ctx_sha512 );
   sha512_4way( &ctx_sha512, vhashA, 32 );
@@ -53,9 +53,9 @@ void lbry_8way_hash( void* output, const void* input )
   sha512_4way_close( &ctx_sha512, vhashB );
   // back to 8-way 32 bit
-   mm256_deinterleave_4x64( h0, h1, h2, h3, vhashA, 512 );
+   mm256_dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 );
-   mm256_deinterleave_4x64( h4, h5, h6, h7, vhashB, 512 );
+   mm256_dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 );
-   mm256_interleave_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
+   mm256_intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
   ripemd160_8way_init( &ctx_ripemd );
   ripemd160_8way( &ctx_ripemd, vhashA, 32 );
@@ -72,27 +72,24 @@ void lbry_8way_hash( void* output, const void* input )
   sha256_8way_init( &ctx_sha256 );
   sha256_8way( &ctx_sha256, vhashA, 32 );
-   sha256_8way_close( &ctx_sha256, vhashA );
+   sha256_8way_close( &ctx_sha256, output );
   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
                            output+128, output+160, output+192, output+224,
                            vhashA, 256 );
 }
 int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done)
+                        uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[32*8] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[27];
   const uint32_t first_nonce = pdata[27];
   const uint32_t Htarg = ptarget[7];
   uint32_t edata[32] __attribute__ ((aligned (64)));
-   uint32_t *nonces = work->nonces;
+   __m256i  *noncev = (__m256i*)vdata + 27;   // aligned
-   int num_found = 0;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   uint32_t *noncep = vdata + 216; // 27*8
   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000 };
@@ -100,9 +97,12 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
                        0xFFFFF000, 0xFFFF0000,          0 };
   // we need bigendian data...
-   swab32_array( edata, pdata, 32 );
+   casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
-   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+   casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-                                 edata, edata, edata, edata, 1024 );
+   casti_m256i( edata, 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) );
   casti_m256i( edata, 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) );
   mm256_intrlv_8x32( vdata, edata, edata, edata, edata,
                             edata, edata, edata, edata, 1024 );
   sha256_8way_init( &sha256_8w_mid );
   sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
@@ -111,136 +111,26 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
      uint32_t mask = masks[m];
      do
      {
-         be32enc( noncep,   n   );
+        *noncev = mm256_bswap_32( _mm256_set_epi32(
-         be32enc( noncep+1, n+1 );
+                                          n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
         be32enc( noncep+2, n+2 );
         be32enc( noncep+3, n+3 );
         be32enc( noncep+4, n+4 );
         be32enc( noncep+5, n+5 );
         be32enc( noncep+6, n+6 );
         be32enc( noncep+7, n+7 );
         lbry_8way_hash( hash, vdata );
-         for ( int i = 0; i < 8; i++ )
+         for ( int i = 0; i < 8; i++ )  if ( !( hash7[ i ] & mask ) )
         if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
         {
-            pdata[27] = n+i;
+            // deinterleave hash for lane
-            nonces[ num_found++ ] = n+i;
+            mm256_extract_lane_8x32( lane_hash, hash, i, 256 );
-            work_set_target_ratio( work, hash+(i<<3) );
+            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            {
              pdata[27] = n + i;
              submit_solution( work, lane_hash, mythr, i );
            }
         }
-         n+=8;
+         n += 8;
-      } while ( ( num_found == 0 ) && ( n < max_nonce )
+      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
                   && !work_restart[thr_id].restart );
      break;
   }
-
+   *hashes_done = n - first_nonce + 1;
-   *hashes_done = n - first_nonce;
+   return 0;
   return num_found;
 }
 #elif defined(LBRY_4WAY)
 static __thread sha256_4way_context sha256_mid;
 void lbry_4way_hash( void* output, const void* input )
 {
   sha256_4way_context     ctx_sha256 __attribute__ ((aligned (64)));
   sha512_4way_context     ctx_sha512;
   ripemd160_4way_context  ctx_ripemd;
   uint32_t _ALIGN(64) vhashA[16<<2];
   uint32_t _ALIGN(64) vhashB[16<<2];
   uint32_t _ALIGN(64) vhashC[16<<2];
   memcpy( &ctx_sha256, &sha256_mid, sizeof(ctx_sha256) );
   sha256_4way( &ctx_sha256, input + (LBRY_MIDSTATE<<2), LBRY_TAIL );
   sha256_4way_close( &ctx_sha256, vhashA );
   sha256_4way_init( &ctx_sha256 );
   sha256_4way( &ctx_sha256, vhashA, 32 );
   sha256_4way_close( &ctx_sha256, vhashA );
   // sha512 64 bit data, 64 byte output
   mm256_reinterleave_4x64( vhashB, vhashA, 256 );
   sha512_4way_init( &ctx_sha512 );
   sha512_4way( &ctx_sha512, vhashB, 32 );
   sha512_4way_close( &ctx_sha512, vhashB );
   mm256_reinterleave_4x32( vhashA, vhashB, 512 );
   ripemd160_4way_init( &ctx_ripemd );
   ripemd160_4way( &ctx_ripemd, vhashA, 32 );
   ripemd160_4way_close( &ctx_ripemd, vhashB );
   ripemd160_4way_init( &ctx_ripemd );
   ripemd160_4way( &ctx_ripemd, vhashA+(8<<2), 32 );
   ripemd160_4way_close( &ctx_ripemd, vhashC );
   sha256_4way_init( &ctx_sha256 );
   sha256_4way( &ctx_sha256, vhashB, 20 );
   sha256_4way( &ctx_sha256, vhashC, 20 );
   sha256_4way_close( &ctx_sha256, vhashA );
   sha256_4way_init( &ctx_sha256 );
   sha256_4way( &ctx_sha256, vhashA, 32 );
   sha256_4way_close( &ctx_sha256, vhashA );
   mm128_deinterleave_4x32( output, output+32, output+64, output+96,
 		            vhashA, 256 );
 }
 int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done)
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[32*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[27];
   const uint32_t first_nonce = pdata[27];
   const uint32_t Htarg = ptarget[7];
   uint32_t edata[32] __attribute__ ((aligned (64)));
   uint32_t *nonces = work->nonces;
   int num_found = 0;
   uint32_t *noncep = vdata + 108; // 27*4
   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000 };
   uint32_t masks[] = {	0xFFFFFFFF, 0xFFFFFFF0,	0xFFFFFF00,
                        0xFFFFF000, 0xFFFF0000,          0 };
   // we need bigendian data...
   swab32_array( edata, pdata, 32 );
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
   sha256_4way_init( &sha256_mid );
   sha256_4way( &sha256_mid, vdata, LBRY_MIDSTATE );
   for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] )
   {
      uint32_t mask = masks[m];
      do
      {
         be32enc( noncep,   n   );
         be32enc( noncep+1, n+1 );
         be32enc( noncep+2, n+2 );
         be32enc( noncep+3, n+3 );
         lbry_4way_hash( hash, vdata );
         for ( int i = 0; i < 4; i++ )
         if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
         {
            pdata[27] = n+i;
            nonces[ num_found++ ] = n+i;
            work_set_target_ratio( work, hash+(i<<3) );
         }
         n+=4;
      } while ( ( num_found == 0 ) && ( n < max_nonce )
                   && !work_restart[thr_id].restart );
      break;
   }
   *hashes_done = n - first_nonce;
   return num_found;
 }
 #endif
--- a/algo/ripemd/lbry-gate.h
+++ b/algo/ripemd/lbry-gate.h
@@ -4,12 +4,10 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
-// Overide multi way on ryzen, SHA is better.
+#if !defined(__SHA__)
-#if !defined(RYZEN_)
+ #if defined(__AVX2__)
 // need sha512 2 way AVX x2 or 1 way scalar x4 to support 4way AVX.
 #if defined(__AVX2__)
  #define LBRY_8WAY
-#endif
+ #endif
 #endif
 #define LBRY_NTIME_INDEX 25
@@ -24,17 +22,18 @@ bool register_lbry_algo( algo_gate_t* gate );
 void lbry_8way_hash( void *state, const void *input );
 int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
-
+/*
 #elif defined(LBRY_4WAY)
 void lbry_4way_hash( void *state, const void *input );
 int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done );
 */
 #else
 void lbry_hash( void *state, const void *input );
 int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 #endif
 #endif
--- a/algo/ripemd/lbry.c
+++ b/algo/ripemd/lbry.c
@@ -48,13 +48,14 @@ void lbry_hash(void* output, const void* input)
 }
 int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done)
+                   uint64_t *hashes_done, struct thr_info *mythr)
 {
  uint32_t *pdata = work->data;
  uint32_t *ptarget = work->target;
 	uint32_t n = pdata[27] - 1;
 	const uint32_t first_nonce = pdata[27];
 	const uint32_t Htarg = ptarget[7];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	uint32_t hash64[8] __attribute__((aligned(64)));
 	uint32_t endiandata[32] __attribute__ ((aligned (64)));
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -696,7 +696,7 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
 #endif /* HAVE_SCRYPT_6WAY */
 extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
-                            uint64_t *hashes_done )
+                            uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -704,6 +704,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
 	uint32_t midstate[8];
 	uint32_t n = pdata[19] - 1;
 	const uint32_t Htarg = ptarget[7];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	int throughput = scrypt_best_throughput();
 	int i;
--- a/algo/sha/sha256t-gate.h
+++ b/algo/sha/sha256t-gate.h
@@ -5,13 +5,12 @@
 #include "algo-gate-api.h"
 // Override multi way on ryzen, SHA is better.
-#if !defined(RYZEN_)
+#if !defined(__SHA__)
-#if defined(__SSE2__)
+ #if defined(__AVX2__)
  #define SHA256T_4WAY
 #endif
 #if defined(__AVX2__)
  #define SHA256T_8WAY
-#endif
+#elif defined(__SSE2__)
  #define SHA256T_4WAY
 #endif
 #endif
 bool register_sha256t_algo( algo_gate_t* gate );
--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -6,6 +6,12 @@
 #if defined (__AVX2__)
 union _m256_v16 {
  uint16_t u16[16];
  __m256i v256;
 };
 typedef union _m256_v16 m256_v16;
 // imported from simd_iv.h
 uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -2,7 +2,11 @@
 #include <string.h>
 #include <stdint.h>
 #include "skein-hash-4way.h"
-#include "algo/sha/sha2-hash-4way.h"
+#if defined(__SHA__)
  #include <openssl/sha.h>
 #else
  #include "algo/sha/sha2-hash-4way.h"
 #endif
 #if defined (SKEIN_4WAY)
@@ -11,53 +15,69 @@ void skeinhash_4way( void *state, const void *input )
     uint64_t vhash64[8*4] __attribute__ ((aligned (64)));
     uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
     skein512_4way_context ctx_skein;
 #if defined(__SHA__)
     uint32_t hash0[16] __attribute__ ((aligned (64)));
     uint32_t hash1[16] __attribute__ ((aligned (64)));
     uint32_t hash2[16] __attribute__ ((aligned (64)));
     uint32_t hash3[16] __attribute__ ((aligned (64)));
     SHA256_CTX           ctx_sha256;
 #else
     sha256_4way_context ctx_sha256;
 #endif
     skein512_4way_init( &ctx_skein );
     skein512_4way( &ctx_skein, input, 80 );
     skein512_4way_close( &ctx_skein, vhash64 );
 #if defined(__SHA__)      
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 );
     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
     SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
     SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
     SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
     SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
     mm128_intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
 #else
     mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );
     sha256_4way_init( &ctx_sha256 );
     sha256_4way( &ctx_sha256, vhash32, 64 );
     sha256_4way_close( &ctx_sha256, state );
-
+#endif
     mm128_dintrlv_4x32( state, state+32, state+64, state+96,
 		              vhash32, 256 );
 }
 int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done )
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t hash[8*4] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8];
    uint32_t *hash7 = &(hash[7<<2]);
    uint32_t edata[20] __attribute__ ((aligned (64)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
-    // hash is returned deinterleaved
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    uint32_t *nonces = work->nonces;
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    int num_found = 0;
 // data is 80 bytes, 20 u32 or 4 u64.
    swab32_array( edata, pdata, 20 );
    mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
    uint32_t *noncep = vdata + 73;   // 9*8 + 1
   mm256_bswap_intrlv80_4x64( vdata, pdata );
   do
   {
-       be32enc( noncep,   n   );
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-       be32enc( noncep+2, n+1 );
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
       be32enc( noncep+4, n+2 );
       be32enc( noncep+6, n+3 );
       skeinhash_4way( hash, vdata );
@@ -68,16 +88,14 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
          if ( fulltest( lane_hash, ptarget ) )
          {
             pdata[19] = n + lane;
-             nonces[ num_found++ ] = n + lane;
+             submit_solution( work, lane_hash, mythr, lane );
             work_set_target_ratio( work, lane_hash );
          }
       }
       n += 4;
-    } while ( (num_found == 0) && (n < max_nonce)
+    } while ( (n < max_nonce) && !work_restart[thr_id].restart );
               && !work_restart[thr_id].restart );
    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }
 #endif
--- a/algo/skein/skein-gate.h
+++ b/algo/skein/skein-gate.h
@@ -3,24 +3,21 @@
 #include <stdint.h>
 #include "algo-gate-api.h"
 // Override multi way on ryzen, SHA is better.
 #if !defined(RYZEN_)
 #if defined(__AVX2__)
  #define SKEIN_4WAY
 #endif
 #endif
 #if defined(SKEIN_4WAY)
 void skeinhash_4way( void *output, const void *input );
 int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
 #endif
 void skeinhash( void *output, const void *input );
 int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 #endif
--- a/algo/skein/skein.c
+++ b/algo/skein/skein.c
@@ -21,8 +21,8 @@ void skeinhash(void *state, const void *input)
     memcpy(state, hash, 32);
 }
-int scanhash_skein(int thr_id, struct work *work,
+int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
-	uint32_t max_nonce, uint64_t *hashes_done)
+                    uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -31,6 +31,7 @@ int scanhash_skein(int thr_id, struct work *work,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t n = first_nonce;
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
        swab32_array( endiandata, pdata, 20 );
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -20,55 +20,43 @@ void skein2hash_4way( void *output, const void *input )
 }
 int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done )
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[8*4] __attribute__ ((aligned (64)));
    uint32_t *hash7 = &(hash[25]);
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__ ((aligned (64)));
    uint64_t *edata = (uint64_t*)endiandata;
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
-    uint32_t *nonces = work->nonces;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int num_found = 0;
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    swab32_array( endiandata, pdata, 20 );
    mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
    uint32_t *noncep = vdata + 73;   // 9*8 + 1
    mm256_bswap_intrlv80_4x64( vdata, pdata );
    do 
    {
-       be32enc( noncep,   n   );
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-       be32enc( noncep+2, n+1 );
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
       be32enc( noncep+4, n+2 );
       be32enc( noncep+6, n+3 );
-       skein2hash( hash, vdata );
+       skein2hash_4way( hash, vdata );
       for ( int lane = 0; lane < 4; lane++ )
-       if ( hash7[ lane ] <= Htarg )
+       if ( hash7[ lane<<1 ] <= Htarg )
       {
          // deinterleave hash for lane
          uint32_t lane_hash[8];
          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) )
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          {
             pdata[19] = n + lane;
-             nonces[ num_found++ ] = n + lane;
+             submit_solution( work, lane_hash, mythr, lane );
             work_set_target_ratio( work, lane_hash );
          }
       }
       n += 4;
-    } while ( (num_found == 0) && (n < max_nonce)
+    } while ( (n < max_nonce) && !work_restart[thr_id].restart );
             &&  !work_restart[thr_id].restart );
    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }
 #endif
--- a/algo/skein/skein2-gate.c
+++ b/algo/skein/skein2-gate.c
@@ -10,10 +10,9 @@ int64_t skein2_get_max64 ()
 bool register_skein2_algo( algo_gate_t* gate )
 {
  gate->optimizations = AVX2_OPT;
-#if defined (FOUR_WAY) && defined (__AVX2__)
+#if defined (SKEIN2_4WAY)
  gate->scanhash  = (void*)&scanhash_skein2_4way;
  gate->hash      = (void*)&skein2hash_4way;
  four_way_not_tested();
 #else
  gate->scanhash  = (void*)&scanhash_skein2;
  gate->hash      = (void*)&skein2hash;
--- a/algo/skein/skein2-gate.h
+++ b/algo/skein/skein2-gate.h
@@ -10,11 +10,11 @@
 #if defined(SKEIN2_4WAY)
 void skein2hash_4way( void *output, const void *input );
 int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t* hashes_done );
+                          uint64_t* hashes_done, struct thr_info *mythr );
 #endif
 void skein2hash( void *output, const void *input );
 int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
+                     uint64_t *hashes_done, struct thr_info *mythr );
 #endif
--- a/algo/skein/skein2.c
+++ b/algo/skein/skein2.c
@@ -34,8 +34,8 @@ void skein2hash(void *output, const void *input)
 }
-int scanhash_skein2(int thr_id, struct work *work,
+int scanhash_skein2( int thr_id, struct work *work,	uint32_t max_nonce,
-	uint32_t max_nonce, uint64_t *hashes_done)
+                     uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -44,6 +44,7 @@ int scanhash_skein2(int thr_id, struct work *work,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t n = first_nonce;
    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
        swab32_array( endiandata, pdata, 20 );
--- a/algo/x14/axiom.c
+++ b/algo/x14/axiom.c
@@ -44,12 +44,13 @@ void axiomhash(void *output, const void *input)
 }
 int scanhash_axiom(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
 	uint32_t _ALIGN(64) hash64[8];
 	uint32_t _ALIGN(64) endiandata[20];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/x14/polytimos-4way.c
+++ b/algo/x14/polytimos-4way.c
@@ -13,26 +13,16 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/echo/aes_ni/hash_api.h"
-typedef struct {
+union _poly_4way_context_overlay
 {
   skein512_4way_context   skein;
   shabal512_4way_context  shabal;
   hashState_echo          echo;
   luffa_2way_context      luffa;
   sph_fugue512_context    fugue;
   sph_gost512_context     gost;
-} poly_4way_ctx_holder;
+};
-
+typedef union _poly_4way_context_overlay poly_4way_context_overlay;
 poly_4way_ctx_holder poly_4way_ctx;
 void init_polytimos_4way_ctx()
 {
    skein512_4way_init( &poly_4way_ctx.skein );
    shabal512_4way_init( &poly_4way_ctx.shabal );
    init_echo( &poly_4way_ctx.echo, 512  );
    luffa_2way_init( &poly_4way_ctx.luffa, 512 );
    sph_fugue512_init( &poly_4way_ctx.fugue );
    sph_gost512_init( &poly_4way_ctx.gost );
 }
 void polytimos_4way_hash( void *output, const void *input )
 {
@@ -41,51 +31,57 @@ void polytimos_4way_hash( void *output, const void *input )
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-     poly_4way_ctx_holder ctx __attribute__ ((aligned (64)));
+     poly_4way_context_overlay ctx;
     memcpy( &ctx, &poly_4way_ctx, sizeof(poly_4way_ctx) );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, input, 80 );
     skein512_4way_close( &ctx.skein, vhash );
     // Need to convert from 64 bit interleaved to 32 bit interleaved.
     uint32_t vhash32[16*4];
     mm256_rintrlv_4x64_4x32( vhash32, vhash, 512 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash32, 64 );
     shabal512_4way_close( &ctx.shabal, vhash32 );
     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo ( &ctx.echo, (BitSequence *)hash0,
                         (const BitSequence *)hash0, 512 );
-     memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
-     memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
-     memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
-     memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash1, 64 );
     sph_fugue512_close( &ctx.fugue, hash1 );
-     memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash2, 64 );
     sph_fugue512_close( &ctx.fugue, hash2 );
-     memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
     sph_gost512_init( &ctx.gost );
     sph_gost512( &ctx.gost, hash0, 64 );
     sph_gost512_close( &ctx.gost, hash0 );
     sph_gost512_init( &ctx.gost );
@@ -104,51 +100,43 @@ void polytimos_4way_hash( void *output, const void *input )
     memcpy( output+96, hash3, 32 );
 }
-int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,                              uint64_t *hashes_done )
+int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   int num_found = 0;
   uint32_t *noncep = vdata + 73;   // 9*8 + 1
   const uint32_t Htarg = ptarget[7];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   if ( opt_benchmark )
      ptarget[7] = 0x0cff;
-   for ( int i=0; i < 19; i++ )
+   mm256_bswap_intrlv80_4x64( vdata, pdata );
      be32enc( &endiandata[i], pdata[i] );
   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
   do {
-      be32enc( noncep,   n   );
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-      be32enc( noncep+2, n+1 );
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
      be32enc( noncep+4, n+2 );
      be32enc( noncep+6, n+3 );
      polytimos_4way_hash(hash, vdata);
      pdata[19] = n;
-      for ( int i = 0; i < 4; i++ )
+      for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
         pdata[19] = n+i;
-         nonces[ num_found++ ] = n+i;
+         submit_solution( work, hash+(i<<3), mythr, i );
         work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
-   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart));
+   } while ( ( n < max_nonce-4 ) && !(*restart));
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }
 #endif
--- a/algo/x14/polytimos-gate.c
+++ b/algo/x14/polytimos-gate.c
@@ -4,7 +4,6 @@ bool register_polytimos_algo( algo_gate_t* gate )
 {
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
 #ifdef POLYTIMOS_4WAY
  init_polytimos_4way_ctx();
  gate->scanhash  = (void*)&scanhash_polytimos_4way;
  gate->hash      = (void*)&polytimos_4way_hash;
 #else
--- a/algo/x14/polytimos-gate.h
+++ b/algo/x14/polytimos-gate.h
@@ -13,19 +13,14 @@ bool register_polytimos_algo( algo_gate_t* gate );
 #if defined(POLYTIMOS_4WAY)
 void polytimos_4way_hash( void *state, const void *input );
 int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_polytimos_4way_ctx();
 #endif
 void polytimos_hash( void *state, const void *input );
 int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
+                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_polytimos_ctx();
 #endif
--- a/algo/x14/polytimos.c
+++ b/algo/x14/polytimos.c
@@ -76,13 +76,14 @@ void polytimos_hash(void *output, const void *input)
 	memcpy(output, hashA, 32);
 }
-int scanhash_polytimos(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(128) hash[8];
 	uint32_t _ALIGN(128) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
-
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
--- a/algo/x14/veltor-4way.c
+++ b/algo/x14/veltor-4way.c
@@ -40,7 +40,7 @@ void veltor_4way_hash( void *output, const void *input )
     skein512_4way( &ctx.skein, input, 80 );
     skein512_4way_close( &ctx.skein, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
@@ -54,10 +54,10 @@ void veltor_4way_hash( void *output, const void *input )
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_gost512( &ctx.gost, hash0, 64 );
     sph_gost512_close( &ctx.gost, hash0 );
@@ -78,7 +78,7 @@ void veltor_4way_hash( void *output, const void *input )
 }
 int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done )
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -91,6 +91,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t *nonces = work->nonces;
     int num_found = 0;
     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     volatile uint8_t *restart = &(work_restart[thr_id].restart);
     if ( opt_benchmark )
@@ -101,7 +102,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
     }
     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
     do
     {
         be32enc( noncep,   n   );
--- a/algo/x14/veltor-gate.h
+++ b/algo/x14/veltor-gate.h
@@ -15,7 +15,7 @@ bool register_veltor_algo( algo_gate_t* gate );
 void veltor_4way_hash( void *state, const void *input );
 int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_veltor_4way_ctx();
@@ -24,7 +24,7 @@ void init_veltor_4way_ctx();
 void veltor_hash( void *state, const void *input );
 int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
+                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_veltor_ctx();
--- a/algo/x14/veltor.c
+++ b/algo/x14/veltor.c
@@ -61,12 +61,14 @@ void veltor_hash(void *output, const void *input)
 	memcpy(output, hashB, 32);
 }
-int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(128) hash[8];
 	uint32_t _ALIGN(128) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -78,7 +78,7 @@ void x14_4way_hash( void *state, const void *input )
     bmw512_4way_close( &ctx.bmw, vhash );
     // Serial
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 3 Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -90,7 +90,7 @@ void x14_4way_hash( void *state, const void *input )
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     // Parallel 4way
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     // 4 Skein
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -105,16 +105,16 @@ void x14_4way_hash( void *state, const void *input )
     keccak512_4way_close( &ctx.keccak, vhash );
     // Serial
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 7 Luffa
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -142,13 +142,13 @@ void x14_4way_hash( void *state, const void *input )
     sph_shavite512_close( &ctx.shavite, hash3 );
     // 10 Simd
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
     // 11 Echo
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -164,10 +164,10 @@ void x14_4way_hash( void *state, const void *input )
                       (const BitSequence *) hash3, 512 );
     // 12 Hamsi parallel 4way 32 bit
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 13 Fugue serial
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -183,14 +183,14 @@ void x14_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );
     // 14 Shabal, parallel 32 bit
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, state );
 }
 int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done )
+                       uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -203,6 +203,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
     int num_found = 0;
     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
@@ -212,7 +213,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
     swab32_array( endiandata, pdata, 20 );
     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
--- a/algo/x14/x14-gate.h
+++ b/algo/x14/x14-gate.h
@@ -13,19 +13,15 @@ bool register_x14_algo( algo_gate_t* gate );
 #if defined(X14_4WAY)
 void x14_4way_hash( void *state, const void *input );
 int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x14_4way_ctx();
 #endif
 void x14hash( void *state, const void *input );
 int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
+                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x14_ctx();
 #endif
--- a/algo/x14/x14.c
+++ b/algo/x14/x14.c
@@ -180,8 +180,8 @@ void x14hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
-int scanhash_x14(int thr_id, struct work *work,
+int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
-				uint32_t max_nonce, uint64_t *hashes_done)
+                  uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(64)));
@@ -190,6 +190,7 @@ int scanhash_x14(int thr_id, struct work *work,
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	uint64_t htmax[] = {
 		0,
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -81,7 +81,7 @@ void x15_4way_hash( void *state, const void *input )
     bmw512_4way_close( &ctx.bmw, vhash );
     // Serial
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 3 Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -93,7 +93,7 @@ void x15_4way_hash( void *state, const void *input )
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     // Parallel 4way
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     // 4 Skein
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -108,16 +108,16 @@ void x15_4way_hash( void *state, const void *input )
     keccak512_4way_close( &ctx.keccak, vhash );
     // Serial to the end
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 7 Luffa
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -145,13 +145,13 @@ void x15_4way_hash( void *state, const void *input )
     sph_shavite512_close( &ctx.shavite, hash3 );
     // 10 Simd
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
     // 11 Echo
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -167,10 +167,10 @@ void x15_4way_hash( void *state, const void *input )
                       (const BitSequence *) hash3, 512 );
     // 12 Hamsi parallel 4way 32 bit
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 13 Fugue
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -186,10 +186,10 @@ void x15_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );
     // 14 Shabal, parallel 32 bit
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
     // 15 Whirlpool
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -214,7 +214,7 @@ void x15_4way_hash( void *state, const void *input )
 }
 int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done )
+                       uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -227,6 +227,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
     int num_found = 0;
     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
@@ -236,7 +237,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
     swab32_array( endiandata, pdata, 20 );
     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
--- a/algo/x15/x15-gate.h
+++ b/algo/x15/x15-gate.h
@@ -13,19 +13,15 @@ bool register_x15_algo( algo_gate_t* gate );
 #if defined(X15_4WAY)
 void x15_4way_hash( void *state, const void *input );
 int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x15_4way_ctx();
 #endif
 void x15hash( void *state, const void *input );
 int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
+                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x15_ctx();
 #endif
--- a/algo/x15/x15.c
+++ b/algo/x15/x15.c
@@ -186,8 +186,8 @@ void x15hash(void *output, const void *input)
 	memcpy(output, hashB, 32);
 }
-int scanhash_x15(int thr_id, struct work *work,
+int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint32_t max_nonce, uint64_t *hashes_done)
+                  uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(64)));
@@ -196,6 +196,7 @@ int scanhash_x15(int thr_id, struct work *work,
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	uint64_t htmax[] = {
 		0,
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -32,8 +32,8 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-
+union _x16r_4way_context_overlay
-typedef struct {
+{
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_echo          echo;
@@ -50,16 +50,8 @@ typedef struct {
    shabal512_4way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
 } x16r_4way_ctx_holder;
 x16r_4way_ctx_holder x16r_4way_ctx __attribute__ ((aligned (64)));
 // Cube needs one full init so fast reinits can be done in the hash loop.
 void init_x16r_4way_ctx()
 {
   cubehashInit( &x16r_4way_ctx.cube, 512, 16, 32 );
 };
-
+typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
 void x16r_4way_hash( void* output, const void* input )
 {
@@ -68,14 +60,14 @@ void x16r_4way_hash( void* output, const void* input )
   uint32_t hash2[24] __attribute__ ((aligned (64)));
   uint32_t hash3[24] __attribute__ ((aligned (64)));
   uint32_t vhash[24*4] __attribute__ ((aligned (64)));
-   x16r_4way_ctx_holder ctx;
+   x16r_4way_context_overlay ctx;
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
   void *in2 = (void*) hash2;
   void *in3 = (void*) hash3;
   int size = 80;
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 );
+   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
   if ( s_ntime == UINT32_MAX )
   {
@@ -104,11 +96,11 @@ void x16r_4way_hash( void* output, const void* input )
               blake512_4way( &ctx.blake, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               blake512_4way( &ctx.blake, vhash, size );
            }
            blake512_4way_close( &ctx.blake, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case BMW:
            bmw512_4way_init( &ctx.bmw );
@@ -116,11 +108,11 @@ void x16r_4way_hash( void* output, const void* input )
               bmw512_4way( &ctx.bmw, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               bmw512_4way( &ctx.bmw, vhash, size );
            }
            bmw512_4way_close( &ctx.bmw, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case GROESTL:
               init_groestl( &ctx.groestl, 64 );
@@ -142,11 +134,11 @@ void x16r_4way_hash( void* output, const void* input )
               skein512_4way( &ctx.skein, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               skein512_4way( &ctx.skein, vhash, size );
            }
            skein512_4way_close( &ctx.skein, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case JH:
            jh512_4way_init( &ctx.jh );
@@ -154,11 +146,11 @@ void x16r_4way_hash( void* output, const void* input )
               jh512_4way( &ctx.jh, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               jh512_4way( &ctx.jh, vhash, size );
            }
            jh512_4way_close( &ctx.jh, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case KECCAK:
            keccak512_4way_init( &ctx.keccak );
@@ -166,21 +158,21 @@ void x16r_4way_hash( void* output, const void* input )
               keccak512_4way( &ctx.keccak, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               keccak512_4way( &ctx.keccak, vhash, size );
            }
            keccak512_4way_close( &ctx.keccak, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case LUFFA:
-            mm256_interleave_2x128( vhash, in0, in1, size<<3 );
+            mm256_intrlv_2x128( vhash, in0, in1, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
-            mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+            mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-            mm256_interleave_2x128( vhash, in2, in3, size<<3 );
+            mm256_intrlv_2x128( vhash, in2, in3, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
-            mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+            mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
         break;
         case CUBEHASH:
            cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -194,7 +186,7 @@ void x16r_4way_hash( void* output, const void* input )
                                  (const byte*)in2, size );
            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
-                                        (const byte*)in3, size );
+                                  (const byte*)in3, size );
         break;
         case SHAVITE:
            sph_shavite512_init( &ctx.shavite );
@@ -211,14 +203,14 @@ void x16r_4way_hash( void* output, const void* input )
            sph_shavite512_close( &ctx.shavite, hash3 );
         break;
         case SIMD:
-            mm256_interleave_2x128( vhash, in0, in1, size<<3 );
+            mm256_intrlv_2x128( vhash, in0, in1, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+            mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-            mm256_interleave_2x128( vhash, in2, in3, size<<3 );
+            mm256_intrlv_2x128( vhash, in2, in3, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+            mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
         break;
         case ECHO:
             init_echo( &ctx.echo, 512 );
@@ -235,11 +227,11 @@ void x16r_4way_hash( void* output, const void* input )
                                (const BitSequence*)in3, size<<3 );
         break;
         case HAMSI:
-             mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
             hamsi512_4way_init( &ctx.hamsi );
             hamsi512_4way( &ctx.hamsi, vhash, size );
             hamsi512_4way_close( &ctx.hamsi, vhash );
-             mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
@@ -256,11 +248,11 @@ void x16r_4way_hash( void* output, const void* input )
             sph_fugue512_close( &ctx.fugue, hash3 );
         break;
         case SHABAL:
-             mm128_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 );
+             mm128_intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
             shabal512_4way_init( &ctx.shabal );
             shabal512_4way( &ctx.shabal, vhash, size );
             shabal512_4way_close( &ctx.shabal, vhash );
-             mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+             mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
@@ -277,11 +269,11 @@ void x16r_4way_hash( void* output, const void* input )
             sph_whirlpool_close( &ctx.whirlpool, hash3 );
         break;
         case SHA_512:
-             mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
             sha512_4way_init( &ctx.sha512 );
             sha512_4way( &ctx.sha512, vhash, size );
             sha512_4way_close( &ctx.sha512, vhash );
-             mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
      }
      size = 64;
@@ -304,15 +296,13 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
-   uint32_t *nonces = work->nonces;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   int num_found = 0;
   uint32_t *noncep = vdata + 73;   // 9*8 + 1
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-   for ( int k=0; k < 19; k++ )
+   casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
-      be32enc( &endiandata[k], pdata[k] );
+   casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
 //   if ( s_ntime != pdata[17] )
   if ( s_ntime != endiandata[17] )
   {
      uint32_t ntime = swab32(pdata[17]);
@@ -326,30 +316,27 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
      ptarget[7] = 0x0cff;
   uint64_t *edata = (uint64_t*)endiandata;
-   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
   do
   {
-      be32enc( noncep,   n   );
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-      be32enc( noncep+2, n+1 );
+               _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
      be32enc( noncep+4, n+2 );
      be32enc( noncep+6, n+3 );
      x16r_4way_hash( hash, vdata );
      pdata[19] = n;
-      for ( int i = 0; i < 4; i++ )
+      for ( int i = 0; i < 4; i++ )  if ( (hash+(i<<3))[7] <= Htarg )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
         pdata[19] = n+i;
-         nonces[ num_found++ ] = n+i;
+         submit_solution( work, hash+(i<<3), mythr, i );
         work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
-   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
+   } while ( (  n < max_nonce ) && !(*restart) );
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }
 #endif
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -35,11 +35,9 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
 bool register_x16r_algo( algo_gate_t* gate )
 {
 #if defined (X16R_4WAY)
 //  init_x16r_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  init_x16r_ctx();
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
@@ -52,11 +50,9 @@ bool register_x16r_algo( algo_gate_t* gate )
 bool register_x16s_algo( algo_gate_t* gate )
 {
 #if defined (X16R_4WAY)
 //  init_x16r_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  init_x16r_ctx();
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -39,20 +39,14 @@ bool register_x16s_algo( algo_gate_t* gate );
 #if defined(X16R_4WAY)
 void x16r_4way_hash( void *state, const void *input );
 int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 void init_x16r_4way_ctx();
 #endif
 void x16r_hash( void *state, const void *input );
 int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );
 void init_x16r_ctx();
 #endif
--- a/algo/x16/x16r.c
+++ b/algo/x16/x16r.c
@@ -33,7 +33,8 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-typedef struct {
+union _x16r_context_overlay
 {
 #if defined(__AES__)
        hashState_echo          echo;
        hashState_groestl       groestl;
@@ -55,19 +56,13 @@ typedef struct {
        sph_shabal512_context   shabal;
        sph_whirlpool_context   whirlpool;
        SHA512_CTX              sha512;
 } x16r_ctx_holder;
 x16r_ctx_holder x16r_ctx __attribute__ ((aligned (64)));
 void init_x16r_ctx()
 {
   cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
 };
 typedef union _x16r_context_overlay x16r_context_overlay;
 void x16r_hash( void* output, const void* input )
 {
   uint32_t _ALIGN(128) hash[16];
-   x16r_ctx_holder ctx;
+   x16r_context_overlay ctx;
   void *in = (void*) input;
   int size = 80;
@@ -126,7 +121,7 @@ void x16r_hash( void* output, const void* input )
                                    (const BitSequence*)in, size );
         break;
         case CUBEHASH:
-            memcpy( &ctx.cube, &x16r_ctx.cube, sizeof(cubehashParam) );
+            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*) hash,
                                  (const byte*)in, size );
         break;
@@ -196,13 +191,12 @@ int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-   for ( int k=0; k < 19; k++ )
+   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-      be32enc( &endiandata[k], pdata[k] );
+   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
 // This code is suspicious. s_ntime is saved after byteswapping pdata[17]
 // but is tested vs unswapped pdata[17]. This should result in calling
 // getAlgoString every pass, but that doesn't seem to be the case.
 // It appears to be working correctly as is.
   if ( s_ntime != pdata[17] )
   {
      uint32_t ntime = swab32(pdata[17]);
--- a/algo/x17/x17-gate.c
+++ b/algo/x17/x17-gate.c
@@ -3,13 +3,9 @@
 bool register_x17_algo( algo_gate_t* gate )
 {
 #if defined (X17_4WAY)
 printf("register x17 4way\n");
 //  init_x17_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x17_4way;
  gate->hash      = (void*)&x17_4way_hash;
 #else
 printf("register x17 no 4way\n");
  init_x17_ctx();
  gate->scanhash  = (void*)&scanhash_x17;
  gate->hash      = (void*)&x17_hash;
 #endif
--- a/algo/x17/x17-gate.h
+++ b/algo/x17/x17-gate.h
@@ -13,20 +13,14 @@ bool register_x17_algo( algo_gate_t* gate );
 #if defined(X17_4WAY)
 void x17_4way_hash( void *state, const void *input );
 int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 //void init_x17_4way_ctx();
 #endif
 void x17_hash( void *state, const void *input );
 int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x17_ctx();
 #endif
--- a/algo/x17/x17.c
+++ b/algo/x17/x17.c
@@ -34,7 +34,8 @@
  #include "algo/echo/sph_echo.h"
 #endif
-typedef struct {
+union _x17_context_overlay
 {
 #if defined(__AES__)
        hashState_groestl       groestl;
        hashState_echo          echo;
@@ -43,7 +44,7 @@ typedef struct {
        sph_echo512_context     echo;
 #endif
        hashState_luffa         luffa;
-        cubehashParam           cubehash;
+        cubehashParam           cube;
        sph_shavite512_context  shavite;
        hashState_sd            simd;
        sph_hamsi512_context    hamsi;
@@ -52,38 +53,14 @@ typedef struct {
        sph_whirlpool_context   whirlpool;
        SHA512_CTX              sha512;
        sph_haval256_5_context  haval;
 } x17_ctx_holder;
 x17_ctx_holder x17_ctx __attribute__ ((aligned (64)));
 void init_x17_ctx()
 {
 #if defined(__AES__)
        init_groestl( &x17_ctx.groestl, 64 );
        init_echo( &x17_ctx.echo, 512 );
 #else
        sph_groestl512_init(&x17_ctx.groestl );
        sph_echo512_init(&x17_ctx.echo);
 #endif
        init_luffa( &x17_ctx.luffa, 512 );
        cubehashInit( &x17_ctx.cubehash, 512, 16, 32 );
        sph_shavite512_init( &x17_ctx.shavite );
        init_sd( &x17_ctx.simd, 512 );
        sph_hamsi512_init( &x17_ctx.hamsi );
        sph_fugue512_init( &x17_ctx.fugue );
        sph_shabal512_init( &x17_ctx.shabal );
        sph_whirlpool_init( &x17_ctx.whirlpool );
        SHA512_Init( &x17_ctx.sha512 );
        sph_haval256_5_init(&x17_ctx.haval);
 };
 typedef union _x17_context_overlay x17_context_overlay;
 void x17_hash(void *output, const void *input)
 {
 	unsigned char hash[128] __attribute__ ((aligned (64)));
 	#define hashB hash+64
-
+   x17_context_overlay ctx;
        x17_ctx_holder ctx __attribute__ ((aligned (64)));
        memcpy( &ctx, &x17_ctx, sizeof(x17_ctx) );
        unsigned char hashbuf[128];
        size_t hashptr;
@@ -115,9 +92,11 @@ void x17_hash(void *output, const void *input)
        //---groestl----
 #if defined(__AES__)
        init_groestl( &ctx.groestl, 64 );
        update_and_final_groestl( &ctx.groestl, (char*)hash,
                                  (const char*)hash, 512 );
 #else
        sph_groestl512_init( &ctx.groestl );
        sph_groestl512( &ctx.groestl, hash, 64 );
        sph_groestl512_close( &ctx.groestl, hash );
 #endif
@@ -142,50 +121,62 @@ void x17_hash(void *output, const void *input)
        KEC_C;
        //--- luffa7
        init_luffa( &ctx.luffa, 512 );
        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
                                (const BitSequence*)hash, 64 );
        // 8 Cube
-        cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
+        cubehashInit( &ctx.cube, 512, 16, 32 );
        cubehashUpdateDigest( &ctx.cube, (byte*) hash,
                              (const byte*)hash, 64 );
        // 9 Shavite
        sph_shavite512_init( &ctx.shavite );
        sph_shavite512( &ctx.shavite, hash, 64);
        sph_shavite512_close( &ctx.shavite, hash);
        // 10 Simd
        init_sd( &ctx.simd, 512 );
        update_final_sd( &ctx.simd, (BitSequence*)hash,
                         (const BitSequence*)hash, 512 );
        //11---echo---
 #if defined(__AES__)
        init_echo( &ctx.echo, 512 );
        update_final_echo ( &ctx.echo, (BitSequence*)hash,
                            (const BitSequence*)hash, 512 );
 #else
-	sph_echo512( &ctx.echo, hash, 64 );
+        sph_echo512_init( &ctx.echo );
        sph_echo512( &ctx.echo, hash, 64 );
        sph_echo512_close( &ctx.echo, hash );
 #endif
        // X13 algos
        // 12 Hamsi
        sph_hamsi512_init( &ctx.hamsi );
        sph_hamsi512( &ctx.hamsi, hash, 64 );
        sph_hamsi512_close( &ctx.hamsi, hash );
        // 13 Fugue
        sph_fugue512_init( &ctx.fugue );
        sph_fugue512(&ctx.fugue, hash, 64 );
        sph_fugue512_close(&ctx.fugue, hash );
        // X14 Shabal
        sph_shabal512_init( &ctx.shabal );
        sph_shabal512(&ctx.shabal, hash, 64);
        sph_shabal512_close( &ctx.shabal, hash );
        // X15 Whirlpool
-	sph_whirlpool( &ctx.whirlpool, hash, 64 );
+        sph_whirlpool_init( &ctx.whirlpool );
-	sph_whirlpool_close( &ctx.whirlpool, hash );
+        sph_whirlpool( &ctx.whirlpool, hash, 64 );
        sph_whirlpool_close( &ctx.whirlpool, hash );
-	SHA512_Update( &ctx.sha512, hash, 64 );
+        SHA512_Init( &ctx.sha512 );
        SHA512_Update( &ctx.sha512, hash, 64 );
        SHA512_Final( (unsigned char*)hash, &ctx.sha512 );
        sph_haval256_5_init(&ctx.haval);
        sph_haval256_5( &ctx.haval, (const void*)hash, 64 );
        sph_haval256_5_close( &ctx.haval, output );
 }
@@ -234,42 +225,42 @@ int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
 #endif
   for ( int m = 0; m < 6; m++ )
   {
-	if ( Htarg <= htmax[m] )
+      if ( Htarg <= htmax[m] )
 	{
 	   uint32_t mask = masks[m];
 	   do
 	   {
-		pdata[19] = ++n;
+	      uint32_t mask = masks[m];
-		be32enc( &endiandata[19], n );
+	      do
-		x17_hash( hash64, endiandata );
+	      {
 	         pdata[19] = ++n;
 		      be32enc( &endiandata[19], n );
 		      x17_hash( hash64, endiandata );
 #ifndef DEBUG_ALGO
-		if ( !( hash64[7] & mask ) )
+		      if ( !( hash64[7] & mask ) )
-                {
+            {
-                   if ( fulltest( hash64, ptarget ) )
+               if ( fulltest( hash64, ptarget ) )
-		   {
+		         {
-		      *hashes_done = n - first_nonce + 1;
+		            *hashes_done = n - first_nonce + 1;
-		      return true;
+		            return true;
-                   }
+               }
-//                 else
+//             else
-//                      applog(LOG_INFO, "Result does not validate on CPU!");
+//               applog(LOG_INFO, "Result does not validate on CPU!");
-                }
+            }
 #else
-		if ( !( n % 0x1000 ) && !thr_id ) printf(".");
+		      if ( !( n % 0x1000 ) && !thr_id ) printf(".");
-		if ( !( hash64[7] & mask ) )
+		      if ( !( hash64[7] & mask ) )
 	       	{
-		   printf("[%d]",thr_id);
+		         printf("[%d]",thr_id);
-		   if ( fulltest( hash64, ptarget ) )
+		         if ( fulltest( hash64, ptarget ) )
-		   {
+		         {
-                       work_set_target_ratio( work, hash64 );
+                 work_set_target_ratio( work, hash64 );
-		       *hashes_done = n - first_nonce + 1;
+		           *hashes_done = n - first_nonce + 1;
-		       return true;
+		           return true;
-		   }
+		         }
-		}
+		      }
 #endif
-	    } while (n < max_nonce && !work_restart[thr_id].restart);
+	      } while (n < max_nonce && !work_restart[thr_id].restart);
 			// see blake.c if else to understand the loop on htmax => mask
-	    break;
+	      break;
-	}
+	   }
   }
   *hashes_done = n - first_nonce + 1;
   pdata[19] = n;
--- a/algo/yespower/yespower-platform.c
+++ b/algo/yespower/yespower-platform.c
@@ -41,7 +41,7 @@ static void *alloc_region(yespower_region_t *region, size_t size)
 #endif
 	    MAP_ANON | MAP_PRIVATE;
 #if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE)
-	size_t new_size = size;
+   size_t new_size = size;
 	const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1;
 	if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) {
 		flags |= MAP_HUGETLB;
@@ -55,6 +55,7 @@ static void *alloc_region(yespower_region_t *region, size_t size)
 	base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0);
 	if (base != MAP_FAILED) {
 		base_size = new_size;
 	} else if (flags & MAP_HUGETLB) {
 		flags &= ~MAP_HUGETLB;
 		base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -7,16 +7,6 @@
 make distclean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
 CFLAGS="-O3 -march=core-avx2 -msha -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-avx2-sha.exe
 strip -s cpuminer
 mv cpuminer cpuminer-avx2-sha
 make clean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
 CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
@@ -26,7 +16,6 @@ mv cpuminer cpuminer-avx512
 make clean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
 CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
@@ -36,7 +25,6 @@ mv cpuminer cpuminer-avx2
 make clean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
 CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
@@ -46,7 +34,6 @@ mv cpuminer cpuminer-aes-avx
 make clean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
 CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
@@ -56,7 +43,6 @@ mv cpuminer cpuminer-aes-sse42
 make clean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
 CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
@@ -66,7 +52,6 @@ mv cpuminer cpuminer-sse42
 make clean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
 CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
@@ -76,7 +61,6 @@ mv cpuminer cpuminer-ssse3
 make clean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
 CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
@@ -86,8 +70,7 @@ mv cpuminer cpuminer-sse2
 make clean || echo done
 rm -f config.status
-./autogen.sh || echo done
+CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
 CFLAGS="-O3 -march=znver1 -DRYZEN_ -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-zen.exe
@@ -96,13 +79,8 @@ mv cpuminer cpuminer-zen
 make clean || echo done
 rm -f config.status
 ./autogen.sh || echo done
 CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 make -j 16
 strip -s cpuminer.exe
 #mv cpuminer.exe cpuminer-native.exe
 strip -s cpuminer
 #mv cpuminer cpuminer-native
 #make clean || echo done
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.3.1.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.4.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.9.3.1'
+PACKAGE_VERSION='3.9.4'
-PACKAGE_STRING='cpuminer-opt 3.9.3.1'
+PACKAGE_STRING='cpuminer-opt 3.9.4'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.9.3.1 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.9.4 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.9.3.1:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.9.4:";;
   esac
  cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.9.3.1
+cpuminer-opt configure 3.9.4
 generated by GNU Autoconf 2.69
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 3.9.3.1, which was
+It was created by cpuminer-opt $as_me 3.9.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  $ $0 $@
@@ -2993,7 +2993,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.9.3.1'
+ VERSION='3.9.4'
 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.9.3.1, which was
+This file was extended by cpuminer-opt $as_me 3.9.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.9.3.1
+cpuminer-opt config.status 3.9.4
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.9.3.1])
+AC_INIT([cpuminer-opt], [3.9.4])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -1851,14 +1851,14 @@ static void *miner_thread( void *userdata )
   {
 #if AFFINITY_USES_UINT128
       // Default affinity
-       if ( (opt_affinity == i128_neg1 ) && opt_n_threads > 1 )
+       if ( (opt_affinity == (uint128_t)(-1) ) && opt_n_threads > 1 )
       {  
         if ( opt_debug )
            applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
                    thr_id, thr_id % num_cpus,
-	                 u128_hi64( (uint128_t)1ULL << (thr_id % num_cpus) ),
+	                 u128_hi64( (uint128_t)1 << (thr_id % num_cpus) ),
-		              u128_lo64( (uint128_t)1ULL << (thr_id % num_cpus) ) );
+		              u128_lo64( (uint128_t)1 << (thr_id % num_cpus) ) );
-         affine_to_cpu_mask( thr_id, (uint128_t)1ULL << (thr_id % num_cpus) );
+         affine_to_cpu_mask( thr_id, (uint128_t)1 << (thr_id % num_cpus) );
       }
 #else
       if ( (opt_affinity == -1LL) && opt_n_threads > 1 ) 
@@ -2322,7 +2322,7 @@ bool jr2_stratum_handle_response( json_t *val )
 static bool stratum_handle_response( char *buf )
 {
-	json_t *val, *id_val;
+	json_t *val, *id_val, *res_val;
 	json_error_t err;
 	bool ret = false;
@@ -2332,8 +2332,10 @@ static bool stratum_handle_response( char *buf )
           applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text);
 	   goto out;
 	}
-        json_object_get( val, "result" );
+   res_val = json_object_get( val, "result" );
-	id_val = json_object_get( val, "id" );
+   if ( !res_val ) { /* now what? */ }
   id_val = json_object_get( val, "id" );
 	if ( !id_val || json_is_null(id_val) )
 		goto out;
        if ( !algo_gate.stratum_handle_response( val ) )
--- a/simd-utils.h
+++ b/simd-utils.h
@@ -80,6 +80,7 @@
 //     the element size.
 //   - there is a subset of some functions for scalar data. They may have
 //     no prefix nor vec-size, just one size, the size of the data.
 //   - Some integer functions are also defined which use a similar notation.
 //   
 //    Function names follow this pattern:
 //
@@ -89,7 +90,7 @@
 //            are some examples:
 //
 //    u64:  unsigned 64 bit integer function
-//    i128: signed 128 bit integer function
+//    i128: signed 128 bit integer function (rarely used)
 //    m128: 128 bit vector identifier
 //    mm128: 128 bit vector function
 //
@@ -137,14 +138,32 @@
 // improve high level code readability without the penalty of function
 // overhead.
 //
 // A major restructuring is taking place shifting the focus from pointers
 // to registers. Previously pointer casting used memory to provide transparency
 // leaving it up to the compiler to manage everything and it does a very good
 // job. The focus has shifted to register arguments for more control
 // over the actual instructions assuming the data is in a register and the
 // the compiler just needs to manage the registers.
 //
 // Rather than use pointers to provide type transparency
 // specific instructions are used to access specific data as specific types.
 // Previously pointers were cast and the compiler was left to find a way
 // to get the data from wherever it happened to be to the correct registers.
 //
 // The utilities defined here make use features like register aliasing
 // to optimize operations. Many operations have specialized versions as
 // well as more generic versions. It is preferable to use a specialized
 // version whenever possible a sthey can take advantage of certain
 // optimizations not available to the generic version. Specically the generic
 // version usually has a second argument used is some extra calculations.
 // 
 ///////////////////////////////////////////////////////
 #include <inttypes.h>
 #include <x86intrin.h>
 #include <memory.h>
 #include <stdlib.h>
 #include <stdbool.h>
 // byteswap.h doesn't exist on Windows, find alternative
 //#include <byteswap.h>
 // Various types and overlays
 #include "simd-utils/simd-types.h"
@@ -157,6 +176,7 @@
 // 64 bit vectors
 #include "simd-utils/simd-mmx.h"
 #include "simd-utils/intrlv-mmx.h"
 #if defined(__SSE2__)
 // 128 bit vectors
--- a/simd-utils/intrlv-avx2.h
+++ b/simd-utils/intrlv-avx2.h
@@ -384,6 +384,7 @@ static inline void mm256_intrlv_8x32( void *d, const void *s0,
   // bit_len == 1024
 }
 // A couple of mining specifi functions.
 // Interleave 80 bytes of 32 bit data for 8 lanes.
 static inline void mm256_bswap_intrlv80_8x32( void *d, const void *s )
@@ -469,6 +470,20 @@ static inline void mm256_bswap_intrlv80_4x64( void *d, const void *s )
   mm256_bswap_intrlv_4x64_128( d+256, casti_m128i( s, 4 ) );
 }
 // Blend 32 byte lanes of hash from 2 sources according to control mask.
 // macro due to 256 bit value arg.
 #define mm256_blend_hash_4x64( dst, a, b, mask ) \
 do { \
    dst[0] = _mm256_blendv_epi8( a[0], b[0], mask ); \
    dst[1] = _mm256_blendv_epi8( a[1], b[1], mask ); \
    dst[2] = _mm256_blendv_epi8( a[2], b[2], mask ); \
    dst[3] = _mm256_blendv_epi8( a[3], b[3], mask ); \
    dst[4] = _mm256_blendv_epi8( a[4], b[4], mask ); \
    dst[5] = _mm256_blendv_epi8( a[5], b[5], mask ); \
    dst[6] = _mm256_blendv_epi8( a[6], b[6], mask ); \
    dst[7] = _mm256_blendv_epi8( a[7], b[7], mask ); \
 } while(0)
 // Deinterleave 4 buffers of 64 bit data from the source buffer.
 // bit_len must be 256, 512, 640 or 1024 bits.
 // Requires overrun padding for 640 bit len.
--- a/simd-utils/simd-avx2.h
+++ b/simd-utils/simd-avx2.h
@@ -103,6 +103,29 @@
 #define mm128_extr_lo128_256( a ) _mm256_castsi256_si128( a )
 #define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 )
 // Extract 4 u64 from 256 bit vector.
 #define mm256_extr_4x64( a0, a1, a2, a3, src ) \
 do { \
  __m128i hi = _mm256_extracti128_si256( src, 1 ); \
  a0 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 0 ); \
  a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \
  a2 = _mm_extract_epi64( hi, 0 ); \
  a3 = _mm_extract_epi64( hi, 1 ); \
 } while(0)
 #define mm256_extr_8x32( a0, a1, a2, a3, a4, a5, a6, a7, src ) \
 do { \
  __m128i hi = _mm256_extracti128_si256( src, 1 ); \
  a0 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 0 ); \
  a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \
  a2 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 2 ); \
  a3 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 3 ); \
  a4 = _mm_extract_epi32( hi, 0 ); \
  a5 = _mm_extract_epi32( hi, 1 ); \
  a6 = _mm_extract_epi32( hi, 2 ); \
  a7 = _mm_extract_epi32( hi, 3 ); \
 } while(0)
 // input __m128i, returns __m256i
 // To build a 256 bit vector from 2 128 bit vectors lo must be done first.
 // lo alone leaves hi undefined, hi alone leaves lo unchanged.
@@ -111,10 +134,24 @@
 #define mm256_ins_lo128_256( a, b )  _mm256_inserti128_si256( a, b, 0 )
 #define mm256_ins_hi128_256( a, b )  _mm256_inserti128_si256( a, b, 1 )
-// concatenate two 128 bit vectors into one 256 bit vector
+// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
 #define mm256_concat_128( hi, lo ) \
   mm256_ins_hi128_256( _mm256_castsi128_si256( lo ), hi )
 // Horizontal vector testing
 // Bit-wise test of entire vector, useful to test results of cmp.
 #define mm256_anybits0( a ) \
         ( (uint128_t)mm128_extr_hi128_256( a ) \
         | (uint128_t)mm128_extr_lo128_256( a ) )
 #define mm256_anybits1( a ) \
         ( ( (uint128_t)mm128_extr_hi128_256( a ) + 1 ) \
         | ( (uint128_t)mm128_extr_lo128_256( a ) + 1 ) )
 #define mm256_allbits0_256( a ) ( !mm256_anybits1(a) )
 #define mm256_allbits1_256( a ) ( !mm256_anybits0(a) )
 // Parallel AES, for when x is expected to be in a 256 bit register.
 #define mm256_aesenc_2x128( x ) \
     mm256_concat_128( \
--- a/simd-utils/simd-int.h
+++ b/simd-utils/simd-int.h
@@ -3,9 +3,15 @@
 ///////////////////////////////////
 //
-//    Integers up to 64 bits.
+//    Integers up to 128 bits.
 //
-
+//   These utilities enhance support for integers up to 128 bits.
 //   All standard operations are supported on 128 bit integers except
 //   numeric constant representation and IO. 128 bit integers must be built
 //   and displayed as 2 64 bit halves, just like the old times.
 //
 //   Some utilities are also provided for smaller integers, most notably
 //   bit rotation.   
 // MMX has no extract instruction for 32 bit elements so this:
 // Lo is trivial, high is a simple shift. 
@@ -17,7 +23,6 @@
 #define u64_extr_16( a, n )  ( (uint16_t)( (a) >> ( ( 4-(n)) <<4 ) ) )
 #define u64_extr_8(  a, n )  ( (uint8_t) ( (a) >> ( ( 8-(n)) <<3 ) ) )
 // Rotate bits in various sized integers.
 #define u64_ror_64( x, c ) \
      (uint64_t)( ( (uint64_t)(x) >> (c) ) | ( (uint64_t)(x) << (64-(c)) ) )
@@ -36,6 +41,9 @@
 #define u8_rol_8( x, c ) \
      (uint8_t) ( ( (uint8_t) (x) << (c) ) | ( (uint8_t) (x) >> ( 8-(c)) ) )
 // Endian byte swap
 #define bswap_64( a ) __builtin_bswap64( a )
 #define bswap_32( a ) __builtin_bswap32( a )
 // 64 bit mem functions use integral sizes instead of bytes, data must
 // be aligned to 64 bits. Mostly for scaled indexing convenience.
@@ -56,21 +64,20 @@ static inline void memset_64( uint64_t *dst, const uint64_t a,  int n )
 //
 // No real need or use.
-#define i128_neg1        ((uint128_t)(-1LL))
+//#define u128_neg1        ((uint128_t)(-1))
-// Extract specified 64 bit half of 128 bit integer.
+// Extracting the low bits is a trivial cast.
-// typecast should work for lo: (uint64_t)(x), test it!
+// These specialized functions are optimized while providing a
 // consistent interface.
 #define u128_hi64( x )    ( (uint64_t)( (uint128_t)(x) >> 64 ) )
-#define u128_lo64( x )    ( (uint64_t)( (uint128_t)(x) << 64 >> 64 ) )
+#define u128_lo64( x )    ( (uint64_t)(x) )
 // #define i128_lo64( x )    ((uint64_t)(x))
-// Generic extract, 
+// Generic extract, don't use for extracting low bits, cast instead.
 #define u128_extr_64( a, n )  ( (uint64_t)( (a) >> ( ( 2-(n)) <<6 ) ) )
 #define u128_extr_32( a, n )  ( (uint32_t)( (a) >> ( ( 4-(n)) <<5 ) ) )
 #define u128_extr_16( a, n )  ( (uint16_t)( (a) >> ( ( 8-(n)) <<4 ) ) )
 #define u128_extr_8(  a, n )  ( (uint8_t) ( (a) >> ( (16-(n)) <<3 ) ) )
 // Not much need for this but it fills a gap.
 #define u128_ror_128( x, c ) \
       ( ( (uint128_t)(x) >> (c) ) | ( (uint128_t)(x) << (128-(c)) ) )
--- a/simd-utils/simd-mmx.h
+++ b/simd-utils/simd-mmx.h
@@ -111,8 +111,8 @@
 #if defined(__SSSE3__)
-// An SSE2 versin of this would be monstrous, shifting, masking and oring
+// An SSE2 or MMX version of this would be monstrous, shifting, masking and
-// each byte individually.
+// oring each byte individually.
 #define mm64_invert_8(  v ) \
    _mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 0,1,2,3,4,5,6,7 ) );
--- a/simd-utils/simd-sse2.h
+++ b/simd-utils/simd-sse2.h
@@ -10,11 +10,21 @@
 // SSE2 is generally required for full 128 bit support. Some functions
 // are also optimized with SSSE3 or SSE4.1.
 //
 // Do not call _mm_extract directly, it isn't supported in SSE2.
 // Use mm128_extr instead, it will select the appropriate implementation.
 //
 // 128 bit operations are enhanced with uint128 which adds 128 bit integer
 // support for arithmetic and other operations. Casting to uint128_t is not
 // free, it requires a move from mmx to gpr but is often the only way or
 // the more efficient way for certain operations.
 // Compile time constant initializers are type agnostic and can have
 // a pointer handle of almost any type. All arguments must be scalar constants.
 // up to 64 bits. These iniitializers should only be used at compile time
 // to initialize vector arrays. All data reside in memory.
 //
 // These are of limited use, it is often simpler to use uint64_t arrays
 // and cast as required.
 #define mm128_const_64( x1, x0 ) {{ x1, x0 }}
 #define mm128_const1_64( x )     {{  x,  x }}
@@ -80,6 +90,28 @@
 #define mm128_negate_32( v )    _mm_sub_epi32( m128_zero, v )  
 #define mm128_negate_16( v )    _mm_sub_epi16( m128_zero, v )  
 // Use uint128_t for most arithmetic, bit shift, comparison operations
 // spanning all 128 bits. Some extractions are also more efficient 
 // casting __m128i as uint128_t and usingstandard operators.
 // This isn't cheap, not suitable for bulk usage.
 #define mm128_extr_4x32( a0, a1, a2, a3, src ) \
 do { \
  a0 = _mm_extract_epi32( src, 0 ); \
  a1 = _mm_extract_epi32( src, 1 ); \
  a1 = _mm_extract_epi32( src, 2 ); \
  a3 = _mm_extract_epi32( src, 3 ); \
 } while(0)
 // Horizontal vector testing
 // Bit-wise test of entire vector, useful to test results of cmp.
 #define mm128_anybits0( a ) (uint128_t)(a)
 #define mm128_anybits1( a ) (((uint128_t)(a))+1)
 #define mm128_allbits0( a ) ( !mm128_anybits1(a) )
 #define mm128_allbits1( a ) ( !mm128_anybits0(a) )
 //
 // Vector pointer cast
--- a/simd-utils/simd-types.h
+++ b/simd-utils/simd-types.h
@@ -93,6 +93,7 @@
 // my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
 // Compiler check for __int128 support
 // Configure also has a test for int128.
 #if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
  #define GCC_INT128 1
 #endif
@@ -386,13 +387,3 @@ typedef union _regarray_v256 regarray_v256;
 #define u8_1e u8_._1e
 #define u8_1f u8_._1f
 // This is in use by, coincidentally, simd hash.
 union _m256_v16 {
  uint16_t u16[16];
  __m256i v256;
 };
 typedef union _m256_v16 m256_v16;
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -34,7 +34,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
 make distclean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-O3 -march=znver1 -DRYZEN_ -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
 make -j 16
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-zen.exe