v3.9.4

2026-07-14 10:56:50 +00:00 · 2019-06-18 13:15:45 -04:00
parent 71d6b97ee8
commit d6e8d7a46e
75 changed files with 1790 additions and 781 deletions
--- a/algo/argon2/argon2d/argon2d/core.c
+++ b/algo/argon2/argon2d/argon2d/core.c
@@ -112,7 +112,7 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
 void free_memory(const argon2_context *context, uint8_t *memory,
                 size_t num, size_t size) {
    size_t memory_size = num*size;
-    clear_internal_memory(memory, memory_size);
+//    clear_internal_memory(memory, memory_size);
    if (context->free_cbk) {
        (context->free_cbk)(memory, memory_size);
    } else {
@@ -137,7 +137,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
 int FLAG_clear_internal_memory = 0;
 void clear_internal_memory(void *v, size_t n) {
  if (FLAG_clear_internal_memory && v) {
-    secure_wipe_memory(v, n);
+//    secure_wipe_memory(v, n);
  }
 }

@@ -559,7 +559,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                       context->pwdlen);

        if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
-            secure_wipe_memory(context->pwd, context->pwdlen);
+//            secure_wipe_memory(context->pwd, context->pwdlen);
            context->pwdlen = 0;
        }
    }
@@ -580,7 +580,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                       context->secretlen);

        if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
-            secure_wipe_memory(context->secret, context->secretlen);
+//            secure_wipe_memory(context->secret, context->secretlen);
            context->secretlen = 0;
        }
    }
--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -537,6 +537,8 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
      }
   }
   sc->ptr = ptr;
+
+
   if ( h1 != sc->H )
        memcpy_128( sc->H, h1, 16 );
 }
@@ -571,6 +573,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,

   for ( u = 0; u < 16; u ++ )
      buf[u] = h2[u];
+
   compress_small( buf, (__m128i*)final_s, h1 );

   for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
@@ -1041,22 +1044,22 @@ static const __m256i final_s8[16] =

 void bmw256_8way_init( bmw256_8way_context *ctx )
 {
-   ctx->H[ 0] = _mm256_set1_epi64x( IV256[ 0] );
-   ctx->H[ 1] = _mm256_set1_epi64x( IV256[ 1] );
-   ctx->H[ 2] = _mm256_set1_epi64x( IV256[ 2] );
-   ctx->H[ 3] = _mm256_set1_epi64x( IV256[ 3] );
-   ctx->H[ 4] = _mm256_set1_epi64x( IV256[ 4] );
-   ctx->H[ 5] = _mm256_set1_epi64x( IV256[ 5] );
-   ctx->H[ 6] = _mm256_set1_epi64x( IV256[ 6] );
-   ctx->H[ 7] = _mm256_set1_epi64x( IV256[ 7] );
-   ctx->H[ 8] = _mm256_set1_epi64x( IV256[ 8] );
-   ctx->H[ 9] = _mm256_set1_epi64x( IV256[ 9] );
-   ctx->H[10] = _mm256_set1_epi64x( IV256[10] );
-   ctx->H[11] = _mm256_set1_epi64x( IV256[11] );
-   ctx->H[12] = _mm256_set1_epi64x( IV256[12] );
-   ctx->H[13] = _mm256_set1_epi64x( IV256[13] );
-   ctx->H[14] = _mm256_set1_epi64x( IV256[14] );
-   ctx->H[15] = _mm256_set1_epi64x( IV256[15] );
+   ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] );
+   ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] );
+   ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] );
+   ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] );
+   ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] );
+   ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] );
+   ctx->H[ 6] = _mm256_set1_epi32( IV256[ 6] );
+   ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] );
+   ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] );
+   ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] );
+   ctx->H[10] = _mm256_set1_epi32( IV256[10] );
+   ctx->H[11] = _mm256_set1_epi32( IV256[11] );
+   ctx->H[12] = _mm256_set1_epi32( IV256[12] );
+   ctx->H[13] = _mm256_set1_epi32( IV256[13] );
+   ctx->H[14] = _mm256_set1_epi32( IV256[14] );
+   ctx->H[15] = _mm256_set1_epi32( IV256[15] );
   ctx->ptr       = 0;
   ctx->bit_count = 0;

@@ -1076,14 +1079,15 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
   ptr = ctx->ptr;
   h1 = ctx->H;
   h2 = htmp;
+
   while ( len > 0 )
   {
      size_t clen;
      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
-      memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
-      vdata = vdata + (clen>>3);
+      memcpy_256( buf + (ptr>>2), vdata, clen >> 2 );
+      vdata = vdata + (clen>>2);
      len -= clen;
      ptr += clen;
      if ( ptr == buf_size )
@@ -1097,6 +1101,7 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
      }
   }
   ctx->ptr = ptr;
+
   if ( h1 != ctx->H )
        memcpy_256( ctx->H, h1, 16 );
 }
@@ -1106,24 +1111,26 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
   __m256i *buf;
   __m256i h1[16], h2[16], *h;
   size_t ptr, u, v;
-//   unsigned z;
   const int buf_size = 64;  // bytes of one lane, compatible with len

   buf = ctx->buf;
   ptr = ctx->ptr;
-   buf[ ptr>>3 ] = _mm256_set1_epi32( 0x80 );
-   ptr += 8;
+   buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
+   ptr += 4;
   h = ctx->H;

-   if (  ptr > (buf_size - 8) )
+   if (  ptr > (buf_size - 4) )
   {
-      memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+      memset_zero_256( buf + (ptr>>2), (buf_size - ptr) >> 2 );
      compress_small_8way( buf, h, h1 );
      ptr = 0;
      h = h1;
   }
-   memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
-   buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( ctx->bit_count );
+   memset_zero_256( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
+   buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count );
+   buf[ (buf_size - 4) >> 2 ] = m256_zero;
+
+
   compress_small_8way( buf, h, h2 );

   for ( u = 0; u < 16; u ++ )
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -47,7 +47,9 @@ bool lyra2rev3_thread_init()

   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
   l2v3_wholeMatrix = _mm_malloc( size, 64 );
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
+   init_lyra2rev3_8way_ctx();;
+#elif defined (LYRA2REV3_4WAY)
   init_lyra2rev3_4way_ctx();;
 #else
   init_lyra2rev3_ctx();
@@ -57,7 +59,10 @@ bool lyra2rev3_thread_init()

 bool register_lyra2rev3_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
+  gate->scanhash  = (void*)&scanhash_lyra2rev3_8way;
+  gate->hash      = (void*)&lyra2rev3_8way_hash;
+#elif defined (LYRA2REV3_4WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_4way;
  gate->hash      = (void*)&lyra2rev3_4way_hash;
 #else
@@ -203,13 +208,18 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )

 bool register_phi2_algo( algo_gate_t* gate )
 {
-   init_phi2_ctx();
+//   init_phi2_ctx();
   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
   gate->get_work_data_size = (void*)&phi2_get_work_data_size;
   gate->decode_extra_data  = (void*)&phi2_decode_extra_data;
   gate->build_extraheader  = (void*)&phi2_build_extraheader;
   gate->set_target         = (void*)&alt_set_target; 
   gate->get_max64          = (void*)&get_max64_0xffffLL;
+#if defined(PHI2_4WAY)
+   gate->scanhash           = (void*)&scanhash_phi2_4way;
+#else
+   init_phi2_ctx();
   gate->scanhash           = (void*)&scanhash_phi2;
+#endif
   return true;
 }
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -5,7 +5,9 @@
 #include <stdint.h>
 #include "lyra2.h"

-//#if defined(__AVX2__)
+#if defined(__AVX2__)
+  #define LYRA2REV3_8WAY
+#endif

 #if defined(__SSE2__)
  #define LYRA2REV3_4WAY
@@ -14,8 +16,14 @@
 extern __thread uint64_t* l2v3_wholeMatrix;

 bool register_lyra2rev3_algo( algo_gate_t* gate );
+#if defined(LYRA2REV3_8WAY)

-#if defined(LYRA2REV3_4WAY)
+void lyra2rev3_8way_hash( void *state, const void *input );
+int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr );
+bool init_lyra2rev3_8way_ctx();
+
+#elif defined(LYRA2REV3_4WAY)

 void lyra2rev3_4way_hash( void *state, const void *input );
 int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -142,15 +150,29 @@ bool init_allium_ctx();

 /////////////////////////////////////////

+#if defined(__AVX2__) && defined(__AES__)
+//  #define PHI2_4WAY
+#endif
+
 bool phi2_has_roots;

 bool register_phi2_algo( algo_gate_t* gate );
+#if defined(PHI2_4WAY)
+
+void phi2_hash_4way( void *state, const void *input );
+int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr );
+//void init_phi2_ctx();
+
+#else

 void phi2_hash( void *state, const void *input );
 int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 void init_phi2_ctx();

+#endif
+
 #endif  // LYRA2_GATE_H__


--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -1,12 +1,138 @@
 #include "lyra2-gate.h"
 #include <memory.h>

-#if defined (LYRA2REV3_4WAY)	
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 

+
+#if defined (LYRA2REV3_8WAY)
+
+typedef struct {
+   blake256_8way_context     blake;
+   cubehashParam             cube;
+   bmw256_8way_context       bmw;
+} lyra2v3_8way_ctx_holder;
+
+static lyra2v3_8way_ctx_holder l2v3_8way_ctx;
+
+bool init_lyra2rev3_8way_ctx()
+{
+   blake256_8way_init( &l2v3_8way_ctx.blake );
+   cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
+   bmw256_8way_init( &l2v3_8way_ctx.bmw );
+   return true;
+}
+
+void lyra2rev3_8way_hash( void *state, const void *input )
+{
+   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   uint32_t hash4[8] __attribute__ ((aligned (32)));
+   uint32_t hash5[8] __attribute__ ((aligned (32)));
+   uint32_t hash6[8] __attribute__ ((aligned (32)));
+   uint32_t hash7[8] __attribute__ ((aligned (32)));
+   lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
+
+   blake256_8way( &ctx.blake, input, 80 );
+   blake256_8way_close( &ctx.blake, vhash );
+
+   mm256_dintrlv_8x32( hash0, hash1, hash2, hash3,
+                       hash4, hash5, hash6, hash7, vhash, 256 );
+
+   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
+
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash4, (const byte*) hash4, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash5, (const byte*) hash5, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash6, (const byte*) hash6, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash7, (const byte*) hash7, 32 );
+
+   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
+
+   mm256_intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
+                             hash4, hash5, hash6, hash7, 256 );
+
+   bmw256_8way( &ctx.bmw, vhash, 32 );
+   bmw256_8way_close( &ctx.bmw, state );
+
+   }
+
+int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<3]);
+   uint32_t lane_hash[8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t Htarg = ptarget[7];
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+
+   mm256_bswap_intrlv80_8x32( vdata, pdata );
+   do
+   {
+      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
+                                                  n+3, n+2, n+1, n ) );
+
+      lyra2rev3_8way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
+      {
+         mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+              pdata[19] = n + lane;
+              submit_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 8;
+   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#endif
+
+#if defined (LYRA2REV3_4WAY)  
+
+
 typedef struct {
   blake256_4way_context     blake;
   cubehashParam             cube;
--- a/algo/lyra2/phi2-4way.c
+++ b/algo/lyra2/phi2-4way.c
@@ -0,0 +1,233 @@
+/**
+ * Phi-2 algo Implementation
+ */
+
+#include "lyra2-gate.h"
+
+#if defined(PHI2_4WAY)
+
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/gost/sph_gost.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/echo/aes_ni/hash_api.h"
+
+typedef struct {
+     cubehashParam           cube;
+     jh512_4way_context      jh;
+     hashState_echo          echo;
+//     hashState_echo          echo2;
+     sph_gost512_context     gost;
+     skein512_4way_context   skein;
+} phi2_ctx_holder;
+/*
+phi2_ctx_holder phi2_ctx;
+
+void init_phi2_ctx()
+{
+   cubehashInit( &phi2_ctx.cube, 512, 16, 32 );
+   sph_jh512_init(&phi2_ctx.jh);
+   init_echo( &phi2_ctx.echo1, 512 );
+   init_echo( &phi2_ctx.echo2, 512 );
+   sph_gost512_init(&phi2_ctx.gost);
+   sph_skein512_init(&phi2_ctx.skein);
+};
+*/
+void phi2_hash_4way( void *state, const void *input )
+{
+   uint32_t hash[4][16] __attribute__ ((aligned (64)));
+   uint32_t hashA[4][16] __attribute__ ((aligned (64)));
+   uint32_t hashB[4][16] __attribute__ ((aligned (64)));
+   uint32_t vhash[4*16] __attribute__ ((aligned (64)));
+
+//   unsigned char _ALIGN(128) hash[64];
+//	unsigned char _ALIGN(128) hashA[64];
+//	unsigned char _ALIGN(128) hashB[64];
+
+   phi2_ctx_holder ctx __attribute__ ((aligned (64)));
+//  memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
+
+   cubehashInit( &ctx.cube, 512, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[0], (const byte*)input,
+                        phi2_has_roots ? 144 : 80 );
+   cubehashInit( &ctx.cube, 512, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[1], (const byte*)input+144,
+                        phi2_has_roots ? 144 : 80 );
+   cubehashInit( &ctx.cube, 512, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[2], (const byte*)input+288,
+                        phi2_has_roots ? 144 : 80 );
+   cubehashInit( &ctx.cube, 512, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[3], (const byte*)input+432,
+                        phi2_has_roots ? 144 : 80 );
+
+	LYRA2RE( &hashA[0][0], 32, &hashB[0][0], 32, &hashB[0][0], 32, 1, 8, 8 );
+	LYRA2RE( &hashA[0][8], 32, &hashB[0][8], 32, &hashB[0][8], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[1][0], 32, &hashB[1][0], 32, &hashB[1][0], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[1][8], 32, &hashB[1][8], 32, &hashB[1][8], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[2][0], 32, &hashB[2][0], 32, &hashB[2][0], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[2][8], 32, &hashB[2][8], 32, &hashB[2][8], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[3][0], 32, &hashB[3][0], 32, &hashB[3][0], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[3][8], 32, &hashB[3][8], 32, &hashB[3][8], 32, 1, 8, 8 );
+
+   mm256_intrlv_4x64( vhash, hashA[0], hashA[1], hashA[2], hashA[3], 512 );
+
+   jh512_4way_init( &ctx.jh );
+   jh512_4way( &ctx.jh, vhash, 64 );
+   jh512_4way_close( &ctx.jh, vhash );
+
+   mm256_dintrlv_4x64( hash[0], hash[1], hash[2], hash[3], vhash, 512 );
+
+   if ( hash[0][0] & 1 )
+  	{
+      sph_gost512_init( &ctx.gost );
+      sph_gost512( &ctx.gost, (const void*)hash[0], 64 );
+	   sph_gost512_close( &ctx.gost, (void*)hash[0] );
+	}
+  	else
+  	{
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
+                          (const BitSequence *)hash[0], 512 );
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
+                          (const BitSequence *)hash[0], 512 );
+	}
+
+   if ( hash[1][0] & 1 )
+   {
+      sph_gost512_init( &ctx.gost );
+      sph_gost512( &ctx.gost, (const void*)hash[1], 64 );
+      sph_gost512_close( &ctx.gost, (void*)hash[1] );
+   }
+   else
+   {
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
+                          (const BitSequence *)hash[1], 512 );
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
+                          (const BitSequence *)hash[1], 512 );
+   }
+
+   if ( hash[2][0] & 1 )
+   {
+      sph_gost512_init( &ctx.gost );
+      sph_gost512( &ctx.gost, (const void*)hash[2], 64 );
+      sph_gost512_close( &ctx.gost, (void*)hash[2] );
+   }
+   else
+   {
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
+                          (const BitSequence *)hash[2], 512 );
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
+                          (const BitSequence *)hash[2], 512 );
+   }
+
+   if ( hash[3][0] & 1 )
+   {
+      sph_gost512_init( &ctx.gost );
+      sph_gost512( &ctx.gost, (const void*)hash[3], 64 );
+      sph_gost512_close( &ctx.gost, (void*)hash[3] );
+   }
+   else
+   {
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
+                          (const BitSequence *)hash[3], 512 );
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
+                          (const BitSequence *)hash[3], 512 );
+   }
+
+   mm256_intrlv_4x64( vhash, hash[0], hash[1], hash[2], hash[3], 512 );
+   
+   skein512_4way_init( &ctx.skein );
+	skein512_4way( &ctx.skein, vhash, 64 );
+	skein512_4way_close( &ctx.skein, vhash );
+
+   for (int i=0; i<4; i++)
+   {
+      ( (uint64_t*)vhash    )[i] ^= ( (uint64_t*)vhash    )[i+4];
+      ( (uint64_t*)vhash+ 8 )[i] ^= ( (uint64_t*)vhash+ 8 )[i+4];
+      ( (uint64_t*)vhash+16 )[i] ^= ( (uint64_t*)vhash+16 )[i+4];
+      ( (uint64_t*)vhash+24 )[i] ^= ( (uint64_t*)vhash+24 )[i+4];
+   }
+//   for ( int i = 0; i < 4; i++ )
+//      casti_m256i( vhash, i ) = _mm256_xor_si256( casti_m256i( vhash, i   ),
+//                                                  casti_m256i( vhash, i+4 ) );
+
+	memcpy( state, vhash, 128 );
+}
+
+int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+	                     uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t _ALIGN(128) hash[8];
+   uint32_t _ALIGN(128) edata[36];
+   uint32_t vdata[4][36] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[25]);
+   uint32_t lane_hash[8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+
+   if(opt_benchmark){
+   	ptarget[7] = 0x00ff;
+   }
+
+// Data is not interleaved, but hash is.
+// any non-zero data at index 20 or above sets roots true.
+// Split up the operations, bswap first, then set roots.
+
+   phi2_has_roots = false;
+   for ( int i=0; i < 36; i++ )
+   {
+   be32enc(&edata[i], pdata[i]);
+   if (i >= 20 && pdata[i]) phi2_has_roots = true;
+   }
+/*
+   casti_m256i( vdata[0], 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );   
+   casti_m256i( vdata[0], 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+   casti_m256i( vdata[0], 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) );
+   casti_m256i( vdata[0], 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) );
+   casti_m128i( vdata[0], 8 ) = mm128_bswap_32( casti_m128i( pdata, 8 ) );
+   phi2_has_roots = mm128_anybits1( casti_m128i( vdata[0], 5 ) ) ||
+                    mm128_anybits1( casti_m128i( vdata[0], 6 ) ) ||
+                    mm128_anybits1( casti_m128i( vdata[0], 7 ) ) ||
+                    mm128_anybits1( casti_m128i( vdata[0], 8 ) );
+*/   
+
+   memcpy( vdata[0], edata, 144 );
+   memcpy( vdata[1], edata, 144 );
+   memcpy( vdata[2], edata, 144 );
+   memcpy( vdata[3], edata, 144 );
+
+   do {
+      be32enc( &vdata[0][19], n );
+      be32enc( &vdata[1][19], n+1 );
+      be32enc( &vdata[2][19], n+2 );
+      be32enc( &vdata[3][19], n+3 );
+
+      phi2_hash_4way( hash, vdata );
+
+      for ( int lane = 0; lane < 4; lane++ ) if (  hash7[ lane<<1 ] < Htarg )
+      {
+          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          {
+              pdata[19] = n + lane;
+              submit_solution( work, lane_hash, mythr, lane );
+          }
+       }
+       n += 4;
+    } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
+    *hashes_done = n - first_nonce + 1;
+    return 0;
+}
+   
+#endif  // PHI2_4WAY
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -144,8 +144,8 @@ void init_m7m_ctx()
 #define NM7M 5
 #define SW_DIVS 5
 #define M7_MIDSTATE_LEN 76
-int scanhash_m7m_hash( int thr_id, struct work* work,
-                       uint64_t max_nonce, unsigned long *hashes_done )
+int scanhash_m7m_hash( int thr_id, struct work* work, uint64_t max_nonce,
+                       unsigned long *hashes_done, struct thr_info *mythr )
 {
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
@@ -154,6 +154,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
    uint32_t hash[8] __attribute__((aligned(64)));
    uint8_t bhash[7][64] __attribute__((aligned(64)));
    uint32_t n = pdata[19] - 1;
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    uint32_t usw_, mpzscale;
    const uint32_t first_nonce = pdata[19];
    char data_str[161], hash_str[65], target_str[65];
--- a/algo/pluck.c
+++ b/algo/pluck.c
@@ -445,7 +445,7 @@ void pluck_hash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const i
 }

 int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce,
-        uint64_t *hashes_done  )
+        uint64_t *hashes_done, struct thr_info *mythr  )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -454,6 +454,8 @@ int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce,
 	const uint32_t first_nonce = pdata[19];
 	volatile uint8_t *restart = &(work_restart[thr_id].restart);
 	uint32_t n = first_nonce;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+

 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0ffff;
--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -48,8 +48,8 @@ void anime_4way_hash( void *state, const void *input )
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
+    const uint32_t mask = 8;
    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
-    int i;
    anime_4way_ctx_holder ctx;
    memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );

@@ -62,27 +62,44 @@ void anime_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );

-    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-    update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                            (char*)hash0, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                            (char*)hash1, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                            (char*)hash2, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                            (char*)hash3, 512 );
-    mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-    skein512_4way( &ctx.skein, vhash, 64 );
-    skein512_4way_close( &ctx.skein, vhashB );
+    if ( hash0[0] & mask )
+    {
+       update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                               (char*)hash0, 512 );
+    }
+    if ( hash1[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                               (char*)hash1, 512 );
+    }
+    if ( hash2[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                               (char*)hash2, 512 );
+    }
+    if ( hash3[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                               (char*)hash3, 512 );
+    }

-    for ( i = 0; i < 8; i++ )
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+    if ( mm256_anybits0( vh_mask ) )
+    {
+       skein512_4way( &ctx.skein, vhash, 64 );
+       skein512_4way_close( &ctx.skein, vhashB );
+    }
+
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
    reinit_groestl( &ctx.groestl );
@@ -91,7 +108,8 @@ void anime_4way_hash( void *state, const void *input )
    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-    mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+    mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

    jh512_4way( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );
@@ -99,16 +117,20 @@ void anime_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );

+    if ( mm256_anybits1( vh_mask ) )
+    {
       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
       blake512_4way_close( &ctx.blake, vhashA );
-
+    }
+    if ( mm256_anybits0( vh_mask ) )
+    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way( &ctx.bmw, vhash, 64 );
       bmw512_4way_close( &ctx.bmw, vhashB );
+    }

-    for ( i = 0; i < 8; i++ )
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

    keccak512_4way( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );
@@ -120,33 +142,35 @@ void anime_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );

-    keccak512_4way_init( &ctx.keccak );
-    keccak512_4way( &ctx.keccak, vhash, 64 );
-    keccak512_4way_close( &ctx.keccak, vhashA );
+    if ( mm256_anybits1( vh_mask ) )
+    {
+       keccak512_4way_init( &ctx.keccak );
+       keccak512_4way( &ctx.keccak, vhash, 64 );
+       keccak512_4way_close( &ctx.keccak, vhashA );
+    }
+    if ( mm256_anybits0( vh_mask ) )
+    {
+       jh512_4way_init( &ctx.jh );
+       jh512_4way( &ctx.jh, vhash, 64 );
+       jh512_4way_close( &ctx.jh, vhashB );
+    }

-    jh512_4way_init( &ctx.jh );
-    jh512_4way( &ctx.jh, vhash, 64 );
-    jh512_4way_close( &ctx.jh, vhashB );
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

-    for ( i = 0; i < 8; i++ )
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
-
-    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
+    mm256_dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-    uint32_t endiandata[20] __attribute__((aligned(64)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
-    uint32_t *nonces = work->nonces;
-    int num_found = 0;
-    uint32_t *noncep = vdata + 73;   // 9*8 + 1
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    const uint32_t Htarg = ptarget[7];
    uint64_t htmax[] = {
                0,
@@ -165,10 +189,7 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
                0
        };

-    swab32_array( endiandata, pdata, 20 );
-
-    uint64_t *edata = (uint64_t*)endiandata;
-    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    mm256_bswap_intrlv80_4x64( vdata, pdata );

    for (int m=0; m < 6; m++)
       if (Htarg <= htmax[m])
@@ -177,30 +198,26 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,

          do
          {
-             be32enc( noncep,   n   );
-             be32enc( noncep+2, n+1 );
-             be32enc( noncep+4, n+2 );
-             be32enc( noncep+6, n+3 );
+             *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

             anime_4way_hash( hash, vdata );
             pdata[19] = n;

             for ( int i = 0; i < 4; i++ )
             if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
-                && fulltest( hash+(i<<3), ptarget ) )
+                && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
             {
                pdata[19] = n+i;
-                nonces[ num_found++ ] = n+i;
-                work_set_target_ratio( work, hash+(i<<3) );
+                submit_solution( work, hash+(i<<3), mythr, i );
             }
             n += 4;
-          } while ( ( num_found == 0 ) && ( n < max_nonce )
-              && !work_restart[thr_id].restart );
+          } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
          break;
       }

    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }

 #endif
--- a/algo/quark/anime-gate.h
+++ b/algo/quark/anime-gate.h
@@ -13,19 +13,15 @@ bool register_anime_algo( algo_gate_t* gate );
 #if defined(ANIME_4WAY)

 void anime_4way_hash( void *state, const void *input );
-
 int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
-
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_anime_4way_ctx();

 #endif

 void anime_hash( void *state, const void *input );
-
 int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
-
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_anime_ctx();

 #endif
--- a/algo/quark/anime.c
+++ b/algo/quark/anime.c
@@ -46,20 +46,6 @@ void init_anime_ctx()
 void anime_hash( void *state, const void *input )
 {
    unsigned char hash[128] __attribute__ ((aligned (32)));
-/*
-    uint64_t hash0[8] __attribute__ ((aligned (64)));
-    uint64_t hash1[8] __attribute__ ((aligned (64)));
-    uint64_t hash2[8] __attribute__ ((aligned (64)));
-    uint64_t hash3[8] __attribute__ ((aligned (64)));
-    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
-    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
-    __m256i* vh  = (__m256i*)vhash;
-    __m256i* vhA = (__m256i*)vhashA;
-    __m256i* vhB = (__m256i*)vhashB;
-    __m256i vh_mask;
-    __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
-*/
    uint32_t mask = 8;
    anime_ctx_holder ctx;
    memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) );
@@ -134,7 +120,7 @@ void anime_hash( void *state, const void *input )
 }

 int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+                         uint64_t *hashes_done, struct thr_info *mythr)
 {
    uint32_t hash[8] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__((aligned(64)));
@@ -142,6 +128,7 @@ int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    const uint32_t Htarg = ptarget[7];
    uint64_t htmax[] = {
                0,
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -0,0 +1,618 @@
+#include "hmq1725-gate.h"
+
+#if defined(HMQ1725_4WAY)
+
+#include <string.h>
+#include <stdint.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/luffa/luffa_for_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/simd/nist.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/simd/simd-hash-2way.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/shabal-hash-4way.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include "algo/haval/haval-hash-4way.h"
+#include "algo/sha/sha2-hash-4way.h"
+
+union _hmq1725_4way_context_overlay
+{
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_groestl       groestl;
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    hashState_luffa         luffa;
+    cubehashParam           cube;
+    sph_shavite512_context  shavite;
+    hashState_sd            simd;
+    hashState_echo          echo;
+    hamsi512_4way_context   hamsi;
+    sph_fugue512_context    fugue;
+    shabal512_4way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_4way_context     sha512;
+    haval256_5_4way_context haval;
+};
+typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
+
+extern void hmq1725_4way_hash(void *state, const void *input)
+{
+// why so big? only really need 8, haval thing uses 16.
+     uint32_t hash0 [32]    __attribute__ ((aligned (64)));
+     uint32_t hash1 [32]    __attribute__ ((aligned (64)));
+     uint32_t hash2 [32]    __attribute__ ((aligned (64)));
+     uint32_t hash3 [32]    __attribute__ ((aligned (64)));
+     uint32_t vhash [32<<2] __attribute__ ((aligned (64)));
+     uint32_t vhashA[32<<2] __attribute__ ((aligned (64)));
+     uint32_t vhashB[32<<2] __attribute__ ((aligned (64)));
+     hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
+     __m256i vh_mask;     
+     const __m256i vmask = _mm256_set1_epi64x( 24 );
+     const uint32_t mask = 24;
+     __m256i* vh  = (__m256i*)vhash;
+     __m256i* vhA = (__m256i*)vhashA;
+     __m256i* vhB = (__m256i*)vhashB;
+
+     bmw512_4way_init( &ctx.bmw );
+     bmw512_4way( &ctx.bmw, input, 80 );
+     bmw512_4way_close( &ctx.bmw, vhash );
+
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash0 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash1 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash2 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash3 );
+
+// first fork, A is groestl serial, B is skein parallel.
+
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                   m256_zero );
+
+// A
+
+//     if ( hash0[0] & mask )
+//     {
+       init_groestl( &ctx.groestl, 64 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                               (char*)hash0, 512 );
+//     }
+//     if ( hash1[0] & mask )
+//     {
+       init_groestl( &ctx.groestl, 64 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                               (char*)hash1, 512 );
+//     }
+//     if ( hash2[0] & mask )
+//     {
+       init_groestl( &ctx.groestl, 64 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                               (char*)hash2, 512 );
+//     }
+//     if ( hash3[0] & mask )
+//     {
+       init_groestl( &ctx.groestl, 64 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                               (char*)hash3, 512 );
+//     }
+
+     mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+// B
+
+//     if ( mm256_any_clr_256( vh_mask ) )
+//     {
+       skein512_4way_init( &ctx.skein );
+       skein512_4way( &ctx.skein, vhash, 64 );
+       skein512_4way_close( &ctx.skein, vhashB );
+//     }
+
+     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+     jh512_4way_init( &ctx.jh );
+     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_close( &ctx.jh, vhash );
+
+     keccak512_4way_init( &ctx.keccak );
+     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_close( &ctx.keccak, vhash );
+
+// second fork, A = blake parallel, B= bmw parallel.
+    
+     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                   m256_zero );
+
+//     if ( mm256_any_set_256( vh_mask ) )
+//     {
+       blake512_4way_init( &ctx.blake );
+       blake512_4way( &ctx.blake, vhash, 64 );
+       blake512_4way_close( &ctx.blake, vhashA );
+//     }
+
+//     if ( mm256_any_clr_256( vh_mask ) )
+//     {
+       bmw512_4way_init( &ctx.bmw );
+       bmw512_4way( &ctx.bmw, vhash, 64 );
+       bmw512_4way_close( &ctx.bmw, vhashB );
+//     }
+
+     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+    
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     init_luffa( &ctx.luffa, 512 );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
+                                   (const BitSequence*)hash0, 64 );
+     init_luffa( &ctx.luffa, 512 );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
+                                   (const BitSequence*)hash1, 64 );
+     init_luffa( &ctx.luffa, 512 );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
+                                   (const BitSequence*)hash2, 64 );
+     init_luffa( &ctx.luffa, 512 );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
+                                   (const BitSequence*)hash3, 64 );
+
+     cubehashInit( &ctx.cube, 512, 16, 32 );
+     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash0,
+                                (const BitSequence *)hash0, 64 );
+     cubehashInit( &ctx.cube, 512, 16, 32 );
+     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash1,
+                                (const BitSequence *)hash1, 64 );
+     cubehashInit( &ctx.cube, 512, 16, 32 );
+     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash2,
+                                (const BitSequence *)hash2, 64 );
+     cubehashInit( &ctx.cube, 512, 16, 32 );
+     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash3,
+                                (const BitSequence *)hash3, 64 );
+
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+// A= keccak parallel, B= jh parallel
+    
+     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                  m256_zero );
+
+//     if ( mm256_any_set_256( vh_mask ) )
+//     {
+        keccak512_4way_init( &ctx.keccak );
+        keccak512_4way( &ctx.keccak, vhash, 64 );
+        keccak512_4way_close( &ctx.keccak, vhashA );
+//     }
+
+//     if ( mm256_any_clr_256( vh_mask ) )
+//     {
+        jh512_4way_init( &ctx.jh );
+        jh512_4way( &ctx.jh, vhash, 64 );
+        jh512_4way_close( &ctx.jh, vhashB );
+//     }
+
+     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512 ( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512 ( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512 ( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512 ( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+
+     init_sd( &ctx.simd, 512 );
+     update_final_sd( &ctx.simd, (BitSequence *)hash0,
+                           (const BitSequence *)hash0, 512 );
+     init_sd( &ctx.simd, 512 );
+     update_final_sd( &ctx.simd, (BitSequence *)hash1,
+                           (const BitSequence *)hash1, 512 );
+     init_sd( &ctx.simd, 512 );
+     update_final_sd( &ctx.simd, (BitSequence *)hash2,
+                           (const BitSequence *)hash2, 512 );
+     init_sd( &ctx.simd, 512 );
+     update_final_sd( &ctx.simd, (BitSequence *)hash3,
+                           (const BitSequence *)hash3, 512 );
+
+// A is whirlpool serial, B is haval parallel.
+    
+
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                   m256_zero );
+     // A
+    
+//     if ( hash0[0] & mask )
+//     {
+        sph_whirlpool_init( &ctx.whirlpool );
+        sph_whirlpool( &ctx.whirlpool, hash0, 64 );
+        sph_whirlpool_close( &ctx.whirlpool, hash0 );
+//     }
+//     if ( hash1[0] & mask )
+//     {
+        sph_whirlpool_init( &ctx.whirlpool );
+        sph_whirlpool( &ctx.whirlpool, hash1, 64 );
+        sph_whirlpool_close( &ctx.whirlpool, hash1 );
+//     }
+//     if ( hash2[0] & mask )
+//     {
+        sph_whirlpool_init( &ctx.whirlpool );
+        sph_whirlpool( &ctx.whirlpool, hash2, 64 );
+        sph_whirlpool_close( &ctx.whirlpool, hash2 );
+//     }
+//     if ( hash3[0] & mask )
+//     {
+        sph_whirlpool_init( &ctx.whirlpool );
+        sph_whirlpool( &ctx.whirlpool, hash3, 64 );
+        sph_whirlpool_close( &ctx.whirlpool, hash3 );
+//     }
+
+     mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+// B
+
+//     if ( mm256_any_clr_256( vh_mask ) )
+//     {
+        haval256_5_4way_init( &ctx.haval );
+        haval256_5_4way( &ctx.haval, vhash, 64 );
+        haval256_5_4way_close( &ctx.haval, vhashB );
+        memset( &vhashB[8<<2], 0, 32<<2);
+//     }
+
+     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                             (const BitSequence *)hash0, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                             (const BitSequence *)hash1, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                             (const BitSequence *)hash2, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                             (const BitSequence *)hash3, 512 );
+
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     
+     blake512_4way_init( &ctx.blake );
+     blake512_4way( &ctx.blake, vhash, 64 );
+     blake512_4way_close( &ctx.blake, vhash );
+
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+// shavite & luffa, both serial, select individually.
+
+   if ( hash0[0] & mask )
+   {
+      sph_shavite512_init( &ctx.shavite );
+      sph_shavite512( &ctx.shavite, hash0, 64 ); //
+      sph_shavite512_close( &ctx.shavite, hash0 ); //8
+   }
+   else
+   {
+      init_luffa( &ctx.luffa, 512 );
+      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash0,
+                                    (const BitSequence *)hash0, 64 );
+   }
+
+   if ( hash1[0] & mask )
+   {
+      sph_shavite512_init( &ctx.shavite );
+      sph_shavite512( &ctx.shavite, hash1, 64 ); //
+      sph_shavite512_close( &ctx.shavite, hash1 ); //8
+   }
+   else
+   {
+      init_luffa( &ctx.luffa, 512 );
+      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash1,
+                                    (const BitSequence *)hash1, 64 );
+   }
+
+   if ( hash2[0] & mask )
+   {
+      sph_shavite512_init( &ctx.shavite );
+      sph_shavite512( &ctx.shavite, hash2, 64 ); //
+      sph_shavite512_close( &ctx.shavite, hash2 ); //8
+   }
+   else
+   {
+      init_luffa( &ctx.luffa, 512 );
+      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash2,
+                                    (const BitSequence *)hash2, 64 );
+   }
+
+   if ( hash3[0] & mask )
+   {
+      sph_shavite512_init( &ctx.shavite );
+      sph_shavite512( &ctx.shavite, hash3, 64 ); //
+      sph_shavite512_close( &ctx.shavite, hash3 ); //8
+   }
+   else
+   {
+      init_luffa( &ctx.luffa, 512 );
+      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash3,
+                                    (const BitSequence *)hash3, 64 );
+   }
+
+   mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+   hamsi512_4way_init( &ctx.hamsi );
+   hamsi512_4way( &ctx.hamsi, vhash, 64 );
+   hamsi512_4way_close( &ctx.hamsi, vhash );
+
+   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash0, 64 );
+   sph_fugue512_close( &ctx.fugue, hash0 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash1, 64 );
+   sph_fugue512_close( &ctx.fugue, hash1 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash2, 64 );
+   sph_fugue512_close( &ctx.fugue, hash2 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash3, 64 );
+   sph_fugue512_close( &ctx.fugue, hash3 );
+
+
+//  A echo, B sd both serial
+   
+   if ( hash0[0] & mask ) //4
+   {
+       init_echo( &ctx.echo, 512 );
+       update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                               (const BitSequence *)hash0, 512 );
+   }
+   else
+   {
+       init_sd( &ctx.simd, 512 );
+       update_final_sd( &ctx.simd, (BitSequence *)hash0,
+                             (const BitSequence *)hash0, 512 );
+   }
+
+   if ( hash1[0] & mask ) //4
+   {
+       init_echo( &ctx.echo, 512 );
+       update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                               (const BitSequence *)hash1, 512 );
+   }
+   else
+   {
+       init_sd( &ctx.simd, 512 );
+       update_final_sd( &ctx.simd, (BitSequence *)hash1,
+                             (const BitSequence *)hash1, 512 );
+   }
+
+   if ( hash2[0] & mask ) //4
+   {
+       init_echo( &ctx.echo, 512 );
+       update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                               (const BitSequence *)hash2, 512 );
+   }
+   else
+   {
+       init_sd( &ctx.simd, 512 );
+       update_final_sd( &ctx.simd, (BitSequence *)hash2,
+                             (const BitSequence *)hash2, 512 );
+   }
+
+   if ( hash3[0] & mask ) //4
+   {
+       init_echo( &ctx.echo, 512 );
+       update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                               (const BitSequence *)hash3, 512 );
+   }
+   else
+   {
+       init_sd( &ctx.simd, 512 );
+       update_final_sd( &ctx.simd, (BitSequence *)hash3,
+                             (const BitSequence *)hash3, 512 );
+   }
+
+   mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+
+   shabal512_4way_init( &ctx.shabal );
+   shabal512_4way( &ctx.shabal, vhash, 64 );
+   shabal512_4way_close( &ctx.shabal, vhash );
+
+   mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hash0, 64 );
+   sph_whirlpool_close( &ctx.whirlpool, hash0 );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hash1, 64 );
+   sph_whirlpool_close( &ctx.whirlpool, hash1 );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hash2, 64 );
+   sph_whirlpool_close( &ctx.whirlpool, hash2 );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hash3, 64 );
+   sph_whirlpool_close( &ctx.whirlpool, hash3 );
+
+// A = fugue serial, B = sha512 prarallel
+   
+   mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                 m256_zero );
+
+//   if ( hash0[0] & mask ) 
+//   {
+      sph_fugue512_init( &ctx.fugue );
+      sph_fugue512( &ctx.fugue, hash0, 64 );
+      sph_fugue512_close( &ctx.fugue, hash0 );
+//   }
+//   if ( hash1[0] & mask ) 
+//   {
+      sph_fugue512_init( &ctx.fugue );
+      sph_fugue512( &ctx.fugue, hash1, 64 );
+      sph_fugue512_close( &ctx.fugue, hash1 );
+//   }
+//   if ( hash2[0] & mask ) 
+//   {
+      sph_fugue512_init( &ctx.fugue );
+      sph_fugue512( &ctx.fugue, hash2, 64 );
+      sph_fugue512_close( &ctx.fugue, hash2 );
+//   }
+//   if ( hash3[0] & mask ) 
+//   {
+      sph_fugue512_init( &ctx.fugue );
+      sph_fugue512( &ctx.fugue, hash3, 64 );
+      sph_fugue512_close( &ctx.fugue, hash3 );
+//   }
+
+   mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+//   if ( mm256_any_clr_256( vh_mask ) )
+//   {
+      sha512_4way_init( &ctx.sha512 );
+      sha512_4way( &ctx.sha512, vhash, 64 );
+      sha512_4way_close( &ctx.sha512, vhashB );
+//   }
+
+   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+
+   mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+   sha512_4way_init( &ctx.sha512 ); 
+   sha512_4way( &ctx.sha512, vhash, 64 );
+   sha512_4way_close( &ctx.sha512, vhash ); 
+
+// A = haval parallel, B = Whirlpool serial
+
+   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                 m256_zero );
+
+   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     
+//   if ( mm256_any_set_256( vh_mask ) ) //4
+//   {
+      haval256_5_4way_init( &ctx.haval );
+      haval256_5_4way( &ctx.haval, vhash, 64 );
+      haval256_5_4way_close( &ctx.haval, vhashA );
+      memset( &vhashA[8<<2], 0, 32<<2 );
+//   }
+
+//   if ( !( hash0[0] & mask ) )
+//   {
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hash0, 64 );
+      sph_whirlpool_close( &ctx.whirlpool, hash0 );
+//   }
+//   if ( !( hash2[0] & mask ) )
+//   {
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hash1, 64 );
+      sph_whirlpool_close( &ctx.whirlpool, hash1 );
+//   }
+//   if ( !( hash2[0] & mask ) )
+//   {
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hash2, 64 );
+      sph_whirlpool_close( &ctx.whirlpool, hash2 );
+//   }
+//   if ( !( hash3[0] & mask ) )
+//   {
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hash3, 64 );
+      sph_whirlpool_close( &ctx.whirlpool, hash3 );
+//   }
+
+   mm256_intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 );
+
+   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+   bmw512_4way_init( &ctx.bmw );
+   bmw512_4way( &ctx.bmw, vhash, 64 );
+   bmw512_4way_close( &ctx.bmw, vhash );
+
+ 	memcpy(state, vhash, 32<<2 );
+}
+
+int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+//   uint32_t *hash7 = &(hash[7<<2]);
+//   uint32_t lane_hash[8];
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19] - 1;
+   const uint32_t first_nonce = pdata[19];
+   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t Htarg = ptarget[7];
+   uint64_t htmax[] = {          0,        0xF,       0xFF,
+                             0xFFF,     0xFFFF, 0x10000000  };
+   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                        0xFFFFF000, 0xFFFF0000,          0  };
+
+   mm256_bswap_intrlv80_4x64( vdata, pdata );
+   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+   {
+      uint32_t mask = masks[ m ];
+      do
+      {
+         *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+         hmq1725_4way_hash( hash, vdata );
+         for ( int i = 0; i < 4; i++ )
+         if ( ( (hash+(i<<3))[7] & mask ) == 0 )
+         {
+            if ( fulltest( (hash+(i<<3)), ptarget ) && !opt_benchmark )
+            {
+               pdata[19] = n + i;
+               submit_solution( work, (hash+(i<<3)), mythr, i );
+            }
+         }
+	      n += 4;
+      } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );	
+	   break;
+	}
+	*hashes_done = n - first_nonce + 1;
+	return 0;
+}
+
+#endif // HMQ1725_4WAY
--- a/algo/quark/hmq1725-gate.c
+++ b/algo/quark/hmq1725-gate.c
@@ -0,0 +1,17 @@
+#include "hmq1725-gate.h"
+
+bool register_hmq1725_algo( algo_gate_t* gate )
+{
+#if defined(HMQ1725_4WAY)
+  gate->scanhash  = (void*)&scanhash_hmq1725_4way;
+  gate->hash      = (void*)&hmq1725_4way_hash;
+#else
+  init_hmq1725_ctx();
+  gate->scanhash  = (void*)&scanhash_hmq1725;
+  gate->hash      = (void*)&hmq1725hash;
+#endif
+  gate->set_target       = (void*)&scrypt_set_target;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  return true;
+};
+
--- a/algo/quark/hmq1725-gate.h
+++ b/algo/quark/hmq1725-gate.h
@@ -0,0 +1,28 @@
+#ifndef HMQ1725_GATE_H__
+#define HMQ1725_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__) && defined(__AES__)
+//  #define HMQ1725_4WAY
+#endif
+
+bool register_hmq1725_algo( algo_gate_t* gate );
+
+#if defined(HMQ1725_4WAY)
+
+void hmq1725_4way_hash( void *state, const void *input );
+int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );
+
+#else
+
+void hmq1725hash( void *state, const void *input );
+int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );
+void init_hmq1725_ctx();
+
+#endif
+
+#endif  // HMQ1725_GATE_H__
--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "hmq1725-gate.h"
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/sph_blake.h"
@@ -298,10 +298,11 @@ extern void hmq1725hash(void *state, const void *input)
 	memcpy(state, hashA, 32);
 }

-int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
+int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[32] __attribute__((aligned(64)));
+//        uint32_t endiandata[32] __attribute__((aligned(64)));
+        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(64)));
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -311,7 +312,8 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
 	//const uint32_t Htarg = ptarget[7];

 	//we need bigendian data...
-        for (int k = 0; k < 32; k++)
+//        for (int k = 0; k < 32; k++)
+        for (int k = 0; k < 20; k++)
                be32enc(&endiandata[k], pdata[k]);

        hmq_bmw512_midstate( endiandata );
@@ -407,14 +409,14 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
-
+/*
 bool register_hmq1725_algo( algo_gate_t* gate )
 {
  init_hmq1725_ctx();
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->set_target       = (void*)&scrypt_set_target;
  gate->scanhash         = (void*)&scanhash_hmq1725;
  gate->hash             = (void*)&hmq1725hash;
  return true;
 };
-
+*/
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -48,9 +48,10 @@ void quark_4way_hash( void *state, const void *input )
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
-    __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
-    int i;
    quark_4way_ctx_holder ctx;
+    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
+    const uint32_t mask = 8;
+
    memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );

    blake512_4way( &ctx.blake, input, 80 );
@@ -62,27 +63,44 @@ void quark_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );

-       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+    if ( hash0[0] & mask )
+    {
       update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                               (char*)hash0, 512 );
+    }
+    if ( hash1[0] & mask )
+    {
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                               (char*)hash1, 512 );
+    }
+    if ( hash2[0] & mask )
+    {   
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                               (char*)hash2, 512 );
+    }
+    if ( hash3[0] & mask )
+    {   
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                               (char*)hash3, 512 );
-       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+    }

+    mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+    if ( mm256_anybits0( vh_mask ) )   
+    {
       skein512_4way( &ctx.skein, vhash, 64 );
       skein512_4way_close( &ctx.skein, vhashB );
+    }

-    for ( i = 0; i < 8; i++ )
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
    reinit_groestl( &ctx.groestl );
@@ -91,7 +109,8 @@ void quark_4way_hash( void *state, const void *input )
    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-    mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+    mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

    jh512_4way( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );
@@ -99,16 +118,21 @@ void quark_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );

+    if ( mm256_anybits1( vh_mask ) )
+    {
       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
       blake512_4way_close( &ctx.blake, vhashA );
+    }

+    if ( mm256_anybits0( vh_mask ) )
+    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way( &ctx.bmw, vhash, 64 );
       bmw512_4way_close( &ctx.bmw, vhashB );
+    }

-    for ( i = 0; i < 8; i++ )
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

    keccak512_4way( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );
@@ -120,63 +144,65 @@ void quark_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );

+    if ( mm256_anybits1( vh_mask ) )
+    {
       keccak512_4way_init( &ctx.keccak );
       keccak512_4way( &ctx.keccak, vhash, 64 );
       keccak512_4way_close( &ctx.keccak, vhashA );
+    }

+    if ( mm256_anybits0( vh_mask ) )
+    {
       jh512_4way_init( &ctx.jh );
       jh512_4way( &ctx.jh, vhash, 64 );
       jh512_4way_close( &ctx.jh, vhashB );
+    }

-    for ( i = 0; i < 8; i++ )
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
-
-    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
+    // Final blend, directly to state, only need 32 bytes.
+    casti_m256i( state, 0 ) = _mm256_blendv_epi8( vhA[0], vhB[0], vh_mask );
+    casti_m256i( state, 1 ) = _mm256_blendv_epi8( vhA[1], vhB[1], vh_mask );
+    casti_m256i( state, 2 ) = _mm256_blendv_epi8( vhA[2], vhB[2], vh_mask );
+    casti_m256i( state, 3 ) = _mm256_blendv_epi8( vhA[3], vhB[3], vh_mask );
 }

 int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-    uint32_t endiandata[20] __attribute__((aligned(64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+    uint32_t *hash7 = &(hash[25]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
-    uint32_t *nonces = work->nonces;
-    int num_found = 0;
-    uint32_t *noncep = vdata + 73;   // 9*8 + 1
-
-    swab32_array( endiandata, pdata, 20 );
-
-    uint64_t *edata = (uint64_t*)endiandata;
-    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

+    mm256_bswap_intrlv80_4x64( vdata, pdata );
    do
    {
-       be32enc( noncep,   n   );
-       be32enc( noncep+2, n+1 );
-       be32enc( noncep+4, n+2 );
-       be32enc( noncep+6, n+3 );
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

       quark_4way_hash( hash, vdata );
       pdata[19] = n;

       for ( int i = 0; i < 4; i++ )
-       if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
-            && fulltest( hash+(i<<3), ptarget ) )
+       if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
       {
-          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark  )
+          {
+            pdata[19] = n+i;
+            submit_solution( work, lane_hash, mythr, i );
+          }
       }
       n += 4;
-    } while ( ( num_found == 0 ) && ( n < max_nonce )
-              && !work_restart[thr_id].restart );
+    } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );

    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }

 #endif
--- a/algo/quark/quark-gate.h
+++ b/algo/quark/quark-gate.h
@@ -13,19 +13,15 @@ bool register_quark_algo( algo_gate_t* gate );
 #if defined(QUARK_4WAY)

 void quark_4way_hash( void *state, const void *input );
-
 int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
-
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_quark_4way_ctx();

 #endif

 void quark_hash( void *state, const void *input );
-
 int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
-
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_quark_ctx();

 #endif
--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -173,16 +173,17 @@ void quark_hash(void *state, const void *input)
 }

 int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done)
+                    uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[20] __attribute__((aligned(64)));
-        uint32_t hash64[8] __attribute__((aligned(32)));
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

-        swab32_array( endiandata, pdata, 20 );
+   swab32_array( endiandata, pdata, 20 );

 	do {
 		pdata[19] = ++n;
--- a/algo/qubit/deep-2way.c
+++ b/algo/qubit/deep-2way.c
@@ -64,7 +64,7 @@ void deep_2way_hash( void *output, const void *input )
 }

 int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
-                         uint64_t *hashes_done )
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -73,17 +73,17 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     uint32_t *nonces = work->nonces;
-     int num_found = 0;
     uint32_t *noncep = vdata + 32+3;   // 4*8 + 3
+     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };

-     // big endian encode 0..18 uint32_t, 64 bits at a time
-     swab32_array( endiandata, pdata, 20 );
+     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+     casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );

     uint64_t *edata = (uint64_t*)endiandata;
     mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -102,23 +102,24 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
            deep_2way_hash( hash, vdata );
            pdata[19] = n;

-            if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
+            if ( !( hash[7] & mask ) )
+            if ( fulltest( hash, ptarget) && !opt_benchmark )
            {
-               nonces[ num_found++ ] = n;
-               work_set_target_ratio( work, hash );
+                pdata[19] = n;
+                submit_solution( work, hash, mythr, 0 );
            }
-            if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
+            if ( !( (hash+8)[7] & mask ) )
+            if ( fulltest( hash+8, ptarget) && !opt_benchmark )
            {
-               nonces[ num_found++ ] = n+1;
-               work_set_target_ratio( work, hash+8 );
+               pdata[19] = n+1;
+               submit_solution( work, hash+8, mythr, 1 );
            }
            n += 2;
-         } while ( ( num_found == 0 ) && ( n < max_nonce )
-                   && !work_restart[thr_id].restart );
+         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
         break;
     }
     *hashes_done = n - first_nonce + 1;
-     return num_found;
+     return 0;
 }

 #endif
--- a/algo/qubit/deep-gate.h
+++ b/algo/qubit/deep-gate.h
@@ -13,19 +13,15 @@ bool register_deep_algo( algo_gate_t* gate );
 #if defined(DEEP_2WAY)

 void deep_2way_hash( void *state, const void *input );
-
 int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
-
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_deep_2way_ctx();

 #endif

 void deep_hash( void *state, const void *input );
-
 int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
-
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_deep_ctx();

 #endif
--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -72,14 +72,15 @@ void deep_hash(void *output, const void *input)
 }

 int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done)
+                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[20] __attribute__((aligned(64)));
-        uint32_t hash64[8] __attribute__((aligned(32)));
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];

 	uint64_t htmax[] = { 0, 0xF, 0xFF,  0xFFF, 0xFFFF, 0x10000000 };
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -70,7 +70,7 @@ void qubit_2way_hash( void *output, const void *input )
 }

 int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
-                         uint64_t *hashes_done )
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -79,17 +79,17 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     uint32_t *nonces = work->nonces;
-     int num_found = 0;
     uint32_t *noncep = vdata + 32+3;   // 4*8 + 3
+     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };

-     // big endian encode 0..18 uint32_t, 64 bits at a time
-     swab32_array( endiandata, pdata, 20 );
+     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+     casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );

     uint64_t *edata = (uint64_t*)endiandata;
     mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -107,25 +107,24 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
            qubit_2way_hash( hash, vdata );
            pdata[19] = n;

-
-            if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
+            if ( !( hash[7] & mask ) )
+            if ( fulltest( hash, ptarget) && !opt_benchmark )
            {
-               nonces[ num_found++ ] = n;
-               work_set_target_ratio( work, hash );
+                pdata[19] = n;
+                submit_solution( work, hash, mythr, 0 );
            }
-            if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
+            if ( !( (hash+8)[7] & mask ) )
+            if ( fulltest( hash+8, ptarget) && !opt_benchmark )
            {
               pdata[19] = n+1;
-               nonces[ num_found++ ] = n+1;
-               work_set_target_ratio( work, hash+8 );
+               submit_solution( work, hash+8, mythr, 1 );
            }
            n += 2;
-         } while ( ( num_found == 0 ) && ( n < max_nonce )
-                   && !work_restart[thr_id].restart );
+         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
         break;
     }
     *hashes_done = n - first_nonce + 1;
-     return num_found;
+     return 0;
 }

 #endif
--- a/algo/qubit/qubit-gate.h
+++ b/algo/qubit/qubit-gate.h
@@ -13,19 +13,15 @@ bool register_qubit_algo( algo_gate_t* gate );
 #if defined(QUBIT_2WAY)

 void qubit_2way_hash( void *state, const void *input );
-
 int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
-
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_qubit_2way_ctx();

 #endif

 void qubit_hash( void *state, const void *input );
-
 int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
-
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_qubit_ctx();

 #endif
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -83,15 +83,16 @@ void qubit_hash(void *output, const void *input)
        memcpy(output, hash, 32);
 }

-int scanhash_qubit(int thr_id, struct work *work,
-		uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_qubit( int thr_id, struct work *work,	uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[20] __attribute__((aligned(64)));
-        uint32_t hash64[8] __attribute__((aligned(64)));
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t hash64[8] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];

 	uint64_t htmax[] = { 0, 0xF, 0xFF,  0xFFF, 0xFFFF, 0x10000000 };
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -40,9 +40,9 @@ void lbry_8way_hash( void* output, const void* input )
   sha256_8way_close( &ctx_sha256, vhashA );

   // reinterleave to do sha512 4-way 64 bit twice.
-   mm256_deinterleave_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
-   mm256_interleave_4x64( vhashA, h0, h1, h2, h3, 256 );
-   mm256_interleave_4x64( vhashB, h4, h5, h6, h7, 256 );
+   mm256_dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
+   mm256_intrlv_4x64( vhashA, h0, h1, h2, h3, 256 );
+   mm256_intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );

   sha512_4way_init( &ctx_sha512 );
   sha512_4way( &ctx_sha512, vhashA, 32 );
@@ -53,9 +53,9 @@ void lbry_8way_hash( void* output, const void* input )
   sha512_4way_close( &ctx_sha512, vhashB );

   // back to 8-way 32 bit
-   mm256_deinterleave_4x64( h0, h1, h2, h3, vhashA, 512 );
-   mm256_deinterleave_4x64( h4, h5, h6, h7, vhashB, 512 );
-   mm256_interleave_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
+   mm256_dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 );
+   mm256_dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 );
+   mm256_intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );

   ripemd160_8way_init( &ctx_ripemd );
   ripemd160_8way( &ctx_ripemd, vhashA, 32 );
@@ -72,27 +72,24 @@ void lbry_8way_hash( void* output, const void* input )

   sha256_8way_init( &ctx_sha256 );
   sha256_8way( &ctx_sha256, vhashA, 32 );
-   sha256_8way_close( &ctx_sha256, vhashA );
-
-   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
-                            output+128, output+160, output+192, output+224,
-                            vhashA, 256 );
+   sha256_8way_close( &ctx_sha256, output );
 }

 int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done)
+                        uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[32*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[27];
   const uint32_t first_nonce = pdata[27];
   const uint32_t Htarg = ptarget[7];
   uint32_t edata[32] __attribute__ ((aligned (64)));
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 216; // 27*8
+   __m256i  *noncev = (__m256i*)vdata + 27;   // aligned
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000 };
@@ -100,9 +97,12 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
                        0xFFFFF000, 0xFFFF0000,          0 };

   // we need bigendian data...
-   swab32_array( edata, pdata, 32 );
-   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
-                                 edata, edata, edata, edata, 1024 );
+   casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+   casti_m256i( edata, 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) );
+   casti_m256i( edata, 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) );
+   mm256_intrlv_8x32( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 1024 );
   sha256_8way_init( &sha256_8w_mid );
   sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );

@@ -111,136 +111,26 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
      uint32_t mask = masks[m];
      do
      {
-         be32enc( noncep,   n   );
-         be32enc( noncep+1, n+1 );
-         be32enc( noncep+2, n+2 );
-         be32enc( noncep+3, n+3 );
-         be32enc( noncep+4, n+4 );
-         be32enc( noncep+5, n+5 );
-         be32enc( noncep+6, n+6 );
-         be32enc( noncep+7, n+7 );
-
+        *noncev = mm256_bswap_32( _mm256_set_epi32(
+                                          n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
         lbry_8way_hash( hash, vdata );

-         for ( int i = 0; i < 8; i++ )
-         if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
+         for ( int i = 0; i < 8; i++ )  if ( !( hash7[ i ] & mask ) )
         {
-            pdata[27] = n+i;
-            nonces[ num_found++ ] = n+i;
-            work_set_target_ratio( work, hash+(i<<3) );
+            // deinterleave hash for lane
+            mm256_extract_lane_8x32( lane_hash, hash, i, 256 );
+            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+            {
+              pdata[27] = n + i;
+              submit_solution( work, lane_hash, mythr, i );
+            }
         }
-         n+=8;
-      } while ( ( num_found == 0 ) && ( n < max_nonce )
-                   && !work_restart[thr_id].restart );
+         n += 8;
+      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
      break;
   }
-
-   *hashes_done = n - first_nonce;
-   return num_found;
-}
-
-#elif defined(LBRY_4WAY)
-
-static __thread sha256_4way_context sha256_mid;
-
-void lbry_4way_hash( void* output, const void* input )
-{
-   sha256_4way_context     ctx_sha256 __attribute__ ((aligned (64)));
-   sha512_4way_context     ctx_sha512;
-   ripemd160_4way_context  ctx_ripemd;
-   uint32_t _ALIGN(64) vhashA[16<<2];
-   uint32_t _ALIGN(64) vhashB[16<<2];
-   uint32_t _ALIGN(64) vhashC[16<<2];
-
-   memcpy( &ctx_sha256, &sha256_mid, sizeof(ctx_sha256) );
-   sha256_4way( &ctx_sha256, input + (LBRY_MIDSTATE<<2), LBRY_TAIL );
-   sha256_4way_close( &ctx_sha256, vhashA );
-
-   sha256_4way_init( &ctx_sha256 );
-   sha256_4way( &ctx_sha256, vhashA, 32 );
-   sha256_4way_close( &ctx_sha256, vhashA );
-
-   // sha512 64 bit data, 64 byte output
-   mm256_reinterleave_4x64( vhashB, vhashA, 256 );
-   sha512_4way_init( &ctx_sha512 );
-   sha512_4way( &ctx_sha512, vhashB, 32 );
-   sha512_4way_close( &ctx_sha512, vhashB );
-   mm256_reinterleave_4x32( vhashA, vhashB, 512 );
-
-   ripemd160_4way_init( &ctx_ripemd );
-   ripemd160_4way( &ctx_ripemd, vhashA, 32 );
-   ripemd160_4way_close( &ctx_ripemd, vhashB );
-
-   ripemd160_4way_init( &ctx_ripemd );
-   ripemd160_4way( &ctx_ripemd, vhashA+(8<<2), 32 );
-   ripemd160_4way_close( &ctx_ripemd, vhashC );
-
-   sha256_4way_init( &ctx_sha256 );
-   sha256_4way( &ctx_sha256, vhashB, 20 );
-   sha256_4way( &ctx_sha256, vhashC, 20 );
-   sha256_4way_close( &ctx_sha256, vhashA );
-
-   sha256_4way_init( &ctx_sha256 );
-   sha256_4way( &ctx_sha256, vhashA, 32 );
-   sha256_4way_close( &ctx_sha256, vhashA );
-
-   mm128_deinterleave_4x32( output, output+32, output+64, output+96,
-		            vhashA, 256 );
-}
-
-int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done)
-{
-   uint32_t hash[4*8] __attribute__ ((aligned (64)));
-   uint32_t vdata[32*4] __attribute__ ((aligned (64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   uint32_t n = pdata[27];
-   const uint32_t first_nonce = pdata[27];
-   const uint32_t Htarg = ptarget[7];
-   uint32_t edata[32] __attribute__ ((aligned (64)));
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 108; // 27*4
-
-   uint64_t htmax[] = {          0,        0xF,       0xFF,
-                             0xFFF,     0xFFFF, 0x10000000 };
-   uint32_t masks[] = {	0xFFFFFFFF, 0xFFFFFFF0,	0xFFFFFF00,
-                        0xFFFFF000, 0xFFFF0000,          0 };
-
-   // we need bigendian data...
-   swab32_array( edata, pdata, 32 );
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
-   sha256_4way_init( &sha256_mid );
-   sha256_4way( &sha256_mid, vdata, LBRY_MIDSTATE );
-
-   for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] )
-   {
-      uint32_t mask = masks[m];
-      do
-      {
-         be32enc( noncep,   n   );
-         be32enc( noncep+1, n+1 );
-         be32enc( noncep+2, n+2 );
-         be32enc( noncep+3, n+3 );
-
-         lbry_4way_hash( hash, vdata );
-
-         for ( int i = 0; i < 4; i++ )
-         if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
-         {
-            pdata[27] = n+i;
-            nonces[ num_found++ ] = n+i;
-            work_set_target_ratio( work, hash+(i<<3) );
-         }
-         n+=4;
-      } while ( ( num_found == 0 ) && ( n < max_nonce )
-                   && !work_restart[thr_id].restart );
-      break;
-   }
-
-   *hashes_done = n - first_nonce;
-   return num_found;
+   *hashes_done = n - first_nonce + 1;
+   return 0;
 }

 #endif
--- a/algo/ripemd/lbry-gate.h
+++ b/algo/ripemd/lbry-gate.h
@@ -4,12 +4,10 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-// Overide multi way on ryzen, SHA is better.
-#if !defined(RYZEN_)
-// need sha512 2 way AVX x2 or 1 way scalar x4 to support 4way AVX.
-#if defined(__AVX2__)
+#if !defined(__SHA__)
+ #if defined(__AVX2__)
  #define LBRY_8WAY
-#endif
+ #endif
 #endif

 #define LBRY_NTIME_INDEX 25
@@ -24,17 +22,18 @@ bool register_lbry_algo( algo_gate_t* gate );

 void lbry_8way_hash( void *state, const void *input );
 int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
-
+                         uint64_t *hashes_done, struct thr_info *mythr );
+/*
 #elif defined(LBRY_4WAY)

 void lbry_4way_hash( void *state, const void *input );
 int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done );
+*/
 #else

 void lbry_hash( void *state, const void *input );
 int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 #endif
 #endif
--- a/algo/ripemd/lbry.c
+++ b/algo/ripemd/lbry.c
@@ -48,13 +48,14 @@ void lbry_hash(void* output, const void* input)
 }

 int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done)
+                   uint64_t *hashes_done, struct thr_info *mythr)
 {
  uint32_t *pdata = work->data;
  uint32_t *ptarget = work->target;
 	uint32_t n = pdata[27] - 1;
 	const uint32_t first_nonce = pdata[27];
 	const uint32_t Htarg = ptarget[7];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

 	uint32_t hash64[8] __attribute__((aligned(64)));
 	uint32_t endiandata[32] __attribute__ ((aligned (64)));
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -696,7 +696,7 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
 #endif /* HAVE_SCRYPT_6WAY */

 extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
-                            uint64_t *hashes_done )
+                            uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -704,6 +704,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
 	uint32_t midstate[8];
 	uint32_t n = pdata[19] - 1;
 	const uint32_t Htarg = ptarget[7];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	int throughput = scrypt_best_throughput();
 	int i;
 	
--- a/algo/sha/sha256t-gate.h
+++ b/algo/sha/sha256t-gate.h
@@ -5,13 +5,12 @@
 #include "algo-gate-api.h"

 // Override multi way on ryzen, SHA is better.
-#if !defined(RYZEN_)
-#if defined(__SSE2__)
-  #define SHA256T_4WAY
-#endif
-#if defined(__AVX2__)
+#if !defined(__SHA__)
+ #if defined(__AVX2__)
  #define SHA256T_8WAY
-#endif
+#elif defined(__SSE2__)
+  #define SHA256T_4WAY
+ #endif
 #endif

 bool register_sha256t_algo( algo_gate_t* gate );
--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -6,6 +6,12 @@

 #if defined (__AVX2__)

+union _m256_v16 {
+  uint16_t u16[16];
+  __m256i v256;
+};
+typedef union _m256_v16 m256_v16;
+
 // imported from simd_iv.h

 uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -2,7 +2,11 @@
 #include <string.h>
 #include <stdint.h>
 #include "skein-hash-4way.h"
-#include "algo/sha/sha2-hash-4way.h"
+#if defined(__SHA__)
+  #include <openssl/sha.h>
+#else
+  #include "algo/sha/sha2-hash-4way.h"
+#endif

 #if defined (SKEIN_4WAY)

@@ -11,53 +15,69 @@ void skeinhash_4way( void *state, const void *input )
     uint64_t vhash64[8*4] __attribute__ ((aligned (64)));
     uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
     skein512_4way_context ctx_skein;
+#if defined(__SHA__)
+     uint32_t hash0[16] __attribute__ ((aligned (64)));
+     uint32_t hash1[16] __attribute__ ((aligned (64)));
+     uint32_t hash2[16] __attribute__ ((aligned (64)));
+     uint32_t hash3[16] __attribute__ ((aligned (64)));
+     SHA256_CTX           ctx_sha256;
+#else
     sha256_4way_context ctx_sha256;
+#endif

     skein512_4way_init( &ctx_skein );
     skein512_4way( &ctx_skein, input, 80 );
     skein512_4way_close( &ctx_skein, vhash64 );

+#if defined(__SHA__)      
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
+     SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
+     SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
+     SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
+
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
+     SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
+
+     mm128_intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
+#else
     mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );

     sha256_4way_init( &ctx_sha256 );
     sha256_4way( &ctx_sha256, vhash32, 64 );
     sha256_4way_close( &ctx_sha256, state );
-
-     mm128_dintrlv_4x32( state, state+32, state+64, state+96,
-		              vhash32, 256 );
+#endif
 }

 int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done )
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t hash[8*4] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8];
    uint32_t *hash7 = &(hash[7<<2]);
-    uint32_t edata[20] __attribute__ ((aligned (64)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
-    // hash is returned deinterleaved
-    uint32_t *nonces = work->nonces;
-    int num_found = 0;
-
-// data is 80 bytes, 20 u32 or 4 u64.
-	
-    swab32_array( edata, pdata, 20 );
- 
-    mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
-
-    uint32_t *noncep = vdata + 73;   // 9*8 + 1
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

+   mm256_bswap_intrlv80_4x64( vdata, pdata );
   do
   {
-       be32enc( noncep,   n   );
-       be32enc( noncep+2, n+1 );
-       be32enc( noncep+4, n+2 );
-       be32enc( noncep+6, n+3 );
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

       skeinhash_4way( hash, vdata );

@@ -68,16 +88,14 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
          if ( fulltest( lane_hash, ptarget ) )
          {
             pdata[19] = n + lane;
-             nonces[ num_found++ ] = n + lane;
-             work_set_target_ratio( work, lane_hash );
+             submit_solution( work, lane_hash, mythr, lane );
          }
       }
       n += 4;
-    } while ( (num_found == 0) && (n < max_nonce)
-               && !work_restart[thr_id].restart );
+    } while ( (n < max_nonce) && !work_restart[thr_id].restart );

    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }

 #endif
--- a/algo/skein/skein-gate.h
+++ b/algo/skein/skein-gate.h
@@ -3,24 +3,21 @@
 #include <stdint.h>
 #include "algo-gate-api.h"

-// Override multi way on ryzen, SHA is better.
-#if !defined(RYZEN_)
 #if defined(__AVX2__)
  #define SKEIN_4WAY
 #endif
-#endif

 #if defined(SKEIN_4WAY)

 void skeinhash_4way( void *output, const void *input );

 int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 void skeinhash( void *output, const void *input );

 int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );

 #endif
--- a/algo/skein/skein.c
+++ b/algo/skein/skein.c
@@ -21,8 +21,8 @@ void skeinhash(void *state, const void *input)
     memcpy(state, hash, 32);
 }

-int scanhash_skein(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -31,7 +31,8 @@ int scanhash_skein(int thr_id, struct work *work,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t n = first_nonce;
-	
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+
        swab32_array( endiandata, pdata, 20 );

 	do {
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -20,55 +20,43 @@ void skein2hash_4way( void *output, const void *input )
 }

 int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done )
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[8*4] __attribute__ ((aligned (64)));
    uint32_t *hash7 = &(hash[25]);
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-    uint32_t endiandata[20] __attribute__ ((aligned (64)));
-    uint64_t *edata = (uint64_t*)endiandata;
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
-    uint32_t *nonces = work->nonces;
-    int num_found = 0;
-
-    swab32_array( endiandata, pdata, 20 );
-
-    mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
-
-    uint32_t *noncep = vdata + 73;   // 9*8 + 1
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

+    mm256_bswap_intrlv80_4x64( vdata, pdata );
    do 
    {
-       be32enc( noncep,   n   );
-       be32enc( noncep+2, n+1 );
-       be32enc( noncep+4, n+2 );
-       be32enc( noncep+6, n+3 );
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

-       skein2hash( hash, vdata );
+       skein2hash_4way( hash, vdata );

       for ( int lane = 0; lane < 4; lane++ )
-       if ( hash7[ lane ] <= Htarg )
+       if ( hash7[ lane<<1 ] <= Htarg )
       {
-          // deinterleave hash for lane
          uint32_t lane_hash[8];
          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) )
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          {
             pdata[19] = n + lane;
-             nonces[ num_found++ ] = n + lane;
-             work_set_target_ratio( work, lane_hash );
+             submit_solution( work, lane_hash, mythr, lane );
          }
       }
       n += 4;
-    } while ( (num_found == 0) && (n < max_nonce)
-             &&  !work_restart[thr_id].restart );
+    } while ( (n < max_nonce) && !work_restart[thr_id].restart );

    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }

 #endif
--- a/algo/skein/skein2-gate.c
+++ b/algo/skein/skein2-gate.c
@@ -10,10 +10,9 @@ int64_t skein2_get_max64 ()
 bool register_skein2_algo( algo_gate_t* gate )
 {
  gate->optimizations = AVX2_OPT;
-#if defined (FOUR_WAY) && defined (__AVX2__)
+#if defined (SKEIN2_4WAY)
  gate->scanhash  = (void*)&scanhash_skein2_4way;
  gate->hash      = (void*)&skein2hash_4way;
-  four_way_not_tested();
 #else
  gate->scanhash  = (void*)&scanhash_skein2;
  gate->hash      = (void*)&skein2hash;
--- a/algo/skein/skein2-gate.h
+++ b/algo/skein/skein2-gate.h
@@ -10,11 +10,11 @@
 #if defined(SKEIN2_4WAY)
 void skein2hash_4way( void *output, const void *input );
 int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t* hashes_done );
+                          uint64_t* hashes_done, struct thr_info *mythr );
 #endif

 void skein2hash( void *output, const void *input );
 int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
+                     uint64_t *hashes_done, struct thr_info *mythr );
 #endif

--- a/algo/skein/skein2.c
+++ b/algo/skein/skein2.c
@@ -34,8 +34,8 @@ void skein2hash(void *output, const void *input)

 }

-int scanhash_skein2(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_skein2( int thr_id, struct work *work,	uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -44,6 +44,7 @@ int scanhash_skein2(int thr_id, struct work *work,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t n = first_nonce;
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

        swab32_array( endiandata, pdata, 20 );

--- a/algo/x14/axiom.c
+++ b/algo/x14/axiom.c
@@ -44,12 +44,13 @@ void axiomhash(void *output, const void *input)
 }

 int scanhash_axiom(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
 	uint32_t _ALIGN(64) hash64[8];
 	uint32_t _ALIGN(64) endiandata[20];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/x14/polytimos-4way.c
+++ b/algo/x14/polytimos-4way.c
@@ -13,26 +13,16 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/echo/aes_ni/hash_api.h"

-typedef struct {
+union _poly_4way_context_overlay
+{
   skein512_4way_context   skein;
   shabal512_4way_context  shabal;
   hashState_echo          echo;
   luffa_2way_context      luffa;
   sph_fugue512_context    fugue;
   sph_gost512_context     gost;
-} poly_4way_ctx_holder;
-
-poly_4way_ctx_holder poly_4way_ctx;
-
-void init_polytimos_4way_ctx()
-{
-    skein512_4way_init( &poly_4way_ctx.skein );
-    shabal512_4way_init( &poly_4way_ctx.shabal );
-    init_echo( &poly_4way_ctx.echo, 512  );
-    luffa_2way_init( &poly_4way_ctx.luffa, 512 );
-    sph_fugue512_init( &poly_4way_ctx.fugue );
-    sph_gost512_init( &poly_4way_ctx.gost );
-}
+};
+typedef union _poly_4way_context_overlay poly_4way_context_overlay;

 void polytimos_4way_hash( void *output, const void *input )
 {
@@ -41,51 +31,57 @@ void polytimos_4way_hash( void *output, const void *input )
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-     poly_4way_ctx_holder ctx __attribute__ ((aligned (64)));
-     memcpy( &ctx, &poly_4way_ctx, sizeof(poly_4way_ctx) );
+     poly_4way_context_overlay ctx;

+     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, input, 80 );
     skein512_4way_close( &ctx.skein, vhash );

     // Need to convert from 64 bit interleaved to 32 bit interleaved.
     uint32_t vhash32[16*4];
     mm256_rintrlv_4x64_4x32( vhash32, vhash, 512 );
+     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash32, 64 );
     shabal512_4way_close( &ctx.shabal, vhash32 );
     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );

+     init_echo( &ctx.echo, 512 );
     update_final_echo ( &ctx.echo, (BitSequence *)hash0,
                         (const BitSequence *)hash0, 512 );
-     memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
-     memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
-     memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
+     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
+     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );

+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
-     memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash1, 64 );
     sph_fugue512_close( &ctx.fugue, hash1 );
-     memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash2, 64 );
     sph_fugue512_close( &ctx.fugue, hash2 );
-     memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );

+     sph_gost512_init( &ctx.gost );
     sph_gost512( &ctx.gost, hash0, 64 );
     sph_gost512_close( &ctx.gost, hash0 );
     sph_gost512_init( &ctx.gost );
@@ -104,51 +100,43 @@ void polytimos_4way_hash( void *output, const void *input )
     memcpy( output+96, hash3, 32 );
 }

-int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,                              uint64_t *hashes_done )
+int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 73;   // 9*8 + 1
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   const uint32_t Htarg = ptarget[7];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   volatile uint8_t *restart = &(work_restart[thr_id].restart);

   if ( opt_benchmark )
      ptarget[7] = 0x0cff;

-   for ( int i=0; i < 19; i++ )
-      be32enc( &endiandata[i], pdata[i] );
-
-   uint64_t *edata = (uint64_t*)endiandata;
-   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_bswap_intrlv80_4x64( vdata, pdata );
   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+2, n+1 );
-      be32enc( noncep+4, n+2 );
-      be32enc( noncep+6, n+3 );
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

      polytimos_4way_hash(hash, vdata);
      pdata[19] = n;

-      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
+      if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
         pdata[19] = n+i;
-         nonces[ num_found++ ] = n+i;
-         work_set_target_ratio( work, hash+(i<<3) );
+         submit_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;

-   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart));
+   } while ( ( n < max_nonce-4 ) && !(*restart));

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/x14/polytimos-gate.c
+++ b/algo/x14/polytimos-gate.c
@@ -4,7 +4,6 @@ bool register_polytimos_algo( algo_gate_t* gate )
 {
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
 #ifdef POLYTIMOS_4WAY
-  init_polytimos_4way_ctx();
  gate->scanhash  = (void*)&scanhash_polytimos_4way;
  gate->hash      = (void*)&polytimos_4way_hash;
 #else
--- a/algo/x14/polytimos-gate.h
+++ b/algo/x14/polytimos-gate.h
@@ -13,19 +13,14 @@ bool register_polytimos_algo( algo_gate_t* gate );
 #if defined(POLYTIMOS_4WAY)

 void polytimos_4way_hash( void *state, const void *input );
-
 int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
-
-void init_polytimos_4way_ctx();
+                       uint64_t *hashes_done, struct thr_info *mythr );

 #endif

 void polytimos_hash( void *state, const void *input );
-
 int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
-
+                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_polytimos_ctx();

 #endif
--- a/algo/x14/polytimos.c
+++ b/algo/x14/polytimos.c
@@ -76,13 +76,14 @@ void polytimos_hash(void *output, const void *input)
 	memcpy(output, hashA, 32);
 }

-int scanhash_polytimos(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(128) hash[8];
 	uint32_t _ALIGN(128) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
-
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
--- a/algo/x14/veltor-4way.c
+++ b/algo/x14/veltor-4way.c
@@ -40,7 +40,7 @@ void veltor_4way_hash( void *output, const void *input )

     skein512_4way( &ctx.skein, input, 80 );
     skein512_4way_close( &ctx.skein, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
@@ -54,10 +54,10 @@ void veltor_4way_hash( void *output, const void *input )
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );

-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

     sph_gost512( &ctx.gost, hash0, 64 );
     sph_gost512_close( &ctx.gost, hash0 );
@@ -78,7 +78,7 @@ void veltor_4way_hash( void *output, const void *input )
 }

 int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done )
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -91,6 +91,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t *nonces = work->nonces;
     int num_found = 0;
     uint32_t *noncep = vdata + 73;   // 9*8 + 1
+     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     volatile uint8_t *restart = &(work_restart[thr_id].restart);

     if ( opt_benchmark )
@@ -101,7 +102,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
     }

     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
     do
     {
         be32enc( noncep,   n   );
--- a/algo/x14/veltor-gate.h
+++ b/algo/x14/veltor-gate.h
@@ -15,7 +15,7 @@ bool register_veltor_algo( algo_gate_t* gate );
 void veltor_4way_hash( void *state, const void *input );

 int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+                       uint64_t *hashes_done, struct thr_info *mythr );

 void init_veltor_4way_ctx();

@@ -24,7 +24,7 @@ void init_veltor_4way_ctx();
 void veltor_hash( void *state, const void *input );

 int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
+                  uint64_t *hashes_done, struct thr_info *mythr );

 void init_veltor_ctx();

--- a/algo/x14/veltor.c
+++ b/algo/x14/veltor.c
@@ -61,12 +61,14 @@ void veltor_hash(void *output, const void *input)
 	memcpy(output, hashB, 32);
 }

-int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(128) hash[8];
 	uint32_t _ALIGN(128) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -78,7 +78,7 @@ void x14_4way_hash( void *state, const void *input )
     bmw512_4way_close( &ctx.bmw, vhash );

     // Serial
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     // 3 Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -90,7 +90,7 @@ void x14_4way_hash( void *state, const void *input )
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

     // Parallel 4way
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

     // 4 Skein
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -105,16 +105,16 @@ void x14_4way_hash( void *state, const void *input )
     keccak512_4way_close( &ctx.keccak, vhash );

     // Serial
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     // 7 Luffa
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );

     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -142,13 +142,13 @@ void x14_4way_hash( void *state, const void *input )
     sph_shavite512_close( &ctx.shavite, hash3 );

     // 10 Simd
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );

     // 11 Echo
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -164,10 +164,10 @@ void x14_4way_hash( void *state, const void *input )
                       (const BitSequence *) hash3, 512 );

     // 12 Hamsi parallel 4way 32 bit
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     // 13 Fugue serial
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -183,14 +183,14 @@ void x14_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );

     // 14 Shabal, parallel 32 bit
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, state );

 }

 int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done )
+                       uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -203,6 +203,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
     int num_found = 0;
     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
+     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
@@ -212,7 +213,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
     swab32_array( endiandata, pdata, 20 );

     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
--- a/algo/x14/x14-gate.h
+++ b/algo/x14/x14-gate.h
@@ -13,19 +13,15 @@ bool register_x14_algo( algo_gate_t* gate );
 #if defined(X14_4WAY)

 void x14_4way_hash( void *state, const void *input );
-
 int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
-
+                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x14_4way_ctx();

 #endif

 void x14hash( void *state, const void *input );
-
 int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
-
+                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x14_ctx();

 #endif
--- a/algo/x14/x14.c
+++ b/algo/x14/x14.c
@@ -180,8 +180,8 @@ void x14hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-int scanhash_x14(int thr_id, struct work *work,
-				uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(64)));
@@ -190,6 +190,7 @@ int scanhash_x14(int thr_id, struct work *work,
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

 	uint64_t htmax[] = {
 		0,
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -81,7 +81,7 @@ void x15_4way_hash( void *state, const void *input )
     bmw512_4way_close( &ctx.bmw, vhash );

     // Serial
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     // 3 Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -93,7 +93,7 @@ void x15_4way_hash( void *state, const void *input )
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

     // Parallel 4way
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

     // 4 Skein
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -108,16 +108,16 @@ void x15_4way_hash( void *state, const void *input )
     keccak512_4way_close( &ctx.keccak, vhash );

     // Serial to the end
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     // 7 Luffa
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );

     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -145,13 +145,13 @@ void x15_4way_hash( void *state, const void *input )
     sph_shavite512_close( &ctx.shavite, hash3 );

     // 10 Simd
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );

     // 11 Echo
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -167,10 +167,10 @@ void x15_4way_hash( void *state, const void *input )
                       (const BitSequence *) hash3, 512 );

     // 12 Hamsi parallel 4way 32 bit
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     // 13 Fugue
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -186,10 +186,10 @@ void x15_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );

     // 14 Shabal, parallel 32 bit
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
       
     // 15 Whirlpool
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -214,7 +214,7 @@ void x15_4way_hash( void *state, const void *input )
 }

 int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done )
+                       uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -227,6 +227,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
     int num_found = 0;
     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
+     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
@@ -236,7 +237,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
     swab32_array( endiandata, pdata, 20 );

     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
--- a/algo/x15/x15-gate.h
+++ b/algo/x15/x15-gate.h
@@ -13,19 +13,15 @@ bool register_x15_algo( algo_gate_t* gate );
 #if defined(X15_4WAY)

 void x15_4way_hash( void *state, const void *input );
-
 int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
-
+                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x15_4way_ctx();

 #endif

 void x15hash( void *state, const void *input );
-
 int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
-
+                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x15_ctx();

 #endif
--- a/algo/x15/x15.c
+++ b/algo/x15/x15.c
@@ -186,8 +186,8 @@ void x15hash(void *output, const void *input)
 	memcpy(output, hashB, 32);
 }

-int scanhash_x15(int thr_id, struct work *work,
-                    uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(64)));
@@ -196,6 +196,7 @@ int scanhash_x15(int thr_id, struct work *work,
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

 	uint64_t htmax[] = {
 		0,
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -32,8 +32,8 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

-
-typedef struct {
+union _x16r_4way_context_overlay
+{
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_echo          echo;
@@ -50,16 +50,8 @@ typedef struct {
    shabal512_4way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
-} x16r_4way_ctx_holder;
-
-x16r_4way_ctx_holder x16r_4way_ctx __attribute__ ((aligned (64)));
-
-// Cube needs one full init so fast reinits can be done in the hash loop.
-void init_x16r_4way_ctx()
-{
-   cubehashInit( &x16r_4way_ctx.cube, 512, 16, 32 );
 };
-
+typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;

 void x16r_4way_hash( void* output, const void* input )
 {
@@ -68,14 +60,14 @@ void x16r_4way_hash( void* output, const void* input )
   uint32_t hash2[24] __attribute__ ((aligned (64)));
   uint32_t hash3[24] __attribute__ ((aligned (64)));
   uint32_t vhash[24*4] __attribute__ ((aligned (64)));
-   x16r_4way_ctx_holder ctx;
+   x16r_4way_context_overlay ctx;
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
   void *in2 = (void*) hash2;
   void *in3 = (void*) hash3;
   int size = 80;

-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 );
+   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
 
   if ( s_ntime == UINT32_MAX )
   {
@@ -104,11 +96,11 @@ void x16r_4way_hash( void* output, const void* input )
               blake512_4way( &ctx.blake, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               blake512_4way( &ctx.blake, vhash, size );
            }
            blake512_4way_close( &ctx.blake, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case BMW:
            bmw512_4way_init( &ctx.bmw );
@@ -116,11 +108,11 @@ void x16r_4way_hash( void* output, const void* input )
               bmw512_4way( &ctx.bmw, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               bmw512_4way( &ctx.bmw, vhash, size );
            }
            bmw512_4way_close( &ctx.bmw, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case GROESTL:
               init_groestl( &ctx.groestl, 64 );
@@ -142,11 +134,11 @@ void x16r_4way_hash( void* output, const void* input )
               skein512_4way( &ctx.skein, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               skein512_4way( &ctx.skein, vhash, size );
            }
            skein512_4way_close( &ctx.skein, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case JH:
            jh512_4way_init( &ctx.jh );
@@ -154,11 +146,11 @@ void x16r_4way_hash( void* output, const void* input )
               jh512_4way( &ctx.jh, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               jh512_4way( &ctx.jh, vhash, size );
            }
            jh512_4way_close( &ctx.jh, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case KECCAK:
            keccak512_4way_init( &ctx.keccak );
@@ -166,21 +158,21 @@ void x16r_4way_hash( void* output, const void* input )
               keccak512_4way( &ctx.keccak, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               keccak512_4way( &ctx.keccak, vhash, size );
            }
            keccak512_4way_close( &ctx.keccak, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case LUFFA:
-            mm256_interleave_2x128( vhash, in0, in1, size<<3 );
+            mm256_intrlv_2x128( vhash, in0, in1, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
-            mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
-            mm256_interleave_2x128( vhash, in2, in3, size<<3 );
+            mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
+            mm256_intrlv_2x128( vhash, in2, in3, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
-            mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+            mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
         break;
         case CUBEHASH:
            cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -194,7 +186,7 @@ void x16r_4way_hash( void* output, const void* input )
                                  (const byte*)in2, size );
            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
-                                        (const byte*)in3, size );
+                                  (const byte*)in3, size );
         break;
         case SHAVITE:
            sph_shavite512_init( &ctx.shavite );
@@ -211,14 +203,14 @@ void x16r_4way_hash( void* output, const void* input )
            sph_shavite512_close( &ctx.shavite, hash3 );
         break;
         case SIMD:
-            mm256_interleave_2x128( vhash, in0, in1, size<<3 );
+            mm256_intrlv_2x128( vhash, in0, in1, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
-            mm256_interleave_2x128( vhash, in2, in3, size<<3 );
+            mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
+            mm256_intrlv_2x128( vhash, in2, in3, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+            mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
         break;
         case ECHO:
             init_echo( &ctx.echo, 512 );
@@ -235,11 +227,11 @@ void x16r_4way_hash( void* output, const void* input )
                                (const BitSequence*)in3, size<<3 );
         break;
         case HAMSI:
-             mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
             hamsi512_4way_init( &ctx.hamsi );
             hamsi512_4way( &ctx.hamsi, vhash, size );
             hamsi512_4way_close( &ctx.hamsi, vhash );
-             mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
@@ -256,11 +248,11 @@ void x16r_4way_hash( void* output, const void* input )
             sph_fugue512_close( &ctx.fugue, hash3 );
         break;
         case SHABAL:
-             mm128_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 );
+             mm128_intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
             shabal512_4way_init( &ctx.shabal );
             shabal512_4way( &ctx.shabal, vhash, size );
             shabal512_4way_close( &ctx.shabal, vhash );
-             mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+             mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
@@ -277,11 +269,11 @@ void x16r_4way_hash( void* output, const void* input )
             sph_whirlpool_close( &ctx.whirlpool, hash3 );
         break;
         case SHA_512:
-             mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
             sha512_4way_init( &ctx.sha512 );
             sha512_4way( &ctx.sha512, vhash, size );
             sha512_4way_close( &ctx.sha512, vhash );
-             mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
      }
      size = 64;
@@ -304,15 +296,13 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 73;   // 9*8 + 1
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   volatile uint8_t *restart = &(work_restart[thr_id].restart);

-   for ( int k=0; k < 19; k++ )
-      be32enc( &endiandata[k], pdata[k] );
+   casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );

-//   if ( s_ntime != pdata[17] )
   if ( s_ntime != endiandata[17] )
   {
      uint32_t ntime = swab32(pdata[17]);
@@ -326,30 +316,27 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
      ptarget[7] = 0x0cff;

   uint64_t *edata = (uint64_t*)endiandata;
-   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   do
   {
-      be32enc( noncep,   n   );
-      be32enc( noncep+2, n+1 );
-      be32enc( noncep+4, n+2 );
-      be32enc( noncep+6, n+3 );
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+               _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

      x16r_4way_hash( hash, vdata );
      pdata[19] = n;

-      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      for ( int i = 0; i < 4; i++ )  if ( (hash+(i<<3))[7] <= Htarg )
+      if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
         pdata[19] = n+i;
-         nonces[ num_found++ ] = n+i;
-         work_set_target_ratio( work, hash+(i<<3) );
+         submit_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;
-   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
+   } while ( (  n < max_nonce ) && !(*restart) );

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -35,11 +35,9 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
 bool register_x16r_algo( algo_gate_t* gate )
 {
 #if defined (X16R_4WAY)
-//  init_x16r_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
-  init_x16r_ctx();
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
@@ -52,11 +50,9 @@ bool register_x16r_algo( algo_gate_t* gate )
 bool register_x16s_algo( algo_gate_t* gate )
 {
 #if defined (X16R_4WAY)
-//  init_x16r_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
-  init_x16r_ctx();
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -39,20 +39,14 @@ bool register_x16s_algo( algo_gate_t* gate );
 #if defined(X16R_4WAY)

 void x16r_4way_hash( void *state, const void *input );
-
 int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

-void init_x16r_4way_ctx();
-
 #endif

 void x16r_hash( void *state, const void *input );
-
 int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

-void init_x16r_ctx();
-
 #endif

--- a/algo/x16/x16r.c
+++ b/algo/x16/x16r.c
@@ -33,7 +33,8 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

-typedef struct {
+union _x16r_context_overlay
+{
 #if defined(__AES__)
        hashState_echo          echo;
        hashState_groestl       groestl;
@@ -55,19 +56,13 @@ typedef struct {
        sph_shabal512_context   shabal;
        sph_whirlpool_context   whirlpool;
        SHA512_CTX              sha512;
-} x16r_ctx_holder;
-
-x16r_ctx_holder x16r_ctx __attribute__ ((aligned (64)));
-
-void init_x16r_ctx()
-{
-   cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
 };
+typedef union _x16r_context_overlay x16r_context_overlay;

 void x16r_hash( void* output, const void* input )
 {
   uint32_t _ALIGN(128) hash[16];
-   x16r_ctx_holder ctx;
+   x16r_context_overlay ctx;
   void *in = (void*) input;
   int size = 80;

@@ -126,7 +121,7 @@ void x16r_hash( void* output, const void* input )
                                    (const BitSequence*)in, size );
         break;
         case CUBEHASH:
-            memcpy( &ctx.cube, &x16r_ctx.cube, sizeof(cubehashParam) );
+            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*) hash,
                                  (const byte*)in, size );
         break;
@@ -196,13 +191,12 @@ int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);

-   for ( int k=0; k < 19; k++ )
-      be32enc( &endiandata[k], pdata[k] );
+   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );

-// This code is suspicious. s_ntime is saved after byteswapping pdata[17]
-// but is tested vs unswapped pdata[17]. This should result in calling
-// getAlgoString every pass, but that doesn't seem to be the case.
-// It appears to be working correctly as is.
   if ( s_ntime != pdata[17] )
   {
      uint32_t ntime = swab32(pdata[17]);
--- a/algo/x17/x17-gate.c
+++ b/algo/x17/x17-gate.c
@@ -3,13 +3,9 @@
 bool register_x17_algo( algo_gate_t* gate )
 {
 #if defined (X17_4WAY)
-printf("register x17 4way\n");
-//  init_x17_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x17_4way;
  gate->hash      = (void*)&x17_4way_hash;
 #else
-printf("register x17 no 4way\n");
-  init_x17_ctx();
  gate->scanhash  = (void*)&scanhash_x17;
  gate->hash      = (void*)&x17_hash;
 #endif
--- a/algo/x17/x17-gate.h
+++ b/algo/x17/x17-gate.h
@@ -13,20 +13,14 @@ bool register_x17_algo( algo_gate_t* gate );
 #if defined(X17_4WAY)

 void x17_4way_hash( void *state, const void *input );
-
 int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );

-//void init_x17_4way_ctx();
-
 #endif

 void x17_hash( void *state, const void *input );
-
 int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );

-void init_x17_ctx();
-
 #endif

--- a/algo/x17/x17.c
+++ b/algo/x17/x17.c
@@ -34,7 +34,8 @@
  #include "algo/echo/sph_echo.h"
 #endif

-typedef struct {
+union _x17_context_overlay
+{
 #if defined(__AES__)
        hashState_groestl       groestl;
        hashState_echo          echo;
@@ -43,7 +44,7 @@ typedef struct {
        sph_echo512_context     echo;
 #endif
        hashState_luffa         luffa;
-        cubehashParam           cubehash;
+        cubehashParam           cube;
        sph_shavite512_context  shavite;
        hashState_sd            simd;
        sph_hamsi512_context    hamsi;
@@ -52,38 +53,14 @@ typedef struct {
        sph_whirlpool_context   whirlpool;
        SHA512_CTX              sha512;
        sph_haval256_5_context  haval;
-} x17_ctx_holder;
-
-x17_ctx_holder x17_ctx __attribute__ ((aligned (64)));
-
-void init_x17_ctx()
-{
-#if defined(__AES__)
-        init_groestl( &x17_ctx.groestl, 64 );
-        init_echo( &x17_ctx.echo, 512 );
-#else
-        sph_groestl512_init(&x17_ctx.groestl );
-        sph_echo512_init(&x17_ctx.echo);
-#endif
-        init_luffa( &x17_ctx.luffa, 512 );
-        cubehashInit( &x17_ctx.cubehash, 512, 16, 32 );
-        sph_shavite512_init( &x17_ctx.shavite );
-        init_sd( &x17_ctx.simd, 512 );
-        sph_hamsi512_init( &x17_ctx.hamsi );
-        sph_fugue512_init( &x17_ctx.fugue );
-        sph_shabal512_init( &x17_ctx.shabal );
-        sph_whirlpool_init( &x17_ctx.whirlpool );
-        SHA512_Init( &x17_ctx.sha512 );
-        sph_haval256_5_init(&x17_ctx.haval);
 };
+typedef union _x17_context_overlay x17_context_overlay;

 void x17_hash(void *output, const void *input)
 {
 	unsigned char hash[128] __attribute__ ((aligned (64)));
 	#define hashB hash+64
-
-        x17_ctx_holder ctx __attribute__ ((aligned (64)));
-        memcpy( &ctx, &x17_ctx, sizeof(x17_ctx) );
+   x17_context_overlay ctx;

        unsigned char hashbuf[128];
        size_t hashptr;
@@ -115,9 +92,11 @@ void x17_hash(void *output, const void *input)
        //---groestl----

 #if defined(__AES__)
+        init_groestl( &ctx.groestl, 64 );
        update_and_final_groestl( &ctx.groestl, (char*)hash,
                                  (const char*)hash, 512 );
 #else
+        sph_groestl512_init( &ctx.groestl );
        sph_groestl512( &ctx.groestl, hash, 64 );
        sph_groestl512_close( &ctx.groestl, hash );
 #endif
@@ -142,50 +121,62 @@ void x17_hash(void *output, const void *input)
        KEC_C;

        //--- luffa7
+        init_luffa( &ctx.luffa, 512 );
        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
                                (const BitSequence*)hash, 64 );

        // 8 Cube
-        cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
+        cubehashInit( &ctx.cube, 512, 16, 32 );
+        cubehashUpdateDigest( &ctx.cube, (byte*) hash,
                              (const byte*)hash, 64 );

        // 9 Shavite
+        sph_shavite512_init( &ctx.shavite );
        sph_shavite512( &ctx.shavite, hash, 64);
        sph_shavite512_close( &ctx.shavite, hash);

        // 10 Simd
+        init_sd( &ctx.simd, 512 );
        update_final_sd( &ctx.simd, (BitSequence*)hash,
                         (const BitSequence*)hash, 512 );

        //11---echo---
 #if defined(__AES__)
+        init_echo( &ctx.echo, 512 );
        update_final_echo ( &ctx.echo, (BitSequence*)hash,
                            (const BitSequence*)hash, 512 );
 #else
-	sph_echo512( &ctx.echo, hash, 64 );
+        sph_echo512_init( &ctx.echo );
+        sph_echo512( &ctx.echo, hash, 64 );
        sph_echo512_close( &ctx.echo, hash );
 #endif

        // X13 algos
        // 12 Hamsi
+        sph_hamsi512_init( &ctx.hamsi );
        sph_hamsi512( &ctx.hamsi, hash, 64 );
        sph_hamsi512_close( &ctx.hamsi, hash );

        // 13 Fugue
+        sph_fugue512_init( &ctx.fugue );
        sph_fugue512(&ctx.fugue, hash, 64 );
        sph_fugue512_close(&ctx.fugue, hash );

        // X14 Shabal
+        sph_shabal512_init( &ctx.shabal );
        sph_shabal512(&ctx.shabal, hash, 64);
        sph_shabal512_close( &ctx.shabal, hash );
       
        // X15 Whirlpool
-	sph_whirlpool( &ctx.whirlpool, hash, 64 );
-	sph_whirlpool_close( &ctx.whirlpool, hash );
+        sph_whirlpool_init( &ctx.whirlpool );
+        sph_whirlpool( &ctx.whirlpool, hash, 64 );
+        sph_whirlpool_close( &ctx.whirlpool, hash );

-	SHA512_Update( &ctx.sha512, hash, 64 );
+        SHA512_Init( &ctx.sha512 );
+        SHA512_Update( &ctx.sha512, hash, 64 );
        SHA512_Final( (unsigned char*)hash, &ctx.sha512 );

+        sph_haval256_5_init(&ctx.haval);
        sph_haval256_5( &ctx.haval, (const void*)hash, 64 );
        sph_haval256_5_close( &ctx.haval, output );
 }
@@ -234,42 +225,42 @@ int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
 #endif
   for ( int m = 0; m < 6; m++ )
   {
-	if ( Htarg <= htmax[m] )
-	{
-	   uint32_t mask = masks[m];
-	   do
+      if ( Htarg <= htmax[m] )
 	   {
-		pdata[19] = ++n;
-		be32enc( &endiandata[19], n );
-		x17_hash( hash64, endiandata );
+	      uint32_t mask = masks[m];
+	      do
+	      {
+	         pdata[19] = ++n;
+		      be32enc( &endiandata[19], n );
+		      x17_hash( hash64, endiandata );
 #ifndef DEBUG_ALGO
-		if ( !( hash64[7] & mask ) )
-                {
-                   if ( fulltest( hash64, ptarget ) )
-		   {
-		      *hashes_done = n - first_nonce + 1;
-		      return true;
-                   }
-//                 else
-//                      applog(LOG_INFO, "Result does not validate on CPU!");
-                }
+		      if ( !( hash64[7] & mask ) )
+            {
+               if ( fulltest( hash64, ptarget ) )
+		         {
+		            *hashes_done = n - first_nonce + 1;
+		            return true;
+               }
+//             else
+//               applog(LOG_INFO, "Result does not validate on CPU!");
+            }
 #else
-		if ( !( n % 0x1000 ) && !thr_id ) printf(".");
-		if ( !( hash64[7] & mask ) )
+		      if ( !( n % 0x1000 ) && !thr_id ) printf(".");
+		      if ( !( hash64[7] & mask ) )
 	       	{
-		   printf("[%d]",thr_id);
-		   if ( fulltest( hash64, ptarget ) )
-		   {
-                       work_set_target_ratio( work, hash64 );
-		       *hashes_done = n - first_nonce + 1;
-		       return true;
-		   }
-		}
+		         printf("[%d]",thr_id);
+		         if ( fulltest( hash64, ptarget ) )
+		         {
+                 work_set_target_ratio( work, hash64 );
+		           *hashes_done = n - first_nonce + 1;
+		           return true;
+		         }
+		      }
 #endif
-	    } while (n < max_nonce && !work_restart[thr_id].restart);
+	      } while (n < max_nonce && !work_restart[thr_id].restart);
 			// see blake.c if else to understand the loop on htmax => mask
-	    break;
-	}
+	      break;
+	   }
   }
   *hashes_done = n - first_nonce + 1;
   pdata[19] = n;
--- a/algo/yespower/yespower-platform.c
+++ b/algo/yespower/yespower-platform.c
@@ -41,7 +41,7 @@ static void *alloc_region(yespower_region_t *region, size_t size)
 #endif
 	    MAP_ANON | MAP_PRIVATE;
 #if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE)
-	size_t new_size = size;
+   size_t new_size = size;
 	const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1;
 	if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) {
 		flags |= MAP_HUGETLB;
@@ -55,6 +55,7 @@ static void *alloc_region(yespower_region_t *region, size_t size)
 	base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0);
 	if (base != MAP_FAILED) {
 		base_size = new_size;
+
 	} else if (flags & MAP_HUGETLB) {
 		flags &= ~MAP_HUGETLB;
 		base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);