v3.9.4

v3.9.3.1
v3.9.2.5
2025-09-17 23:44:27 +00:00 · 2019-06-18 13:15:45 -04:00 · 2019-06-13 21:15:58 -04:00 · 2019-06-13 11:20:27 -04:00
126 changed files with 6175 additions and 5098 deletions
--- a/3
+++ b/3
@@ -42,9 +42,6 @@ openssl 1.1.0e or higher. Add one of the following, depending on the
 compiler version, to CFLAGS:
 "-march=native" or "-march=znver1" or "-msha".
 Due to poor AVX2 performance on Ryzen users should add -DRYZEN_ to CFLAGS
 to override multiway AVX2 on algos with sha256, and use SHA instead.
 Additional instructions for static compilalation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
 Static builds should only considered in a homogeneous HW and SW environment.
--- a/Makefile.am
+++ b/Makefile.am
@@ -131,6 +131,7 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2h-4way.c \
  algo/lyra2/allium-4way.c \
  algo/lyra2/allium.c \
  algo/lyra2/phi2-4way.c \
  algo/lyra2/phi2.c \
  algo/m7m.c \
  algo/neoscrypt/neoscrypt.c \
@@ -147,6 +148,9 @@ cpuminer_SOURCES = \
  algo/quark/anime-gate.c \
  algo/quark/anime.c \
  algo/quark/anime-4way.c \
  algo/quark/hmq1725-gate.c \
  algo/quark/hmq1725-4way.c \
  algo/quark/hmq1725.c \
  algo/qubit/qubit-gate.c \
  algo/qubit/qubit.c \
  algo/qubit/qubit-2way.c \
@@ -257,7 +261,6 @@ cpuminer_SOURCES = \
  algo/x17/xevan-gate.c \
  algo/x17/xevan.c \
  algo/x17/xevan-4way.c \
  algo/x17/hmq1725.c \
  algo/x17/sonoa-gate.c \
  algo/x17/sonoa-4way.c \
  algo/x17/sonoa.c \
--- a/README.txt
+++ b/README.txt
@@ -29,7 +29,7 @@ cpuminer-sse2.exe      "-msse2"                  Core2, Nehalem
 cpuminer-aes-sse42.exe "-march=westmere"         Westmere
 cpuminer-avx.exe       "-march=corei7-avx"       Sandy-Ivybridge
 cpuminer-avx2.exe      "-march=core-avx2"        Haswell, Sky-Kaby-Coffeelake
-cpuminer-zen           "-march=znver1 -DRYZEN_"  Ryzen
+cpuminer-zen           "-march=znver1"           AMD Ryzen, Threadripper
 If you like this software feel free to donate:
--- a/19
+++ b/19
@@ -38,6 +38,25 @@ supported.
 Change Log
 ----------
 v3.9.4
 Faster AVX2 for lyra2v3, quark, anime.
 Fixed skein AVX2 regression (invalid shares since v3.9.0) and faster.
 Faster skein2 with 4way AVX2 enabled.
 Automatic SHA override on Ryzen CPUs, no need for -DRYZEN compile flag.
 Ongoing restructuring.
 v3.9.3.1
 Skipped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
 Fixed x16r algo 25% invalid share reject rate. The bug may have also
 affected other algos.
 v3.9.2.5
 Fixed 2 regressions: hodl AES detection, x16r invalid shares with AVX2.
 More restructuring.
 v3.9.2.4
 Yet another affinity fix. Hopefully the last one.
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -2,8 +2,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include "miner.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #include "interleave.h"
 /////////////////////////////
 ////
--- a/algo/argon2/argon2d/argon2d/core.c
+++ b/algo/argon2/argon2d/argon2d/core.c
@@ -112,7 +112,7 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
 void free_memory(const argon2_context *context, uint8_t *memory,
                 size_t num, size_t size) {
    size_t memory_size = num*size;
-    clear_internal_memory(memory, memory_size);
+//    clear_internal_memory(memory, memory_size);
    if (context->free_cbk) {
        (context->free_cbk)(memory, memory_size);
    } else {
@@ -137,7 +137,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
 int FLAG_clear_internal_memory = 0;
 void clear_internal_memory(void *v, size_t n) {
  if (FLAG_clear_internal_memory && v) {
-    secure_wipe_memory(v, n);
+//    secure_wipe_memory(v, n);
  }
 }
@@ -559,7 +559,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                       context->pwdlen);
        if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
-            secure_wipe_memory(context->pwd, context->pwdlen);
+//            secure_wipe_memory(context->pwd, context->pwdlen);
            context->pwdlen = 0;
        }
    }
@@ -580,7 +580,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                       context->secretlen);
        if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
-            secure_wipe_memory(context->secret, context->secretlen);
+//            secure_wipe_memory(context->secret, context->secretlen);
            context->secretlen = 0;
        }
    }
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -45,7 +45,7 @@ extern "C"{
 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #define SPH_SIZE_blake256   256
--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -16,7 +16,7 @@
 #if defined(__SSE4_2__)
-#include "avxdefs.h"
+#include "simd-utils.h"
 #include <stddef.h>
 #include <stdint.h>
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -43,7 +43,7 @@ extern "C"{
 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #define SPH_SIZE_bmw256   256
--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -537,6 +537,8 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
      }
   }
   sc->ptr = ptr;
   if ( h1 != sc->H )
        memcpy_128( sc->H, h1, 16 );
 }
@@ -571,6 +573,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
   for ( u = 0; u < 16; u ++ )
      buf[u] = h2[u];
   compress_small( buf, (__m128i*)final_s, h1 );
   for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
@@ -1041,22 +1044,22 @@ static const __m256i final_s8[16] =
 void bmw256_8way_init( bmw256_8way_context *ctx )
 {
-   ctx->H[ 0] = _mm256_set1_epi64x( IV256[ 0] );
+   ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] );
-   ctx->H[ 1] = _mm256_set1_epi64x( IV256[ 1] );
+   ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] );
-   ctx->H[ 2] = _mm256_set1_epi64x( IV256[ 2] );
+   ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] );
-   ctx->H[ 3] = _mm256_set1_epi64x( IV256[ 3] );
+   ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] );
-   ctx->H[ 4] = _mm256_set1_epi64x( IV256[ 4] );
+   ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] );
-   ctx->H[ 5] = _mm256_set1_epi64x( IV256[ 5] );
+   ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] );
-   ctx->H[ 6] = _mm256_set1_epi64x( IV256[ 6] );
+   ctx->H[ 6] = _mm256_set1_epi32( IV256[ 6] );
-   ctx->H[ 7] = _mm256_set1_epi64x( IV256[ 7] );
+   ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] );
-   ctx->H[ 8] = _mm256_set1_epi64x( IV256[ 8] );
+   ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] );
-   ctx->H[ 9] = _mm256_set1_epi64x( IV256[ 9] );
+   ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] );
-   ctx->H[10] = _mm256_set1_epi64x( IV256[10] );
+   ctx->H[10] = _mm256_set1_epi32( IV256[10] );
-   ctx->H[11] = _mm256_set1_epi64x( IV256[11] );
+   ctx->H[11] = _mm256_set1_epi32( IV256[11] );
-   ctx->H[12] = _mm256_set1_epi64x( IV256[12] );
+   ctx->H[12] = _mm256_set1_epi32( IV256[12] );
-   ctx->H[13] = _mm256_set1_epi64x( IV256[13] );
+   ctx->H[13] = _mm256_set1_epi32( IV256[13] );
-   ctx->H[14] = _mm256_set1_epi64x( IV256[14] );
+   ctx->H[14] = _mm256_set1_epi32( IV256[14] );
-   ctx->H[15] = _mm256_set1_epi64x( IV256[15] );
+   ctx->H[15] = _mm256_set1_epi32( IV256[15] );
   ctx->ptr       = 0;
   ctx->bit_count = 0;
@@ -1076,14 +1079,15 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
   ptr = ctx->ptr;
   h1 = ctx->H;
   h2 = htmp;
   while ( len > 0 )
   {
      size_t clen;
      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
-      memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
+      memcpy_256( buf + (ptr>>2), vdata, clen >> 2 );
-      vdata = vdata + (clen>>3);
+      vdata = vdata + (clen>>2);
      len -= clen;
      ptr += clen;
      if ( ptr == buf_size )
@@ -1097,6 +1101,7 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
      }
   }
   ctx->ptr = ptr;
   if ( h1 != ctx->H )
        memcpy_256( ctx->H, h1, 16 );
 }
@@ -1106,24 +1111,26 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
   __m256i *buf;
   __m256i h1[16], h2[16], *h;
   size_t ptr, u, v;
 //   unsigned z;
   const int buf_size = 64;  // bytes of one lane, compatible with len
   buf = ctx->buf;
   ptr = ctx->ptr;
-   buf[ ptr>>3 ] = _mm256_set1_epi32( 0x80 );
+   buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
-   ptr += 8;
+   ptr += 4;
   h = ctx->H;
-   if (  ptr > (buf_size - 8) )
+   if (  ptr > (buf_size - 4) )
   {
-      memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+      memset_zero_256( buf + (ptr>>2), (buf_size - ptr) >> 2 );
      compress_small_8way( buf, h, h1 );
      ptr = 0;
      h = h1;
   }
-   memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
+   memset_zero_256( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
-   buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( ctx->bit_count );
+   buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count );
   buf[ (buf_size - 4) >> 2 ] = m256_zero;
   compress_small_8way( buf, h, h2 );
   for ( u = 0; u < 16; u ++ )
--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -4,7 +4,7 @@
 #if defined(__AVX2__)
 #include <stdint.h>
-#include "avxdefs.h"
+#include "simd-utils.h"
 // 2x128, 2 way parallel SSE2
--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -13,7 +13,7 @@
 #include <stdbool.h>
 #include <unistd.h>
 #include <memory.h>
-#include "avxdefs.h"
+#include "simd-utils.h"
 #include <stdio.h>
 // The result of hashing 10 rounds of initial data which is params and 
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -12,7 +12,7 @@
 #include <memory.h>
 #include "hash-groestl.h"
 #include "miner.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #ifndef NO_AES_NI
--- a/algo/groestl/aes_ni/hash-groestl256.c
+++ b/algo/groestl/aes_ni/hash-groestl256.c
@@ -9,7 +9,7 @@
 #include <memory.h>
 #include "hash-groestl256.h"
 #include "miner.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #ifndef NO_AES_NI
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -40,7 +40,7 @@
 #if defined (__AVX2__)
-#include "avxdefs.h"
+#include "simd-utils.h"
 #ifdef __cplusplus
 extern "C"{
--- a/algo/haval/haval-hash-4way.h
+++ b/algo/haval/haval-hash-4way.h
@@ -69,7 +69,7 @@ extern "C"{
 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #define SPH_SIZE_haval256_5   256
--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -156,7 +156,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
 bool register_hodl_algo( algo_gate_t* gate )
 {
-#if defined(__AES__)
+#if !defined(__AES__)
  applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
  return false;
 #endif
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -44,7 +44,7 @@ extern "C"{
 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #define SPH_SIZE_jh256   256
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -44,7 +44,7 @@ extern "C"{
 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #define SPH_SIZE_keccak256   256
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -24,7 +24,7 @@
 #if defined(__AVX2__)
-#include "avxdefs.h"
+#include "simd-utils.h"
 #define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
                               0UL, 0UL, 0UL, 0xffffffffUL )
--- a/algo/luffa/luffa-hash-2way.h
+++ b/algo/luffa/luffa-hash-2way.h
@@ -24,7 +24,7 @@
 #include <immintrin.h>
 #include "algo/sha/sha3-defs.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 /* The length of digests*/
 #define DIGEST_BIT_LEN_224 224
--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -20,7 +20,7 @@
 #include <string.h>
 #include <emmintrin.h>
-#include "avxdefs.h"
+#include "simd-utils.h"
 #include "luffa_for_sse2.h"
 #define MULT2(a0,a1) do \
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -44,10 +44,11 @@ void allium_4way_hash( void *state, const void *input )
   blake256_4way( &ctx.blake, input + (64<<2), 16 );
   blake256_4way_close( &ctx.blake, vhash32 );
-   mm256_reinterleave_4x64( vhash64, vhash32, 256 );
+   mm256_rintrlv_4x32_4x64( vhash64, vhash32, 256 );
   keccak256_4way( &ctx.keccak, vhash64, 32 );
   keccak256_4way_close( &ctx.keccak, vhash64 );
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
@@ -67,26 +68,23 @@ void allium_4way_hash( void *state, const void *input )
   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
-   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+   mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
   skein256_4way( &ctx.skein, vhash64, 32 );
   skein256_4way_close( &ctx.skein, vhash64 );
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
-   update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
+   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
-   memcpy( state,    hash0, 32 );
+   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
-   memcpy( state+32, hash1, 32 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-   memcpy( state+64, hash2, 32 );
+           sizeof(hashState_groestl256) );
-   memcpy( state+96, hash3, 32 );
+   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
 }
 int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -94,7 +92,6 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -106,13 +103,7 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   mm128_bswap_intrlv80_4x32( vdata, pdata );
   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   blake256_4way_init( &allium_4way_ctx.blake );
   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
@@ -124,7 +115,7 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
     for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
     {
-        if ( fulltest( hash+(lane<<3), ptarget ) )
+        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
        {
           pdata[19] = n + lane;
           submit_solution( work, hash+(lane<<3), mythr, lane );
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -47,7 +47,9 @@ bool lyra2rev3_thread_init()
   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
   l2v3_wholeMatrix = _mm_malloc( size, 64 );
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
   init_lyra2rev3_8way_ctx();;
 #elif defined (LYRA2REV3_4WAY)
   init_lyra2rev3_4way_ctx();;
 #else
   init_lyra2rev3_ctx();
@@ -57,7 +59,10 @@ bool lyra2rev3_thread_init()
 bool register_lyra2rev3_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_8way;
  gate->hash      = (void*)&lyra2rev3_8way_hash;
 #elif defined (LYRA2REV3_4WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_4way;
  gate->hash      = (void*)&lyra2rev3_4way_hash;
 #else
@@ -203,13 +208,18 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 bool register_phi2_algo( algo_gate_t* gate )
 {
-   init_phi2_ctx();
+//   init_phi2_ctx();
   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
   gate->get_work_data_size = (void*)&phi2_get_work_data_size;
   gate->decode_extra_data  = (void*)&phi2_decode_extra_data;
   gate->build_extraheader  = (void*)&phi2_build_extraheader;
   gate->set_target         = (void*)&alt_set_target; 
   gate->get_max64          = (void*)&get_max64_0xffffLL;
 #if defined(PHI2_4WAY)
   gate->scanhash           = (void*)&scanhash_phi2_4way;
 #else
   init_phi2_ctx();
   gate->scanhash           = (void*)&scanhash_phi2;
 #endif
   return true;
 }
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -6,14 +6,24 @@
 #include "lyra2.h"
 #if defined(__AVX2__)
  #define LYRA2REV3_8WAY
 #endif
 #if defined(__SSE2__)
  #define LYRA2REV3_4WAY
 #endif
 extern __thread uint64_t* l2v3_wholeMatrix;
 bool register_lyra2rev3_algo( algo_gate_t* gate );
 #if defined(LYRA2REV3_8WAY)
-#if defined(LYRA2REV3_4WAY)
+void lyra2rev3_8way_hash( void *state, const void *input );
 int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev3_8way_ctx();
 #elif defined(LYRA2REV3_4WAY)
 void lyra2rev3_4way_hash( void *state, const void *input );
 int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -140,15 +150,29 @@ bool init_allium_ctx();
 /////////////////////////////////////////
 #if defined(__AVX2__) && defined(__AES__)
 //  #define PHI2_4WAY
 #endif
 bool phi2_has_roots;
 bool register_phi2_algo( algo_gate_t* gate );
 #if defined(PHI2_4WAY)
 void phi2_hash_4way( void *state, const void *input );
 int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 //void init_phi2_ctx();
 #else
 void phi2_hash( void *state, const void *input );
 int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 void init_phi2_ctx();
 #endif
 #endif  // LYRA2_GATE_H__
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -566,7 +566,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
 #if defined(__AVX2__)
   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
-#elif defined(__SSE4_2__)
+#elif defined(__SSE2__)
   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );   
 #else
   memset( wholeMatrix, 0, i );
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -36,17 +36,16 @@ void lyra2h_4way_hash( void *state, const void *input )
     blake256_4way( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
-     LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
-     LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
+             16, 16, 16 );
-     LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, state+32, 32, hash1, 32, hash1,
-     LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
+             32, 16, 16, 16 );
-
+     LYRA2Z( lyra2h_4way_matrix, state+64, 32, hash2, 32, hash2,
-     memcpy( state,    hash0, 32 );
+             32, 16, 16, 16 );
-     memcpy( state+32, hash1, 32 );
+     LYRA2Z( lyra2h_4way_matrix, state+96, 32, hash3, 32, hash3,
-     memcpy( state+64, hash2, 32 );
+             32, 16, 16, 16 );
     memcpy( state+96, hash3, 32 );
 }
 int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -54,49 +53,36 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
   int num_found = 0;
   uint32_t *noncep= vdata + 76; // 19*4
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;
-   for ( int i=0; i < 20; i++ )
+   mm128_bswap_intrlv80_4x32( vdata, pdata );
      be32enc( &edata[i], pdata[i] );
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   lyra2h_4way_midstate( vdata );
   do {
-      be32enc( noncep,   n   );
+     *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      be32enc( noncep+1, n+1 );
      be32enc( noncep+2, n+2 );
      be32enc( noncep+3, n+3 );
      be32enc( &edata[19], n );
      lyra2h_4way_hash( hash, vdata );
      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
           && !opt_benchmark )
      {
          pdata[19] = n+i;         
-          nonces[ num_found++ ] = n+i;
+          submit_solution( work, hash+(i<<3), mythr, i );
          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
-   } while ( (num_found == 0) && (n < max_nonce-4)
+   } while (  (n < max_nonce-4) && !work_restart[thr_id].restart);
                   && !work_restart[thr_id].restart);
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }
 #endif
--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -6,7 +6,7 @@
 #include "algo/keccak/sph_keccak.h"
 #include "lyra2.h"
 #include "algo-gate-api.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #if defined(__AES__)
  #include "algo/groestl/aes_ni/hash-groestl256.h"
 #endif
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -42,10 +42,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   blake256_4way( &ctx.blake, input + (64<<2), 16 );
   blake256_4way_close( &ctx.blake, vhash );
-   mm256_reinterleave_4x64( vhash64, vhash, 256 );
+   mm256_rintrlv_4x32_4x64( vhash64, vhash, 256 );
   keccak256_4way( &ctx.keccak, vhash64, 32 );
   keccak256_4way_close( &ctx.keccak, vhash64 );
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
@@ -60,10 +62,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
   LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
-   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+   mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
   skein256_4way( &ctx.skein, vhash64, 32 );
   skein256_4way_close( &ctx.skein, vhash64 );
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
@@ -74,11 +78,10 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
-   mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
+   mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, vhash );
-   mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, state );
 }
 int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -86,49 +89,44 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) edata[20];
+   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t lane_hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
-   uint32_t *nonces = work->nonces;
+   __m128i *noncev = (__m128i*)vdata + 19;   // aligned
   int num_found = 0;
   uint32_t *noncep = vdata + 76; // 19*4
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
-   swab32_array( edata, pdata, 20 );
+   mm128_bswap_intrlv80_4x32( vdata, pdata );
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   blake256_4way_init( &l2v2_4way_ctx.blake );
   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
-   do {
+   do
-      be32enc( noncep,   n   );
+   {
-      be32enc( noncep+1, n+1 );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      be32enc( noncep+2, n+2 );
      be32enc( noncep+3, n+3 );
      lyra2rev2_4way_hash( hash, vdata );
      pdata[19] = n;
-      for ( int i = 0; i < 4; i++ )
+      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-          pdata[19] = n+i;         
+         mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
-          nonces[ num_found++ ] = n+i;
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-          work_set_target_ratio( work, hash+(i<<3) );
+         {
            pdata[19] = n + lane;         
            submit_solution( work, lane_hash, mythr, lane );
         }
      }
      n += 4;
-   } while ( (num_found == 0) && (n < max_nonce-4)
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
                   && !work_restart[thr_id].restart);
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }
 #endif
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -1,12 +1,138 @@
 #include "lyra2-gate.h"
 #include <memory.h>
 #if defined (LYRA2REV3_4WAY)	
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 
 #if defined (LYRA2REV3_8WAY)
 typedef struct {
   blake256_8way_context     blake;
   cubehashParam             cube;
   bmw256_8way_context       bmw;
 } lyra2v3_8way_ctx_holder;
 static lyra2v3_8way_ctx_holder l2v3_8way_ctx;
 bool init_lyra2rev3_8way_ctx()
 {
   blake256_8way_init( &l2v3_8way_ctx.blake );
   cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
   bmw256_8way_init( &l2v3_8way_ctx.bmw );
   return true;
 }
 void lyra2rev3_8way_hash( void *state, const void *input )
 {
   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (32)));
   uint32_t hash2[8] __attribute__ ((aligned (32)));
   uint32_t hash3[8] __attribute__ ((aligned (32)));
   uint32_t hash4[8] __attribute__ ((aligned (32)));
   uint32_t hash5[8] __attribute__ ((aligned (32)));
   uint32_t hash6[8] __attribute__ ((aligned (32)));
   uint32_t hash7[8] __attribute__ ((aligned (32)));
   lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
   blake256_8way( &ctx.blake, input, 80 );
   blake256_8way_close( &ctx.blake, vhash );
   mm256_dintrlv_8x32( hash0, hash1, hash2, hash3,
                       hash4, hash5, hash6, hash7, vhash, 256 );
   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash4, (const byte*) hash4, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash5, (const byte*) hash5, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash6, (const byte*) hash6, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash7, (const byte*) hash7, 32 );
   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
   mm256_intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
                             hash4, hash5, hash6, hash7, 256 );
   bmw256_8way( &ctx.bmw, vhash, 32 );
   bmw256_8way_close( &ctx.bmw, state );
   }
 int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t lane_hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
   mm256_bswap_intrlv80_8x32( vdata, pdata );
   do
   {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
                                                  n+3, n+2, n+1, n ) );
      lyra2rev3_8way_hash( hash, vdata );
      pdata[19] = n;
      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
      {
         mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
              pdata[19] = n + lane;
              submit_solution( work, lane_hash, mythr, lane );
         }
      }
      n += 8;
   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
 #endif
 #if defined (LYRA2REV3_4WAY)  
 typedef struct {
   blake256_4way_context     blake;
   cubehashParam             cube;
@@ -35,7 +161,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
   blake256_4way( &ctx.blake, input, 80 );
   blake256_4way_close( &ctx.blake, vhash );
-   mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+   mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
@@ -55,10 +181,9 @@ void lyra2rev3_4way_hash( void *state, const void *input )
   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
-   mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
+   mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, state );
 }
 int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -66,7 +191,6 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t lane_hash[8];
   uint32_t *pdata = work->data;
@@ -80,15 +204,7 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
-   // Need big endian data
+   mm128_bswap_intrlv80_4x32( vdata, pdata );
   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   do
   {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -99,16 +215,14 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
      {
         mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
-
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         if ( fulltest( lane_hash, ptarget ) )
         {
              pdata[19] = n + lane;    
              submit_solution( work, lane_hash, mythr, lane );
-	 }
+	      }
      }
      n += 4;
   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -36,17 +36,12 @@ void lyra2z_4way_hash( void *state, const void *input )
     blake256_4way( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
-     LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, state   , 32, hash0, 32, hash0, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, state+64, 32, hash2, 32, hash2, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, state+96, 32, hash3, 32, hash3, 32, 8, 8, 8 );
     memcpy( state,    hash0, 32 );
     memcpy( state+32, hash1, 32 );
     memcpy( state+64, hash2, 32 );
     memcpy( state+96, hash3, 32 );
 }
 int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -54,7 +49,6 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
@@ -66,13 +60,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;
-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   mm128_bswap_intrlv80_4x32( vdata, pdata );
   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   lyra2z_4way_midstate( vdata );
   do {
@@ -82,16 +70,11 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
      pdata[19] = n;
      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
           && !opt_benchmark )
      {
          pdata[19] = n+i;         
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_solution( work, hash+(i<<3), mythr, i );
          if ( submit_work( mythr, work ) )
              applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
                             accepted_share_count + rejected_share_count + 1,
                             thr_id, i );
          else
              applog( LOG_WARNING, "Failed to submit share." );
      }
      n += 4;
   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
@@ -136,8 +119,8 @@ void lyra2z_8way_hash( void *state, const void *input )
     blake256_8way( &ctx_blake, input + (64*8), 16 );
     blake256_8way_close( &ctx_blake, vhash );
-     mm256_deinterleave_8x32( hash0, hash1, hash2, hash3,
+     mm256_dintrlv_8x32( hash0, hash1, hash2, hash3,
-                              hash4, hash5, hash6, hash7, vhash, 256 );
+                         hash4, hash5, hash6, hash7, vhash, 256 );
     LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
     LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
@@ -163,7 +146,6 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
@@ -175,13 +157,7 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;
-   casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   mm256_bswap_intrlv80_8x32( vdata, pdata );
   casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
                                 edata, edata, edata, edata, 640 );
   lyra2z_8way_midstate( vdata );
   do {
@@ -191,7 +167,8 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
      pdata[19] = n;
      for ( int i = 0; i < 8; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
           && !opt_benchmark )
      {
          pdata[19] = n+i;         
          submit_solution( work, hash+(i<<3), mythr, i );
--- a/algo/lyra2/lyra2z.c
+++ b/algo/lyra2/lyra2z.c
@@ -3,7 +3,7 @@
 #include "lyra2-gate.h"
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 __thread uint64_t* lyra2z_matrix;
--- a/algo/lyra2/lyra2z330.c
+++ b/algo/lyra2/lyra2z330.c
@@ -1,7 +1,7 @@
 #include <memory.h>
 #include "algo-gate-api.h"
 #include "lyra2.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 __thread uint64_t* lyra2z330_wholeMatrix;
@@ -30,14 +30,17 @@ int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce,
   if (opt_benchmark)
 	ptarget[7] = 0x0000ff;
-   for (int i=0; i < 19; i++)
+   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-      be32enc(&endiandata[i], pdata[i]);
+   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-        
+   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   do
   {
      be32enc(&endiandata[19], nonce);
      lyra2z330_hash( hash, endiandata, work->height );
-      if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
+      if ( hash[7] <= Htarg && fulltest(hash, ptarget) && !opt_benchmark )
      {
         work_set_target_ratio(work, hash);
         pdata[19] = nonce;
--- a/algo/lyra2/phi2-4way.c
+++ b/algo/lyra2/phi2-4way.c
@@ -0,0 +1,233 @@
 /**
 * Phi-2 algo Implementation
 */
 #include "lyra2-gate.h"
 #if defined(PHI2_4WAY)
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/echo/aes_ni/hash_api.h"
 typedef struct {
     cubehashParam           cube;
     jh512_4way_context      jh;
     hashState_echo          echo;
 //     hashState_echo          echo2;
     sph_gost512_context     gost;
     skein512_4way_context   skein;
 } phi2_ctx_holder;
 /*
 phi2_ctx_holder phi2_ctx;
 void init_phi2_ctx()
 {
   cubehashInit( &phi2_ctx.cube, 512, 16, 32 );
   sph_jh512_init(&phi2_ctx.jh);
   init_echo( &phi2_ctx.echo1, 512 );
   init_echo( &phi2_ctx.echo2, 512 );
   sph_gost512_init(&phi2_ctx.gost);
   sph_skein512_init(&phi2_ctx.skein);
 };
 */
 void phi2_hash_4way( void *state, const void *input )
 {
   uint32_t hash[4][16] __attribute__ ((aligned (64)));
   uint32_t hashA[4][16] __attribute__ ((aligned (64)));
   uint32_t hashB[4][16] __attribute__ ((aligned (64)));
   uint32_t vhash[4*16] __attribute__ ((aligned (64)));
 //   unsigned char _ALIGN(128) hash[64];
 //	unsigned char _ALIGN(128) hashA[64];
 //	unsigned char _ALIGN(128) hashB[64];
   phi2_ctx_holder ctx __attribute__ ((aligned (64)));
 //  memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[0], (const byte*)input,
                        phi2_has_roots ? 144 : 80 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[1], (const byte*)input+144,
                        phi2_has_roots ? 144 : 80 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[2], (const byte*)input+288,
                        phi2_has_roots ? 144 : 80 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[3], (const byte*)input+432,
                        phi2_has_roots ? 144 : 80 );
 	LYRA2RE( &hashA[0][0], 32, &hashB[0][0], 32, &hashB[0][0], 32, 1, 8, 8 );
 	LYRA2RE( &hashA[0][8], 32, &hashB[0][8], 32, &hashB[0][8], 32, 1, 8, 8 );
   LYRA2RE( &hashA[1][0], 32, &hashB[1][0], 32, &hashB[1][0], 32, 1, 8, 8 );
   LYRA2RE( &hashA[1][8], 32, &hashB[1][8], 32, &hashB[1][8], 32, 1, 8, 8 );
   LYRA2RE( &hashA[2][0], 32, &hashB[2][0], 32, &hashB[2][0], 32, 1, 8, 8 );
   LYRA2RE( &hashA[2][8], 32, &hashB[2][8], 32, &hashB[2][8], 32, 1, 8, 8 );
   LYRA2RE( &hashA[3][0], 32, &hashB[3][0], 32, &hashB[3][0], 32, 1, 8, 8 );
   LYRA2RE( &hashA[3][8], 32, &hashB[3][8], 32, &hashB[3][8], 32, 1, 8, 8 );
   mm256_intrlv_4x64( vhash, hashA[0], hashA[1], hashA[2], hashA[3], 512 );
   jh512_4way_init( &ctx.jh );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );
   mm256_dintrlv_4x64( hash[0], hash[1], hash[2], hash[3], vhash, 512 );
   if ( hash[0][0] & 1 )
  	{
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[0], 64 );
 	   sph_gost512_close( &ctx.gost, (void*)hash[0] );
 	}
  	else
  	{
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
                          (const BitSequence *)hash[0], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
                          (const BitSequence *)hash[0], 512 );
 	}
   if ( hash[1][0] & 1 )
   {
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[1], 64 );
      sph_gost512_close( &ctx.gost, (void*)hash[1] );
   }
   else
   {
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
                          (const BitSequence *)hash[1], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
                          (const BitSequence *)hash[1], 512 );
   }
   if ( hash[2][0] & 1 )
   {
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[2], 64 );
      sph_gost512_close( &ctx.gost, (void*)hash[2] );
   }
   else
   {
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
                          (const BitSequence *)hash[2], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
                          (const BitSequence *)hash[2], 512 );
   }
   if ( hash[3][0] & 1 )
   {
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[3], 64 );
      sph_gost512_close( &ctx.gost, (void*)hash[3] );
   }
   else
   {
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
                          (const BitSequence *)hash[3], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
                          (const BitSequence *)hash[3], 512 );
   }
   mm256_intrlv_4x64( vhash, hash[0], hash[1], hash[2], hash[3], 512 );
   skein512_4way_init( &ctx.skein );
 	skein512_4way( &ctx.skein, vhash, 64 );
 	skein512_4way_close( &ctx.skein, vhash );
   for (int i=0; i<4; i++)
   {
      ( (uint64_t*)vhash    )[i] ^= ( (uint64_t*)vhash    )[i+4];
      ( (uint64_t*)vhash+ 8 )[i] ^= ( (uint64_t*)vhash+ 8 )[i+4];
      ( (uint64_t*)vhash+16 )[i] ^= ( (uint64_t*)vhash+16 )[i+4];
      ( (uint64_t*)vhash+24 )[i] ^= ( (uint64_t*)vhash+24 )[i+4];
   }
 //   for ( int i = 0; i < 4; i++ )
 //      casti_m256i( vhash, i ) = _mm256_xor_si256( casti_m256i( vhash, i   ),
 //                                                  casti_m256i( vhash, i+4 ) );
 	memcpy( state, vhash, 128 );
 }
 int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce,
 	                     uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(128) hash[8];
   uint32_t _ALIGN(128) edata[36];
   uint32_t vdata[4][36] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[25]);
   uint32_t lane_hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   if(opt_benchmark){
   	ptarget[7] = 0x00ff;
   }
 // Data is not interleaved, but hash is.
 // any non-zero data at index 20 or above sets roots true.
 // Split up the operations, bswap first, then set roots.
   phi2_has_roots = false;
   for ( int i=0; i < 36; i++ )
   {
   be32enc(&edata[i], pdata[i]);
   if (i >= 20 && pdata[i]) phi2_has_roots = true;
   }
 /*
   casti_m256i( vdata[0], 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );   
   casti_m256i( vdata[0], 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m256i( vdata[0], 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) );
   casti_m256i( vdata[0], 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) );
   casti_m128i( vdata[0], 8 ) = mm128_bswap_32( casti_m128i( pdata, 8 ) );
   phi2_has_roots = mm128_anybits1( casti_m128i( vdata[0], 5 ) ) ||
                    mm128_anybits1( casti_m128i( vdata[0], 6 ) ) ||
                    mm128_anybits1( casti_m128i( vdata[0], 7 ) ) ||
                    mm128_anybits1( casti_m128i( vdata[0], 8 ) );
 */   
   memcpy( vdata[0], edata, 144 );
   memcpy( vdata[1], edata, 144 );
   memcpy( vdata[2], edata, 144 );
   memcpy( vdata[3], edata, 144 );
   do {
      be32enc( &vdata[0][19], n );
      be32enc( &vdata[1][19], n+1 );
      be32enc( &vdata[2][19], n+2 );
      be32enc( &vdata[3][19], n+3 );
      phi2_hash_4way( hash, vdata );
      for ( int lane = 0; lane < 4; lane++ ) if (  hash7[ lane<<1 ] < Htarg )
      {
          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          {
              pdata[19] = n + lane;
              submit_solution( work, lane_hash, mythr, lane );
          }
       }
       n += 4;
    } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
    *hashes_done = n - first_nonce + 1;
    return 0;
 }
 #endif  // PHI2_4WAY
--- a/algo/lyra2/phi2.c
+++ b/algo/lyra2/phi2.c
@@ -50,11 +50,11 @@ void phi2_hash(void *state, const void *input)
 	unsigned char _ALIGN(128) hashA[64];
 	unsigned char _ALIGN(128) hashB[64];
-        phi2_ctx_holder ctx __attribute__ ((aligned (64)));
+  phi2_ctx_holder ctx __attribute__ ((aligned (64)));
-        memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
+  memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
-        cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
+  cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
-		              phi2_has_roots ? 144 : 80 );
+                        phi2_has_roots ? 144 : 80 );
 	LYRA2RE( &hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8 );
 	LYRA2RE( &hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8 );
@@ -63,17 +63,17 @@ void phi2_hash(void *state, const void *input)
 	sph_jh512_close( &ctx.jh, (void*)hash );
 	if ( hash[0] & 1 )
-       	{
+  	{
-           sph_gost512( &ctx.gost, (const void*)hash, 64 );
+      sph_gost512( &ctx.gost, (const void*)hash, 64 );
 	   sph_gost512_close( &ctx.gost, (void*)hash );
 	}
-       	else
+  	else
-       	{
+  	{
 #if defined(__AES__)
-           update_final_echo ( &ctx.echo1, (BitSequence *)hash,
+      update_final_echo ( &ctx.echo1, (BitSequence *)hash,
-                               (const BitSequence *)hash, 512 );
+                          (const BitSequence *)hash, 512 );
-           update_final_echo ( &ctx.echo2, (BitSequence *)hash,
+      update_final_echo ( &ctx.echo2, (BitSequence *)hash,
-                               (const BitSequence *)hash, 512 );
+                          (const BitSequence *)hash, 512 );
 #else
 	   sph_echo512( &ctx.echo1, (const void*)hash, 64 );
 	   sph_echo512_close( &ctx.echo1, (void*)hash );
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -51,7 +51,7 @@ inline void initState( uint64_t State[/*16*/] )
  state[3] = _mm256_set_epi64x( blake2b_IV[7], blake2b_IV[6],
                                blake2b_IV[5], blake2b_IV[4] );
-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)
  __m128i* state = (__m128i*)State;
@@ -137,7 +137,7 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
    //Squeezes remaining bytes
    memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)
    const int len_m128i = len / 16;
    const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
@@ -205,7 +205,7 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
    _mm256_store_si256( (__m256i*)State + 2, state2 );
    _mm256_store_si256( (__m256i*)State + 3, state3 );
-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)
    __m128i* state = (__m128i*)State;
    __m128i* in    = (__m128i*)In;
@@ -273,7 +273,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
    _mm256_store_si256( (__m256i*)State + 2, state2 );
    _mm256_store_si256( (__m256i*)State + 3, state3 );
-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)
    __m128i* state = (__m128i*)State;
    __m128i* in    = (__m128i*)In;
@@ -355,7 +355,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
    _mm256_store_si256( (__m256i*)State + 2, state2 );
    _mm256_store_si256( (__m256i*)State + 3, state3 );
-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)
    __m128i* state = (__m128i*)State;
    __m128i  state0 = _mm_load_si128(  state    );
@@ -494,7 +494,7 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
    _mm256_store_si256( (__m256i*)State + 2, state2 );
    _mm256_store_si256( (__m256i*)State + 3, state3 );
-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)
    __m128i* state = (__m128i*)State;
    __m128i  state0 = _mm_load_si128(  state    );
@@ -694,7 +694,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
    _mm256_store_si256( (__m256i*)State + 2, state2 );
    _mm256_store_si256( (__m256i*)State + 3, state3 );
-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)
    __m128i* in    = (__m128i*)rowIn;
    __m128i* inout = (__m128i*)rowInOut;
@@ -713,9 +713,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
    __m128i* state = (__m128i*)State;
    // For the last round in this function not optimized for AVX
-    uint64_t* ptrWordIn = rowIn;        //In Lyra2: pointer to prev
+//    uint64_t* ptrWordIn = rowIn;        //In Lyra2: pointer to prev
-    uint64_t* ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
+//    uint64_t* ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
-    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+//    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
    for ( i = 0; i < nCols; i++ )
    {
@@ -750,6 +750,28 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
        out[4] = _mm_xor_si128( state[4], in[4] );
        out[5] = _mm_xor_si128( state[5], in[5] );
       __m128i t0, t1;
       t0 = _mm_srli_si128( state[0], 8 );
       t1 = _mm_srli_si128( state[1], 8 );
       inout[0] = _mm_xor_si128( inout[0],
                              _mm_or_si128( _mm_slli_si128( state[0], 8 ),
                                            _mm_srli_si128( state[5], 8 ) ) );
       inout[1] = _mm_xor_si128( inout[1],
                        _mm_or_si128( _mm_slli_si128( state[1], 8 ), t0 ) );
       t0 = _mm_srli_si128( state[2], 8 );
       inout[2] = _mm_xor_si128( inout[2],
                        _mm_or_si128( _mm_slli_si128( state[2], 8 ), t1 ) );
       t1 = _mm_srli_si128( state[3], 8 );
       inout[3] = _mm_xor_si128( inout[3],
                        _mm_or_si128( _mm_slli_si128( state[3], 8 ), t0 ) );
       t0 = _mm_srli_si128( state[4], 8 );
       inout[4] = _mm_xor_si128( inout[4],
                        _mm_or_si128( _mm_slli_si128( state[4], 8 ), t1 ) );
       inout[5] = _mm_xor_si128( inout[5],
                        _mm_or_si128( _mm_slli_si128( state[5], 8 ), t0 ) );
 /*
        ptrWordInOut[0]  ^= State[11];
        ptrWordInOut[1]  ^= State[0];
        ptrWordInOut[2]  ^= State[1];
@@ -768,7 +790,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
        ptrWordIn += BLOCK_LEN_INT64;
        //Output: goes to previous column
        ptrWordOut -= BLOCK_LEN_INT64;
-
+*/
        inout += BLOCK_LEN_M128I;
        in    += BLOCK_LEN_M128I;
        out   -= BLOCK_LEN_M128I;
@@ -930,7 +952,7 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
   _mm256_store_si256( (__m256i*)State + 2, state2 );
   _mm256_store_si256( (__m256i*)State + 3, state3 );
-#elif defined(__SSE4_2__)
+#elif defined (__SSE2__)
    __m128i* state = (__m128i*)State;
    __m128i* in    = (__m128i*)rowIn;
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -23,7 +23,7 @@
 #define SPONGE_H_
 #include <stdint.h>
-#include "avxdefs.h"
+#include "simd-utils.h"
 #if defined(__GNUC__)
 #define ALIGN __attribute__ ((aligned(32)))
@@ -59,7 +59,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // returns void, updates all args
 #define G_4X64(a,b,c,d) \
   a = _mm256_add_epi64( a, b ); \
-   d = mm256_ror_64( _mm256_xor_si256( d, a), 32 ); \
+   d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
   a = _mm256_add_epi64( a, b ); \
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -144,8 +144,8 @@ void init_m7m_ctx()
 #define NM7M 5
 #define SW_DIVS 5
 #define M7_MIDSTATE_LEN 76
-int scanhash_m7m_hash( int thr_id, struct work* work,
+int scanhash_m7m_hash( int thr_id, struct work* work, uint64_t max_nonce,
-                       uint64_t max_nonce, unsigned long *hashes_done )
+                       unsigned long *hashes_done, struct thr_info *mythr )
 {
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
@@ -154,6 +154,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
    uint32_t hash[8] __attribute__((aligned(64)));
    uint8_t bhash[7][64] __attribute__((aligned(64)));
    uint32_t n = pdata[19] - 1;
    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    uint32_t usw_, mpzscale;
    const uint32_t first_nonce = pdata[19];
    char data_str[161], hash_str[65], target_str[65];
--- a/algo/pluck.c
+++ b/algo/pluck.c
@@ -445,7 +445,7 @@ void pluck_hash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const i
 }
 int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce,
-        uint64_t *hashes_done  )
+        uint64_t *hashes_done, struct thr_info *mythr  )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -454,6 +454,8 @@ int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce,
 	const uint32_t first_nonce = pdata[19];
 	volatile uint8_t *restart = &(work_restart[thr_id].restart);
 	uint32_t n = first_nonce;
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0ffff;
--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -48,8 +48,8 @@ void anime_4way_hash( void *state, const void *input )
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
    const uint32_t mask = 8;
    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
    int i;
    anime_4way_ctx_holder ctx;
    memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
@@ -62,27 +62,44 @@ void anime_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
-    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                            (char*)hash0, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                            (char*)hash1, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                            (char*)hash2, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                            (char*)hash3, 512 );
    mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
-    skein512_4way( &ctx.skein, vhash, 64 );
+    if ( hash0[0] & mask )
-    skein512_4way_close( &ctx.skein, vhashB );
+    {
       update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                               (char*)hash0, 512 );
    }
    if ( hash1[0] & mask )
    {
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                               (char*)hash1, 512 );
    }
    if ( hash2[0] & mask )
    {
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                               (char*)hash2, 512 );
    }
    if ( hash3[0] & mask )
    {
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                               (char*)hash3, 512 );
    }
-    for ( i = 0; i < 8; i++ )
+    mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
    if ( mm256_anybits0( vh_mask ) )
    {
       skein512_4way( &ctx.skein, vhash, 64 );
       skein512_4way_close( &ctx.skein, vhashB );
    }
    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
    reinit_groestl( &ctx.groestl );
@@ -91,7 +108,8 @@ void anime_4way_hash( void *state, const void *input )
    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-    mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
    mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
    jh512_4way( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );
@@ -99,16 +117,20 @@ void anime_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
    if ( mm256_anybits1( vh_mask ) )
    {
       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
       blake512_4way_close( &ctx.blake, vhashA );
-
+    }
    if ( mm256_anybits0( vh_mask ) )
    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way( &ctx.bmw, vhash, 64 );
       bmw512_4way_close( &ctx.bmw, vhashB );
    }
-    for ( i = 0; i < 8; i++ )
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    keccak512_4way( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );
@@ -120,33 +142,35 @@ void anime_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
-    keccak512_4way_init( &ctx.keccak );
+    if ( mm256_anybits1( vh_mask ) )
-    keccak512_4way( &ctx.keccak, vhash, 64 );
+    {
-    keccak512_4way_close( &ctx.keccak, vhashA );
+       keccak512_4way_init( &ctx.keccak );
       keccak512_4way( &ctx.keccak, vhash, 64 );
       keccak512_4way_close( &ctx.keccak, vhashA );
    }
    if ( mm256_anybits0( vh_mask ) )
    {
       jh512_4way_init( &ctx.jh );
       jh512_4way( &ctx.jh, vhash, 64 );
       jh512_4way_close( &ctx.jh, vhashB );
    }
-    jh512_4way_init( &ctx.jh );
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
    jh512_4way( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhashB );
-    for ( i = 0; i < 8; i++ )
+    mm256_dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );
       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
 }
 int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__((aligned(64)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
-    uint32_t *nonces = work->nonces;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int num_found = 0;
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    uint32_t *noncep = vdata + 73;   // 9*8 + 1
    const uint32_t Htarg = ptarget[7];
    uint64_t htmax[] = {
                0,
@@ -165,10 +189,7 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
                0
        };
-    swab32_array( endiandata, pdata, 20 );
+    mm256_bswap_intrlv80_4x64( vdata, pdata );
    uint64_t *edata = (uint64_t*)endiandata;
    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
    for (int m=0; m < 6; m++)
       if (Htarg <= htmax[m])
@@ -177,30 +198,26 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
          do
          {
-             be32enc( noncep,   n   );
+             *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-             be32enc( noncep+2, n+1 );
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
             be32enc( noncep+4, n+2 );
             be32enc( noncep+6, n+3 );
             anime_4way_hash( hash, vdata );
             pdata[19] = n;
             for ( int i = 0; i < 4; i++ )
             if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
-                && fulltest( hash+(i<<3), ptarget ) )
+                && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
             {
                pdata[19] = n+i;
-                nonces[ num_found++ ] = n+i;
+                submit_solution( work, hash+(i<<3), mythr, i );
                work_set_target_ratio( work, hash+(i<<3) );
             }
             n += 4;
-          } while ( ( num_found == 0 ) && ( n < max_nonce )
+          } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
              && !work_restart[thr_id].restart );
          break;
       }
    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }
 #endif
--- a/algo/quark/anime-gate.h
+++ b/algo/quark/anime-gate.h
@@ -13,19 +13,15 @@ bool register_anime_algo( algo_gate_t* gate );
 #if defined(ANIME_4WAY)
 void anime_4way_hash( void *state, const void *input );
 int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_anime_4way_ctx();
 #endif
 void anime_hash( void *state, const void *input );
 int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_anime_ctx();
 #endif
--- a/algo/quark/anime.c
+++ b/algo/quark/anime.c
@@ -46,20 +46,6 @@ void init_anime_ctx()
 void anime_hash( void *state, const void *input )
 {
    unsigned char hash[128] __attribute__ ((aligned (32)));
 /*
    uint64_t hash0[8] __attribute__ ((aligned (64)));
    uint64_t hash1[8] __attribute__ ((aligned (64)));
    uint64_t hash2[8] __attribute__ ((aligned (64)));
    uint64_t hash3[8] __attribute__ ((aligned (64)));
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
    __m256i* vh  = (__m256i*)vhash;
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
    __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
 */
    uint32_t mask = 8;
    anime_ctx_holder ctx;
    memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) );
@@ -134,7 +120,7 @@ void anime_hash( void *state, const void *input )
 }
 int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+                         uint64_t *hashes_done, struct thr_info *mythr)
 {
    uint32_t hash[8] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__((aligned(64)));
@@ -142,6 +128,7 @@ int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    const uint32_t Htarg = ptarget[7];
    uint64_t htmax[] = {
                0,
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -0,0 +1,618 @@
 #include "hmq1725-gate.h"
 #if defined(HMQ1725_4WAY)
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/haval-hash-4way.h"
 #include "algo/sha/sha2-hash-4way.h"
 union _hmq1725_4way_context_overlay
 {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
    jh512_4way_context      jh;
    keccak512_4way_context  keccak;
    hashState_luffa         luffa;
    cubehashParam           cube;
    sph_shavite512_context  shavite;
    hashState_sd            simd;
    hashState_echo          echo;
    hamsi512_4way_context   hamsi;
    sph_fugue512_context    fugue;
    shabal512_4way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
    haval256_5_4way_context haval;
 };
 typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
 extern void hmq1725_4way_hash(void *state, const void *input)
 {
 // why so big? only really need 8, haval thing uses 16.
     uint32_t hash0 [32]    __attribute__ ((aligned (64)));
     uint32_t hash1 [32]    __attribute__ ((aligned (64)));
     uint32_t hash2 [32]    __attribute__ ((aligned (64)));
     uint32_t hash3 [32]    __attribute__ ((aligned (64)));
     uint32_t vhash [32<<2] __attribute__ ((aligned (64)));
     uint32_t vhashA[32<<2] __attribute__ ((aligned (64)));
     uint32_t vhashB[32<<2] __attribute__ ((aligned (64)));
     hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
     __m256i vh_mask;     
     const __m256i vmask = _mm256_set1_epi64x( 24 );
     const uint32_t mask = 24;
     __m256i* vh  = (__m256i*)vhash;
     __m256i* vhA = (__m256i*)vhashA;
     __m256i* vhB = (__m256i*)vhashB;
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, input, 80 );
     bmw512_4way_close( &ctx.bmw, vhash );
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash0 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash1 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash2 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );
 // first fork, A is groestl serial, B is skein parallel.
     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                   m256_zero );
 // A
 //     if ( hash0[0] & mask )
 //     {
       init_groestl( &ctx.groestl, 64 );
       update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                               (char*)hash0, 512 );
 //     }
 //     if ( hash1[0] & mask )
 //     {
       init_groestl( &ctx.groestl, 64 );
       update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                               (char*)hash1, 512 );
 //     }
 //     if ( hash2[0] & mask )
 //     {
       init_groestl( &ctx.groestl, 64 );
       update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                               (char*)hash2, 512 );
 //     }
 //     if ( hash3[0] & mask )
 //     {
       init_groestl( &ctx.groestl, 64 );
       update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                               (char*)hash3, 512 );
 //     }
     mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
 // B
 //     if ( mm256_any_clr_256( vh_mask ) )
 //     {
       skein512_4way_init( &ctx.skein );
       skein512_4way( &ctx.skein, vhash, 64 );
       skein512_4way_close( &ctx.skein, vhashB );
 //     }
     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
     jh512_4way_init( &ctx.jh );
     jh512_4way( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );
     keccak512_4way_init( &ctx.keccak );
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
 // second fork, A = blake parallel, B= bmw parallel.
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                   m256_zero );
 //     if ( mm256_any_set_256( vh_mask ) )
 //     {
       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
       blake512_4way_close( &ctx.blake, vhashA );
 //     }
 //     if ( mm256_any_clr_256( vh_mask ) )
 //     {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way( &ctx.bmw, vhash, 64 );
       bmw512_4way_close( &ctx.bmw, vhashB );
 //     }
     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     init_luffa( &ctx.luffa, 512 );
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
                                   (const BitSequence*)hash0, 64 );
     init_luffa( &ctx.luffa, 512 );
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
                                   (const BitSequence*)hash1, 64 );
     init_luffa( &ctx.luffa, 512 );
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
                                   (const BitSequence*)hash2, 64 );
     init_luffa( &ctx.luffa, 512 );
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
                                   (const BitSequence*)hash3, 64 );
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash0,
                                (const BitSequence *)hash0, 64 );
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash1,
                                (const BitSequence *)hash1, 64 );
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash2,
                                (const BitSequence *)hash2, 64 );
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash3,
                                (const BitSequence *)hash3, 64 );
     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 // A= keccak parallel, B= jh parallel
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                  m256_zero );
 //     if ( mm256_any_set_256( vh_mask ) )
 //     {
        keccak512_4way_init( &ctx.keccak );
        keccak512_4way( &ctx.keccak, vhash, 64 );
        keccak512_4way_close( &ctx.keccak, vhashA );
 //     }
 //     if ( mm256_any_clr_256( vh_mask ) )
 //     {
        jh512_4way_init( &ctx.jh );
        jh512_4way( &ctx.jh, vhash, 64 );
        jh512_4way_close( &ctx.jh, vhashB );
 //     }
     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_shavite512_init( &ctx.shavite );
     sph_shavite512 ( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     sph_shavite512_init( &ctx.shavite );
     sph_shavite512 ( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     sph_shavite512_init( &ctx.shavite );
     sph_shavite512 ( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     sph_shavite512_init( &ctx.shavite );
     sph_shavite512 ( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     init_sd( &ctx.simd, 512 );
     update_final_sd( &ctx.simd, (BitSequence *)hash0,
                           (const BitSequence *)hash0, 512 );
     init_sd( &ctx.simd, 512 );
     update_final_sd( &ctx.simd, (BitSequence *)hash1,
                           (const BitSequence *)hash1, 512 );
     init_sd( &ctx.simd, 512 );
     update_final_sd( &ctx.simd, (BitSequence *)hash2,
                           (const BitSequence *)hash2, 512 );
     init_sd( &ctx.simd, 512 );
     update_final_sd( &ctx.simd, (BitSequence *)hash3,
                           (const BitSequence *)hash3, 512 );
 // A is whirlpool serial, B is haval parallel.
     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                   m256_zero );
     // A
 //     if ( hash0[0] & mask )
 //     {
        sph_whirlpool_init( &ctx.whirlpool );
        sph_whirlpool( &ctx.whirlpool, hash0, 64 );
        sph_whirlpool_close( &ctx.whirlpool, hash0 );
 //     }
 //     if ( hash1[0] & mask )
 //     {
        sph_whirlpool_init( &ctx.whirlpool );
        sph_whirlpool( &ctx.whirlpool, hash1, 64 );
        sph_whirlpool_close( &ctx.whirlpool, hash1 );
 //     }
 //     if ( hash2[0] & mask )
 //     {
        sph_whirlpool_init( &ctx.whirlpool );
        sph_whirlpool( &ctx.whirlpool, hash2, 64 );
        sph_whirlpool_close( &ctx.whirlpool, hash2 );
 //     }
 //     if ( hash3[0] & mask )
 //     {
        sph_whirlpool_init( &ctx.whirlpool );
        sph_whirlpool( &ctx.whirlpool, hash3, 64 );
        sph_whirlpool_close( &ctx.whirlpool, hash3 );
 //     }
     mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
 // B
 //     if ( mm256_any_clr_256( vh_mask ) )
 //     {
        haval256_5_4way_init( &ctx.haval );
        haval256_5_4way( &ctx.haval, vhash, 64 );
        haval256_5_4way_close( &ctx.haval, vhashB );
        memset( &vhashB[8<<2], 0, 32<<2);
 //     }
     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                             (const BitSequence *)hash0, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                             (const BitSequence *)hash1, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                             (const BitSequence *)hash2, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                             (const BitSequence *)hash3, 512 );
     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     blake512_4way_init( &ctx.blake );
     blake512_4way( &ctx.blake, vhash, 64 );
     blake512_4way_close( &ctx.blake, vhash );
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 // shavite & luffa, both serial, select individually.
   if ( hash0[0] & mask )
   {
      sph_shavite512_init( &ctx.shavite );
      sph_shavite512( &ctx.shavite, hash0, 64 ); //
      sph_shavite512_close( &ctx.shavite, hash0 ); //8
   }
   else
   {
      init_luffa( &ctx.luffa, 512 );
      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash0,
                                    (const BitSequence *)hash0, 64 );
   }
   if ( hash1[0] & mask )
   {
      sph_shavite512_init( &ctx.shavite );
      sph_shavite512( &ctx.shavite, hash1, 64 ); //
      sph_shavite512_close( &ctx.shavite, hash1 ); //8
   }
   else
   {
      init_luffa( &ctx.luffa, 512 );
      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash1,
                                    (const BitSequence *)hash1, 64 );
   }
   if ( hash2[0] & mask )
   {
      sph_shavite512_init( &ctx.shavite );
      sph_shavite512( &ctx.shavite, hash2, 64 ); //
      sph_shavite512_close( &ctx.shavite, hash2 ); //8
   }
   else
   {
      init_luffa( &ctx.luffa, 512 );
      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash2,
                                    (const BitSequence *)hash2, 64 );
   }
   if ( hash3[0] & mask )
   {
      sph_shavite512_init( &ctx.shavite );
      sph_shavite512( &ctx.shavite, hash3, 64 ); //
      sph_shavite512_close( &ctx.shavite, hash3 ); //8
   }
   else
   {
      init_luffa( &ctx.luffa, 512 );
      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash3,
                                    (const BitSequence *)hash3, 64 );
   }
   mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
   hamsi512_4way_init( &ctx.hamsi );
   hamsi512_4way( &ctx.hamsi, vhash, 64 );
   hamsi512_4way_close( &ctx.hamsi, vhash );
   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
   sph_fugue512_init( &ctx.fugue );
   sph_fugue512( &ctx.fugue, hash0, 64 );
   sph_fugue512_close( &ctx.fugue, hash0 );
   sph_fugue512_init( &ctx.fugue );
   sph_fugue512( &ctx.fugue, hash1, 64 );
   sph_fugue512_close( &ctx.fugue, hash1 );
   sph_fugue512_init( &ctx.fugue );
   sph_fugue512( &ctx.fugue, hash2, 64 );
   sph_fugue512_close( &ctx.fugue, hash2 );
   sph_fugue512_init( &ctx.fugue );
   sph_fugue512( &ctx.fugue, hash3, 64 );
   sph_fugue512_close( &ctx.fugue, hash3 );
 //  A echo, B sd both serial
   if ( hash0[0] & mask ) //4
   {
       init_echo( &ctx.echo, 512 );
       update_final_echo( &ctx.echo, (BitSequence *)hash0,
                               (const BitSequence *)hash0, 512 );
   }
   else
   {
       init_sd( &ctx.simd, 512 );
       update_final_sd( &ctx.simd, (BitSequence *)hash0,
                             (const BitSequence *)hash0, 512 );
   }
   if ( hash1[0] & mask ) //4
   {
       init_echo( &ctx.echo, 512 );
       update_final_echo( &ctx.echo, (BitSequence *)hash1,
                               (const BitSequence *)hash1, 512 );
   }
   else
   {
       init_sd( &ctx.simd, 512 );
       update_final_sd( &ctx.simd, (BitSequence *)hash1,
                             (const BitSequence *)hash1, 512 );
   }
   if ( hash2[0] & mask ) //4
   {
       init_echo( &ctx.echo, 512 );
       update_final_echo( &ctx.echo, (BitSequence *)hash2,
                               (const BitSequence *)hash2, 512 );
   }
   else
   {
       init_sd( &ctx.simd, 512 );
       update_final_sd( &ctx.simd, (BitSequence *)hash2,
                             (const BitSequence *)hash2, 512 );
   }
   if ( hash3[0] & mask ) //4
   {
       init_echo( &ctx.echo, 512 );
       update_final_echo( &ctx.echo, (BitSequence *)hash3,
                               (const BitSequence *)hash3, 512 );
   }
   else
   {
       init_sd( &ctx.simd, 512 );
       update_final_sd( &ctx.simd, (BitSequence *)hash3,
                             (const BitSequence *)hash3, 512 );
   }
   mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
   shabal512_4way_init( &ctx.shabal );
   shabal512_4way( &ctx.shabal, vhash, 64 );
   shabal512_4way_close( &ctx.shabal, vhash );
   mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
   sph_whirlpool_init( &ctx.whirlpool );
   sph_whirlpool( &ctx.whirlpool, hash0, 64 );
   sph_whirlpool_close( &ctx.whirlpool, hash0 );
   sph_whirlpool_init( &ctx.whirlpool );
   sph_whirlpool( &ctx.whirlpool, hash1, 64 );
   sph_whirlpool_close( &ctx.whirlpool, hash1 );
   sph_whirlpool_init( &ctx.whirlpool );
   sph_whirlpool( &ctx.whirlpool, hash2, 64 );
   sph_whirlpool_close( &ctx.whirlpool, hash2 );
   sph_whirlpool_init( &ctx.whirlpool );
   sph_whirlpool( &ctx.whirlpool, hash3, 64 );
   sph_whirlpool_close( &ctx.whirlpool, hash3 );
 // A = fugue serial, B = sha512 prarallel
   mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                 m256_zero );
 //   if ( hash0[0] & mask ) 
 //   {
      sph_fugue512_init( &ctx.fugue );
      sph_fugue512( &ctx.fugue, hash0, 64 );
      sph_fugue512_close( &ctx.fugue, hash0 );
 //   }
 //   if ( hash1[0] & mask ) 
 //   {
      sph_fugue512_init( &ctx.fugue );
      sph_fugue512( &ctx.fugue, hash1, 64 );
      sph_fugue512_close( &ctx.fugue, hash1 );
 //   }
 //   if ( hash2[0] & mask ) 
 //   {
      sph_fugue512_init( &ctx.fugue );
      sph_fugue512( &ctx.fugue, hash2, 64 );
      sph_fugue512_close( &ctx.fugue, hash2 );
 //   }
 //   if ( hash3[0] & mask ) 
 //   {
      sph_fugue512_init( &ctx.fugue );
      sph_fugue512( &ctx.fugue, hash3, 64 );
      sph_fugue512_close( &ctx.fugue, hash3 );
 //   }
   mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
 //   if ( mm256_any_clr_256( vh_mask ) )
 //   {
      sha512_4way_init( &ctx.sha512 );
      sha512_4way( &ctx.sha512, vhash, 64 );
      sha512_4way_close( &ctx.sha512, vhashB );
 //   }
   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
   init_groestl( &ctx.groestl, 64 );
   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
   init_groestl( &ctx.groestl, 64 );
   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
   init_groestl( &ctx.groestl, 64 );
   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
   init_groestl( &ctx.groestl, 64 );
   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
   mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
   sha512_4way_init( &ctx.sha512 ); 
   sha512_4way( &ctx.sha512, vhash, 64 );
   sha512_4way_close( &ctx.sha512, vhash ); 
 // A = haval parallel, B = Whirlpool serial
   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
                                 m256_zero );
   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 //   if ( mm256_any_set_256( vh_mask ) ) //4
 //   {
      haval256_5_4way_init( &ctx.haval );
      haval256_5_4way( &ctx.haval, vhash, 64 );
      haval256_5_4way_close( &ctx.haval, vhashA );
      memset( &vhashA[8<<2], 0, 32<<2 );
 //   }
 //   if ( !( hash0[0] & mask ) )
 //   {
      sph_whirlpool_init( &ctx.whirlpool );
      sph_whirlpool( &ctx.whirlpool, hash0, 64 );
      sph_whirlpool_close( &ctx.whirlpool, hash0 );
 //   }
 //   if ( !( hash2[0] & mask ) )
 //   {
      sph_whirlpool_init( &ctx.whirlpool );
      sph_whirlpool( &ctx.whirlpool, hash1, 64 );
      sph_whirlpool_close( &ctx.whirlpool, hash1 );
 //   }
 //   if ( !( hash2[0] & mask ) )
 //   {
      sph_whirlpool_init( &ctx.whirlpool );
      sph_whirlpool( &ctx.whirlpool, hash2, 64 );
      sph_whirlpool_close( &ctx.whirlpool, hash2 );
 //   }
 //   if ( !( hash3[0] & mask ) )
 //   {
      sph_whirlpool_init( &ctx.whirlpool );
      sph_whirlpool( &ctx.whirlpool, hash3, 64 );
      sph_whirlpool_close( &ctx.whirlpool, hash3 );
 //   }
   mm256_intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 );
   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
   bmw512_4way_init( &ctx.bmw );
   bmw512_4way( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );
 	memcpy(state, vhash, 32<<2 );
 }
 int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
 //   uint32_t *hash7 = &(hash[7<<2]);
 //   uint32_t lane_hash[8];
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19] - 1;
   const uint32_t first_nonce = pdata[19];
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   const uint32_t Htarg = ptarget[7];
   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000  };
   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                        0xFFFFF000, 0xFFFF0000,          0  };
   mm256_bswap_intrlv80_4x64( vdata, pdata );
   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
      uint32_t mask = masks[ m ];
      do
      {
         *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
         hmq1725_4way_hash( hash, vdata );
         for ( int i = 0; i < 4; i++ )
         if ( ( (hash+(i<<3))[7] & mask ) == 0 )
         {
            if ( fulltest( (hash+(i<<3)), ptarget ) && !opt_benchmark )
            {
               pdata[19] = n + i;
               submit_solution( work, (hash+(i<<3)), mythr, i );
            }
         }
 	      n += 4;
      } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );	
 	   break;
 	}
 	*hashes_done = n - first_nonce + 1;
 	return 0;
 }
 #endif // HMQ1725_4WAY
--- a/algo/quark/hmq1725-gate.c
+++ b/algo/quark/hmq1725-gate.c
@@ -0,0 +1,17 @@
 #include "hmq1725-gate.h"
 bool register_hmq1725_algo( algo_gate_t* gate )
 {
 #if defined(HMQ1725_4WAY)
  gate->scanhash  = (void*)&scanhash_hmq1725_4way;
  gate->hash      = (void*)&hmq1725_4way_hash;
 #else
  init_hmq1725_ctx();
  gate->scanhash  = (void*)&scanhash_hmq1725;
  gate->hash      = (void*)&hmq1725hash;
 #endif
  gate->set_target       = (void*)&scrypt_set_target;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  return true;
 };
--- a/algo/quark/hmq1725-gate.h
+++ b/algo/quark/hmq1725-gate.h
@@ -0,0 +1,28 @@
 #ifndef HMQ1725_GATE_H__
 #define HMQ1725_GATE_H__ 1
 #include "algo-gate-api.h"
 #include <stdint.h>
 #if defined(__AVX2__) && defined(__AES__)
 //  #define HMQ1725_4WAY
 #endif
 bool register_hmq1725_algo( algo_gate_t* gate );
 #if defined(HMQ1725_4WAY)
 void hmq1725_4way_hash( void *state, const void *input );
 int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
 #else
 void hmq1725hash( void *state, const void *input );
 int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
 void init_hmq1725_ctx();
 #endif
 #endif  // HMQ1725_GATE_H__
--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "hmq1725-gate.h"
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/sph_blake.h"
@@ -298,19 +298,22 @@ extern void hmq1725hash(void *state, const void *input)
 	memcpy(state, hashA, 32);
 }
-int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
+int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[32] __attribute__((aligned(64)));
+//        uint32_t endiandata[32] __attribute__((aligned(64)));
        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(64)));
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	//const uint32_t Htarg = ptarget[7];
 	//we need bigendian data...
-        for (int k = 0; k < 32; k++)
+//        for (int k = 0; k < 32; k++)
        for (int k = 0; k < 20; k++)
                be32enc(&endiandata[k], pdata[k]);
        hmq_bmw512_midstate( endiandata );
@@ -406,14 +409,14 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
-
+/*
 bool register_hmq1725_algo( algo_gate_t* gate )
 {
  init_hmq1725_ctx();
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->set_target       = (void*)&scrypt_set_target;
  gate->scanhash         = (void*)&scanhash_hmq1725;
  gate->hash             = (void*)&hmq1725hash;
  return true;
 };
-
+*/
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -48,9 +48,10 @@ void quark_4way_hash( void *state, const void *input )
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
    __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
    int i;
    quark_4way_ctx_holder ctx;
    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
    const uint32_t mask = 8;
    memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
    blake512_4way( &ctx.blake, input, 80 );
@@ -62,27 +63,44 @@ void quark_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
-       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    if ( hash0[0] & mask )
    {
       update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                               (char*)hash0, 512 );
    }
    if ( hash1[0] & mask )
    {
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                               (char*)hash1, 512 );
    }
    if ( hash2[0] & mask )
    {   
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                               (char*)hash2, 512 );
    }
    if ( hash3[0] & mask )
    {   
       reinit_groestl( &ctx.groestl );
       update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                               (char*)hash3, 512 );
-       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+    }
    mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
    if ( mm256_anybits0( vh_mask ) )   
    {
       skein512_4way( &ctx.skein, vhash, 64 );
       skein512_4way_close( &ctx.skein, vhashB );
    }
-    for ( i = 0; i < 8; i++ )
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
    mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
    reinit_groestl( &ctx.groestl );
@@ -91,7 +109,8 @@ void quark_4way_hash( void *state, const void *input )
    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-    mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
    mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
    jh512_4way( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );
@@ -99,16 +118,21 @@ void quark_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
    if ( mm256_anybits1( vh_mask ) )
    {
       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
       blake512_4way_close( &ctx.blake, vhashA );
    }
    if ( mm256_anybits0( vh_mask ) )
    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way( &ctx.bmw, vhash, 64 );
       bmw512_4way_close( &ctx.bmw, vhashB );
    }
-    for ( i = 0; i < 8; i++ )
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    keccak512_4way( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );
@@ -120,63 +144,65 @@ void quark_4way_hash( void *state, const void *input )
    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
                                  m256_zero );
    if ( mm256_anybits1( vh_mask ) )
    {
       keccak512_4way_init( &ctx.keccak );
       keccak512_4way( &ctx.keccak, vhash, 64 );
       keccak512_4way_close( &ctx.keccak, vhashA );
    }
    if ( mm256_anybits0( vh_mask ) )
    {
       jh512_4way_init( &ctx.jh );
       jh512_4way( &ctx.jh, vhash, 64 );
       jh512_4way_close( &ctx.jh, vhashB );
    }
-    for ( i = 0; i < 8; i++ )
+    // Final blend, directly to state, only need 32 bytes.
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    casti_m256i( state, 0 ) = _mm256_blendv_epi8( vhA[0], vhB[0], vh_mask );
-
+    casti_m256i( state, 1 ) = _mm256_blendv_epi8( vhA[1], vhB[1], vh_mask );
-    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
+    casti_m256i( state, 2 ) = _mm256_blendv_epi8( vhA[2], vhB[2], vh_mask );
    casti_m256i( state, 3 ) = _mm256_blendv_epi8( vhA[3], vhB[3], vh_mask );
 }
 int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-    uint32_t endiandata[20] __attribute__((aligned(64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
    uint32_t *hash7 = &(hash[25]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
-    uint32_t *nonces = work->nonces;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int num_found = 0;
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    uint32_t *noncep = vdata + 73;   // 9*8 + 1
    swab32_array( endiandata, pdata, 20 );
    uint64_t *edata = (uint64_t*)endiandata;
    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
    mm256_bswap_intrlv80_4x64( vdata, pdata );
    do
    {
-       be32enc( noncep,   n   );
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-       be32enc( noncep+2, n+1 );
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
       be32enc( noncep+4, n+2 );
       be32enc( noncep+6, n+3 );
       quark_4way_hash( hash, vdata );
       pdata[19] = n;
       for ( int i = 0; i < 4; i++ )
-       if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
+       if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
            && fulltest( hash+(i<<3), ptarget ) )
       {
-          pdata[19] = n+i;
+          mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
-          nonces[ num_found++ ] = n+i;
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark  )
-          work_set_target_ratio( work, hash+(i<<3) );
+          {
            pdata[19] = n+i;
            submit_solution( work, lane_hash, mythr, i );
          }
       }
       n += 4;
-    } while ( ( num_found == 0 ) && ( n < max_nonce )
+    } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
              && !work_restart[thr_id].restart );
    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }
 #endif
--- a/algo/quark/quark-gate.h
+++ b/algo/quark/quark-gate.h
@@ -13,19 +13,15 @@ bool register_quark_algo( algo_gate_t* gate );
 #if defined(QUARK_4WAY)
 void quark_4way_hash( void *state, const void *input );
 int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_quark_4way_ctx();
 #endif
 void quark_hash( void *state, const void *input );
 int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_quark_ctx();
 #endif
--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -173,16 +173,17 @@ void quark_hash(void *state, const void *input)
 }
 int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done)
+                    uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
-        uint32_t hash64[8] __attribute__((aligned(32)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
-        uint32_t *pdata = work->data;
+   uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
-        swab32_array( endiandata, pdata, 20 );
+   swab32_array( endiandata, pdata, 20 );
 	do {
 		pdata[19] = ++n;
--- a/algo/qubit/deep-2way.c
+++ b/algo/qubit/deep-2way.c
@@ -64,7 +64,7 @@ void deep_2way_hash( void *output, const void *input )
 }
 int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
-                         uint64_t *hashes_done )
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -73,17 +73,17 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
     int num_found = 0;
     uint32_t *noncep = vdata + 32+3;   // 4*8 + 3
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };
-     // big endian encode 0..18 uint32_t, 64 bits at a time
+     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
-     swab32_array( endiandata, pdata, 20 );
+     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
     casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
     uint64_t *edata = (uint64_t*)endiandata;
     mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -102,23 +102,24 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
            deep_2way_hash( hash, vdata );
            pdata[19] = n;
-            if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
+            if ( !( hash[7] & mask ) )
            if ( fulltest( hash, ptarget) && !opt_benchmark )
            {
-               nonces[ num_found++ ] = n;
+                pdata[19] = n;
-               work_set_target_ratio( work, hash );
+                submit_solution( work, hash, mythr, 0 );
            }
-            if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
+            if ( !( (hash+8)[7] & mask ) )
            if ( fulltest( hash+8, ptarget) && !opt_benchmark )
            {
-               nonces[ num_found++ ] = n+1;
+               pdata[19] = n+1;
-               work_set_target_ratio( work, hash+8 );
+               submit_solution( work, hash+8, mythr, 1 );
            }
            n += 2;
-         } while ( ( num_found == 0 ) && ( n < max_nonce )
+         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
                   && !work_restart[thr_id].restart );
         break;
     }
     *hashes_done = n - first_nonce + 1;
-     return num_found;
+     return 0;
 }
 #endif
--- a/algo/qubit/deep-gate.h
+++ b/algo/qubit/deep-gate.h
@@ -13,19 +13,15 @@ bool register_deep_algo( algo_gate_t* gate );
 #if defined(DEEP_2WAY)
 void deep_2way_hash( void *state, const void *input );
 int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_deep_2way_ctx();
 #endif
 void deep_hash( void *state, const void *input );
 int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_deep_ctx();
 #endif
--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -72,14 +72,15 @@ void deep_hash(void *output, const void *input)
 }
 int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done)
+                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
-        uint32_t hash64[8] __attribute__((aligned(32)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
-        uint32_t *pdata = work->data;
+   uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];
 	uint64_t htmax[] = { 0, 0xF, 0xFF,  0xFFF, 0xFFFF, 0x10000000 };
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -70,7 +70,7 @@ void qubit_2way_hash( void *output, const void *input )
 }
 int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
-                         uint64_t *hashes_done )
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -79,17 +79,17 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
     int num_found = 0;
     uint32_t *noncep = vdata + 32+3;   // 4*8 + 3
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };
-     // big endian encode 0..18 uint32_t, 64 bits at a time
+     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
-     swab32_array( endiandata, pdata, 20 );
+     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
     casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
     uint64_t *edata = (uint64_t*)endiandata;
     mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -107,25 +107,24 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
            qubit_2way_hash( hash, vdata );
            pdata[19] = n;
-
+            if ( !( hash[7] & mask ) )
-            if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
+            if ( fulltest( hash, ptarget) && !opt_benchmark )
            {
-               nonces[ num_found++ ] = n;
+                pdata[19] = n;
-               work_set_target_ratio( work, hash );
+                submit_solution( work, hash, mythr, 0 );
            }
-            if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
+            if ( !( (hash+8)[7] & mask ) )
            if ( fulltest( hash+8, ptarget) && !opt_benchmark )
            {
               pdata[19] = n+1;
-               nonces[ num_found++ ] = n+1;
+               submit_solution( work, hash+8, mythr, 1 );
               work_set_target_ratio( work, hash+8 );
            }
            n += 2;
-         } while ( ( num_found == 0 ) && ( n < max_nonce )
+         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
                   && !work_restart[thr_id].restart );
         break;
     }
     *hashes_done = n - first_nonce + 1;
-     return num_found;
+     return 0;
 }
 #endif
--- a/algo/qubit/qubit-gate.h
+++ b/algo/qubit/qubit-gate.h
@@ -13,19 +13,15 @@ bool register_qubit_algo( algo_gate_t* gate );
 #if defined(QUBIT_2WAY)
 void qubit_2way_hash( void *state, const void *input );
 int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_qubit_2way_ctx();
 #endif
 void qubit_hash( void *state, const void *input );
 int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_qubit_ctx();
 #endif
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -83,15 +83,16 @@ void qubit_hash(void *output, const void *input)
        memcpy(output, hash, 32);
 }
-int scanhash_qubit(int thr_id, struct work *work,
+int scanhash_qubit( int thr_id, struct work *work,	uint32_t max_nonce,
-		uint32_t max_nonce, uint64_t *hashes_done)
+                    uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
-        uint32_t hash64[8] __attribute__((aligned(64)));
+   uint32_t hash64[8] __attribute__((aligned(64)));
-        uint32_t *pdata = work->data;
+   uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];
 	uint64_t htmax[] = { 0, 0xF, 0xFF,  0xFFF, 0xFFFF, 0x10000000 };
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -40,9 +40,9 @@ void lbry_8way_hash( void* output, const void* input )
   sha256_8way_close( &ctx_sha256, vhashA );
   // reinterleave to do sha512 4-way 64 bit twice.
-   mm256_deinterleave_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
+   mm256_dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
-   mm256_interleave_4x64( vhashA, h0, h1, h2, h3, 256 );
+   mm256_intrlv_4x64( vhashA, h0, h1, h2, h3, 256 );
-   mm256_interleave_4x64( vhashB, h4, h5, h6, h7, 256 );
+   mm256_intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );
   sha512_4way_init( &ctx_sha512 );
   sha512_4way( &ctx_sha512, vhashA, 32 );
@@ -53,9 +53,9 @@ void lbry_8way_hash( void* output, const void* input )
   sha512_4way_close( &ctx_sha512, vhashB );
   // back to 8-way 32 bit
-   mm256_deinterleave_4x64( h0, h1, h2, h3, vhashA, 512 );
+   mm256_dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 );
-   mm256_deinterleave_4x64( h4, h5, h6, h7, vhashB, 512 );
+   mm256_dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 );
-   mm256_interleave_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
+   mm256_intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
   ripemd160_8way_init( &ctx_ripemd );
   ripemd160_8way( &ctx_ripemd, vhashA, 32 );
@@ -72,27 +72,24 @@ void lbry_8way_hash( void* output, const void* input )
   sha256_8way_init( &ctx_sha256 );
   sha256_8way( &ctx_sha256, vhashA, 32 );
-   sha256_8way_close( &ctx_sha256, vhashA );
+   sha256_8way_close( &ctx_sha256, output );
   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
                            output+128, output+160, output+192, output+224,
                            vhashA, 256 );
 }
 int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done)
+                        uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[32*8] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[27];
   const uint32_t first_nonce = pdata[27];
   const uint32_t Htarg = ptarget[7];
   uint32_t edata[32] __attribute__ ((aligned (64)));
-   uint32_t *nonces = work->nonces;
+   __m256i  *noncev = (__m256i*)vdata + 27;   // aligned
-   int num_found = 0;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   uint32_t *noncep = vdata + 216; // 27*8
   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000 };
@@ -100,9 +97,12 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
                        0xFFFFF000, 0xFFFF0000,          0 };
   // we need bigendian data...
-   swab32_array( edata, pdata, 32 );
+   casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
-   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+   casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-                                 edata, edata, edata, edata, 1024 );
+   casti_m256i( edata, 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) );
   casti_m256i( edata, 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) );
   mm256_intrlv_8x32( vdata, edata, edata, edata, edata,
                             edata, edata, edata, edata, 1024 );
   sha256_8way_init( &sha256_8w_mid );
   sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
@@ -111,136 +111,26 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
      uint32_t mask = masks[m];
      do
      {
-         be32enc( noncep,   n   );
+        *noncev = mm256_bswap_32( _mm256_set_epi32(
-         be32enc( noncep+1, n+1 );
+                                          n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
         be32enc( noncep+2, n+2 );
         be32enc( noncep+3, n+3 );
         be32enc( noncep+4, n+4 );
         be32enc( noncep+5, n+5 );
         be32enc( noncep+6, n+6 );
         be32enc( noncep+7, n+7 );
         lbry_8way_hash( hash, vdata );
-         for ( int i = 0; i < 8; i++ )
+         for ( int i = 0; i < 8; i++ )  if ( !( hash7[ i ] & mask ) )
         if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
         {
-            pdata[27] = n+i;
+            // deinterleave hash for lane
-            nonces[ num_found++ ] = n+i;
+            mm256_extract_lane_8x32( lane_hash, hash, i, 256 );
-            work_set_target_ratio( work, hash+(i<<3) );
+            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            {
              pdata[27] = n + i;
              submit_solution( work, lane_hash, mythr, i );
            }
         }
-         n+=8;
+         n += 8;
-      } while ( ( num_found == 0 ) && ( n < max_nonce )
+      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
                   && !work_restart[thr_id].restart );
      break;
   }
-
+   *hashes_done = n - first_nonce + 1;
-   *hashes_done = n - first_nonce;
+   return 0;
   return num_found;
 }
 #elif defined(LBRY_4WAY)
 static __thread sha256_4way_context sha256_mid;
 void lbry_4way_hash( void* output, const void* input )
 {
   sha256_4way_context     ctx_sha256 __attribute__ ((aligned (64)));
   sha512_4way_context     ctx_sha512;
   ripemd160_4way_context  ctx_ripemd;
   uint32_t _ALIGN(64) vhashA[16<<2];
   uint32_t _ALIGN(64) vhashB[16<<2];
   uint32_t _ALIGN(64) vhashC[16<<2];
   memcpy( &ctx_sha256, &sha256_mid, sizeof(ctx_sha256) );
   sha256_4way( &ctx_sha256, input + (LBRY_MIDSTATE<<2), LBRY_TAIL );
   sha256_4way_close( &ctx_sha256, vhashA );
   sha256_4way_init( &ctx_sha256 );
   sha256_4way( &ctx_sha256, vhashA, 32 );
   sha256_4way_close( &ctx_sha256, vhashA );
   // sha512 64 bit data, 64 byte output
   mm256_reinterleave_4x64( vhashB, vhashA, 256 );
   sha512_4way_init( &ctx_sha512 );
   sha512_4way( &ctx_sha512, vhashB, 32 );
   sha512_4way_close( &ctx_sha512, vhashB );
   mm256_reinterleave_4x32( vhashA, vhashB, 512 );
   ripemd160_4way_init( &ctx_ripemd );
   ripemd160_4way( &ctx_ripemd, vhashA, 32 );
   ripemd160_4way_close( &ctx_ripemd, vhashB );
   ripemd160_4way_init( &ctx_ripemd );
   ripemd160_4way( &ctx_ripemd, vhashA+(8<<2), 32 );
   ripemd160_4way_close( &ctx_ripemd, vhashC );
   sha256_4way_init( &ctx_sha256 );
   sha256_4way( &ctx_sha256, vhashB, 20 );
   sha256_4way( &ctx_sha256, vhashC, 20 );
   sha256_4way_close( &ctx_sha256, vhashA );
   sha256_4way_init( &ctx_sha256 );
   sha256_4way( &ctx_sha256, vhashA, 32 );
   sha256_4way_close( &ctx_sha256, vhashA );
   mm128_deinterleave_4x32( output, output+32, output+64, output+96,
 		            vhashA, 256 );
 }
 int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done)
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[32*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[27];
   const uint32_t first_nonce = pdata[27];
   const uint32_t Htarg = ptarget[7];
   uint32_t edata[32] __attribute__ ((aligned (64)));
   uint32_t *nonces = work->nonces;
   int num_found = 0;
   uint32_t *noncep = vdata + 108; // 27*4
   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000 };
   uint32_t masks[] = {	0xFFFFFFFF, 0xFFFFFFF0,	0xFFFFFF00,
                        0xFFFFF000, 0xFFFF0000,          0 };
   // we need bigendian data...
   swab32_array( edata, pdata, 32 );
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
   sha256_4way_init( &sha256_mid );
   sha256_4way( &sha256_mid, vdata, LBRY_MIDSTATE );
   for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] )
   {
      uint32_t mask = masks[m];
      do
      {
         be32enc( noncep,   n   );
         be32enc( noncep+1, n+1 );
         be32enc( noncep+2, n+2 );
         be32enc( noncep+3, n+3 );
         lbry_4way_hash( hash, vdata );
         for ( int i = 0; i < 4; i++ )
         if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
         {
            pdata[27] = n+i;
            nonces[ num_found++ ] = n+i;
            work_set_target_ratio( work, hash+(i<<3) );
         }
         n+=4;
      } while ( ( num_found == 0 ) && ( n < max_nonce )
                   && !work_restart[thr_id].restart );
      break;
   }
   *hashes_done = n - first_nonce;
   return num_found;
 }
 #endif
--- a/algo/ripemd/lbry-gate.h
+++ b/algo/ripemd/lbry-gate.h
@@ -4,12 +4,10 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
-// Overide multi way on ryzen, SHA is better.
+#if !defined(__SHA__)
-#if !defined(RYZEN_)
+ #if defined(__AVX2__)
 // need sha512 2 way AVX x2 or 1 way scalar x4 to support 4way AVX.
 #if defined(__AVX2__)
  #define LBRY_8WAY
-#endif
+ #endif
 #endif
 #define LBRY_NTIME_INDEX 25
@@ -24,17 +22,18 @@ bool register_lbry_algo( algo_gate_t* gate );
 void lbry_8way_hash( void *state, const void *input );
 int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
-
+/*
 #elif defined(LBRY_4WAY)
 void lbry_4way_hash( void *state, const void *input );
 int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done );
 */
 #else
 void lbry_hash( void *state, const void *input );
 int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 #endif
 #endif
--- a/algo/ripemd/lbry.c
+++ b/algo/ripemd/lbry.c
@@ -48,13 +48,14 @@ void lbry_hash(void* output, const void* input)
 }
 int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done)
+                   uint64_t *hashes_done, struct thr_info *mythr)
 {
  uint32_t *pdata = work->data;
  uint32_t *ptarget = work->target;
 	uint32_t n = pdata[27] - 1;
 	const uint32_t first_nonce = pdata[27];
 	const uint32_t Htarg = ptarget[7];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	uint32_t hash64[8] __attribute__((aligned(64)));
 	uint32_t endiandata[32] __attribute__ ((aligned (64)));
--- a/algo/ripemd/ripemd-hash-4way.h
+++ b/algo/ripemd/ripemd-hash-4way.h
@@ -6,7 +6,7 @@
 #if defined(__SSE4_2__)
-#include "avxdefs.h"
+#include "simd-utils.h"
 typedef struct
 {
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -696,7 +696,7 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
 #endif /* HAVE_SCRYPT_6WAY */
 extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
-                            uint64_t *hashes_done )
+                            uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -704,6 +704,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
 	uint32_t midstate[8];
 	uint32_t n = pdata[19] - 1;
 	const uint32_t Htarg = ptarget[7];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	int throughput = scrypt_best_throughput();
 	int i;
--- a/algo/sha/sha2-hash-4way.h
+++ b/algo/sha/sha2-hash-4way.h
@@ -42,7 +42,7 @@
 #include <stddef.h>
 #include "sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #if defined(__SSE2__)
 //#if defined(__SSE4_2__)
--- a/algo/sha/sha256_hash_11way.c
+++ b/algo/sha/sha256_hash_11way.c
@@ -1,3 +1,4 @@
 #if 0
 #include <stddef.h>
 #include <string.h>
@@ -65,7 +66,7 @@ static const uint32_t K256[64] =
   _mm_xor_si64( _mm_xor_si64( \
       mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) )
-#define BSG2_0z(x)  ( ror_32(x,2) ^ ror_32(x,13)  ^ ((x)>>22) )
+#define BSG2_0z(x)  ( u32_ror_32(x,2) ^ u32_ror_32(x,13)  ^ ((x)>>22) )
 #define BSG2_1x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
@@ -75,7 +76,7 @@ static const uint32_t K256[64] =
   _mm_xor_si64( _mm_xor_si64( \
       mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) )
-#define BSG2_1z(x)   ( ror_32(x,6) ^ ror_32(x,11) ^ ((x)>>25) )
+#define BSG2_1z(x)   ( u32_ror_32(x,6) ^ u32_ror_32(x,11) ^ ((x)>>25) )
 #define SSG2_0x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
@@ -85,7 +86,7 @@ static const uint32_t K256[64] =
   _mm_xor_si64( _mm_xor_si64( \
       mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )
-#define SSG2_0z(x)  (( ror_32(x,7) ^ ror_32(x,18) ) ^ ((x)>>3) )
+#define SSG2_0z(x)  (( u32_ror_32(x,7) ^ u32_ror_32(x,18) ) ^ ((x)>>3) )
 #define SSG2_1x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
@@ -95,7 +96,7 @@ static const uint32_t K256[64] =
   _mm_xor_si64( _mm_xor_si64( \
       mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )
-#define SSG2_1z(x)   ( ror_32(x,17) ^ ror_32(x,19)  ^ ((x)>>10) )
+#define SSG2_1z(x)   ( u32_ror_32(x,17) ^ u32_ror_32(x,19)  ^ ((x)>>10) )
 #define SHA2x_MEXP( a, b, c, d ) \
     _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
@@ -449,7 +450,7 @@ void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
      if ( clen > len )
         clen = len;
      memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 );
-      memcpy_64 ( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
+      memcpy_m64( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
      memcpy    ( ctx->bufz +  ptr,     idataz +  ptr,     clen    );
      ptr += clen;
      len -= clen;
@@ -486,19 +487,19 @@ void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
    if ( ptr > pad )
    {
         memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
-         memset_zero_64(  ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
+         memset_zero_m64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
         memset(      ctx->bufz + (ptr>>2), 0,  (buf_size - ptr) >> 2 );
         sha256_11way_round( ctx->bufx, ctx->valx,
 			     ctx->bufy, ctx->valy,
 			     ctx->bufz, ctx->valz );
         memset_zero_256( ctx->bufx, pad >> 2 );
-         memset_zero_64(  ctx->bufy, pad >> 2 );
+         memset_zero_m64(  ctx->bufy, pad >> 2 );
         memset(      ctx->bufz, 0,  pad >> 2 );
    }
    else
    {
        memset_zero_256( ctx->bufx + (ptr>>2),    (pad - ptr) >> 2 );
-        memset_zero_64(  ctx->bufy + (ptr>>2),    (pad - ptr) >> 2 );
+        memset_zero_m64(  ctx->bufy + (ptr>>2),    (pad - ptr) >> 2 );
        memset(          ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 );
    }
@@ -534,3 +535,4 @@ void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
 }
 #endif
 #endif   // 0
--- a/algo/sha/sha256q-4way.c
+++ b/algo/sha/sha256q-4way.c
@@ -36,7 +36,6 @@ int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
   uint32_t edata[20] __attribute__ ((aligned (32)));;
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
@@ -59,12 +58,7 @@ int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
                                        0 };
   // Need big endian data
-   casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   mm256_bswap_intrlv80_8x32( vdata, pdata );
   casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
                                 edata, edata, edata, edata, 640 );
   sha256_8way_init( &sha256_ctx8 );
   sha256_8way( &sha256_ctx8, vdata, 64 );
@@ -73,11 +67,10 @@ int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
      uint32_t mask = masks[m];
      do
      {
-        *noncev = mm256_bswap_32(
+         *noncev = mm256_bswap_32(
-		 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
+		            _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
 	 pdata[19] = n;
 	      pdata[19] = n;
         sha256q_8way_hash( hash, vdata );
         uint32_t *hash7 = &(hash[7<<3]); 
@@ -86,27 +79,19 @@ int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
         if ( !( hash7[ lane ] & mask ) )
         { 
            // deinterleave hash for lane
-	    uint32_t lane_hash[8];
+	         uint32_t lane_hash[8];
-	    mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
+	         mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
-	    if ( fulltest( lane_hash, ptarget ) )
+	         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            {
-	      pdata[19] = n + lane;
+	           pdata[19] = n + lane;
-              work_set_target_ratio( work, lane_hash );
+              submit_solution( work, lane_hash, mythr, lane );
-              if ( submit_work( mythr, work ) )
+            }
-                applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
+	      }
                             accepted_share_count + rejected_share_count + 1,
                             thr_id, lane );
              else
                applog( LOG_WARNING, "Failed to submit share." );
 	    }
 	 }
         n += 8;
      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
      break;
   }
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
@@ -146,7 +131,6 @@ int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t lane_hash[8];
   uint32_t edata[20] __attribute__ ((aligned (32)));;
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
@@ -168,13 +152,7 @@ int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
                               0xFFFF0000,
                                        0 };
-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   mm128_bswap_intrlv80_4x32( vdata, pdata );
   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   sha256_4way_init( &sha256_ctx4 );
   sha256_4way( &sha256_ctx4, vdata, 64 );
@@ -183,7 +161,7 @@ int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
      uint32_t mask = masks[m];
      do {
         *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
-	 pdata[19] = n;
+         pdata[19] = n;
         sha256q_4way_hash( hash, vdata );
@@ -192,25 +170,16 @@ int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
         {
            mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
-            if ( fulltest( lane_hash, ptarget ) )
+            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            {
              pdata[19] = n + lane;
-              work_set_target_ratio( work, lane_hash );
+              submit_solution( work, lane_hash, mythr, lane );
              if ( submit_work( mythr, work ) )
                applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
                             accepted_share_count + rejected_share_count + 1,
                             thr_id, lane );
              else
                applog( LOG_WARNING, "Failed to submit share." );
            }
         }
-
+         n += 4;
 	 n += 4;
      } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
      break;
   }
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -72,7 +72,7 @@ int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
   casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-   mm256_interleave_8x32( datax, dataz, dataz, dataz, dataz,
+   mm256_intrlv_8x32( datax, dataz, dataz, dataz, dataz,
                                 dataz, dataz, dataz, dataz, 640 );
   mm64_interleave_2x32( datay, dataz, dataz, 640 );
@@ -156,15 +156,15 @@ void sha256t_8way_hash( void* output, const void* input )
   sha256_8way_init( &ctx );
   sha256_8way( &ctx, vhash, 32 );
   sha256_8way_close( &ctx, output );
 }
 int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*8]  __attribute__ ((aligned (64)));
-   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t hash[8*8]    __attribute__ ((aligned (32)));
-   uint32_t edata[20] __attribute__ ((aligned (32)));;
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
@@ -187,12 +187,7 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
                                        0 };
   // Need big endian data
-   casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   mm256_bswap_intrlv80_8x32( vdata, pdata );
   casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
                                 edata, edata, edata, edata, 640 );
   sha256_8way_init( &sha256_ctx8 );
   sha256_8way( &sha256_ctx8, vdata, 64 );
@@ -201,29 +196,22 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
      uint32_t mask = masks[m];
      do
      {
-        *noncev = mm256_bswap_32(
+        *noncev = mm256_bswap_32( _mm256_set_epi32(
-                 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
+                                          n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
         pdata[19] = n;
         sha256t_8way_hash( hash, vdata );
         uint32_t *hash7 = &(hash[7<<3]);
         for ( int lane = 0; lane < 8; lane++ )
         if ( !( hash7[ lane ] & mask ) )
         {
            // deinterleave hash for lane
            uint32_t lane_hash[8] __attribute__ ((aligned (64)));
            mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
-
+            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            if ( fulltest( lane_hash, ptarget ) )
            {
              pdata[19] = n + lane;
              submit_solution( work, lane_hash, mythr, lane );
-	    }
+	         }
         }
         n += 8;
      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
      break;
   }
@@ -253,7 +241,6 @@ void sha256t_4way_hash( void* output, const void* input )
   sha256_4way_init( &ctx );
   sha256_4way( &ctx, vhash, 32 );
   sha256_4way_close( &ctx, output );
 }
 int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -262,7 +249,6 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (32)));;
   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -278,20 +264,14 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
                                   0xFFF,
                                  0xFFFF,
                              0x10000000 };
-   const uint32_t masks[] = {  0xFFFFFFFF,
+   const uint32_t masks[] = { 0xFFFFFFFF,
-                               0xFFFFFFF0,
+                              0xFFFFFFF0,
-                               0xFFFFFF00,
+                              0xFFFFFF00,
-                               0xFFFFF000,
+                              0xFFFFF000,
-                               0xFFFF0000,
+                              0xFFFF0000,
-                                        0 };
+                                       0 };
-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   mm128_bswap_intrlv80_4x32( vdata, pdata );
   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   sha256_4way_init( &sha256_ctx4 );
   sha256_4way( &sha256_ctx4, vdata, 64 );
@@ -300,7 +280,7 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
      uint32_t mask = masks[m];
      do {
         *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
-	 pdata[19] = n;
+         pdata[19] = n;
         sha256t_4way_hash( hash, vdata );
@@ -308,15 +288,13 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
         if ( !( hash7[ lane ] & mask ) )
         {
            mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
-
+            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            if ( fulltest( lane_hash, ptarget ) )
            {
              pdata[19] = n + lane;
              submit_solution( work, lane_hash, mythr, lane );
-	    }
+	         }
-         }
+         } 
-	 n += 4;
+         n += 4;
      } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
      break;
   }
--- a/algo/sha/sha256t-gate.c
+++ b/algo/sha/sha256t-gate.c
@@ -2,11 +2,7 @@
 bool register_sha256t_algo( algo_gate_t* gate )
 {
-#if defined(SHA256T_11WAY)
+#if defined(SHA256T_8WAY)
    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t_11way;
    gate->hash       = (void*)&sha256t_11way_hash;
 #elif defined(SHA256T_8WAY)
    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t_8way;
    gate->hash       = (void*)&sha256t_8way_hash;
@@ -25,11 +21,7 @@ gate->optimizations = SHA_OPT;
 bool register_sha256q_algo( algo_gate_t* gate )
 {
-#if defined(SHA256T_8WAY)
+#if defined(SHA256T_4WAY)
    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256q_8way;
    gate->hash       = (void*)&sha256q_8way_hash;
 #elif defined(SHA256T_4WAY)
    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256q_4way;
    gate->hash       = (void*)&sha256q_4way_hash;
--- a/algo/sha/sha256t-gate.h
+++ b/algo/sha/sha256t-gate.h
@@ -5,30 +5,17 @@
 #include "algo-gate-api.h"
 // Override multi way on ryzen, SHA is better.
-#if !defined(RYZEN_)
+#if !defined(__SHA__)
-#if defined(__SSE2__)
+ #if defined(__AVX2__)
  #define SHA256T_4WAY
 #endif
 #if defined(__AVX2__)
  #define SHA256T_8WAY
-//  #define SHA256T_11WAY
+#elif defined(__SSE2__)
-#endif
+  #define SHA256T_4WAY
 #endif
 #endif
 bool register_sha256t_algo( algo_gate_t* gate );
 bool register_sha256q_algo( algo_gate_t* gate );
 #if defined(SHA256T_11WAY)
 void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
 	                 const void *inpy, const void *inpz );
 int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );
 //void sha256q_8way_hash( void *output, const void *input );
 //int scanhash_sha256q_11way( int thr_id, struct work *work, uint32_t max_nonce,
 //                            uint64_t *hashes_done, struct thr_info *mythr );
 #endif
 #if defined(SHA256T_8WAY)
 void sha256t_8way_hash( void *output, const void *input );
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -40,7 +40,7 @@
 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #ifdef __cplusplus
 extern "C"{
--- a/algo/shavite/shavite-hash-2way.h
+++ b/algo/shavite/shavite-hash-2way.h
@@ -3,7 +3,7 @@
 #if defined(__AVX2__)
-#include "avxdefs.h"
+#include "simd-utils.h"
 typedef struct {
        unsigned char buf[128<<1];
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -36,7 +36,7 @@
 #ifdef __AES__
 #include "sph_shavite.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #ifdef __cplusplus
 extern "C"{
--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -6,6 +6,12 @@
 #if defined (__AVX2__)
 union _m256_v16 {
  uint16_t u16[16];
  __m256i v256;
 };
 typedef union _m256_v16 m256_v16;
 // imported from simd_iv.h
 uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,
--- a/algo/simd/simd-hash-2way.h
+++ b/algo/simd/simd-hash-2way.h
@@ -5,7 +5,7 @@
 #if defined(__AVX2__)
-#include "avxdefs.h"
+#include "simd-utils.h"
 typedef struct {
  uint32_t A[ 32*2 ] __attribute__((aligned(64)));
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -2,7 +2,11 @@
 #include <string.h>
 #include <stdint.h>
 #include "skein-hash-4way.h"
-#include "algo/sha/sha2-hash-4way.h"
+#if defined(__SHA__)
  #include <openssl/sha.h>
 #else
  #include "algo/sha/sha2-hash-4way.h"
 #endif
 #if defined (SKEIN_4WAY)
@@ -11,53 +15,69 @@ void skeinhash_4way( void *state, const void *input )
     uint64_t vhash64[8*4] __attribute__ ((aligned (64)));
     uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
     skein512_4way_context ctx_skein;
 #if defined(__SHA__)
     uint32_t hash0[16] __attribute__ ((aligned (64)));
     uint32_t hash1[16] __attribute__ ((aligned (64)));
     uint32_t hash2[16] __attribute__ ((aligned (64)));
     uint32_t hash3[16] __attribute__ ((aligned (64)));
     SHA256_CTX           ctx_sha256;
 #else
     sha256_4way_context ctx_sha256;
 #endif
     skein512_4way_init( &ctx_skein );
     skein512_4way( &ctx_skein, input, 80 );
     skein512_4way_close( &ctx_skein, vhash64 );
-     mm256_reinterleave_4x32( vhash32, vhash64, 512 );
+#if defined(__SHA__)      
     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 );
     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
     SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
     SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
     SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
     SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
     mm128_intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
 #else
     mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );
     sha256_4way_init( &ctx_sha256 );
     sha256_4way( &ctx_sha256, vhash32, 64 );
     sha256_4way_close( &ctx_sha256, state );
-
+#endif
     mm128_deinterleave_4x32( state, state+32, state+64, state+96,
 		              vhash32, 256 );
 }
 int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done )
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t hash[8*4] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8];
    uint32_t *hash7 = &(hash[7<<2]);
    uint32_t edata[20] __attribute__ ((aligned (64)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
-    // hash is returned deinterleaved
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    uint32_t *nonces = work->nonces;
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    int num_found = 0;
 // data is 80 bytes, 20 u32 or 4 u64.
    swab32_array( edata, pdata, 20 );
    mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
    uint32_t *noncep = vdata + 73;   // 9*8 + 1
   mm256_bswap_intrlv80_4x64( vdata, pdata );
   do
   {
-       be32enc( noncep,   n   );
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-       be32enc( noncep+2, n+1 );
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
       be32enc( noncep+4, n+2 );
       be32enc( noncep+6, n+3 );
       skeinhash_4way( hash, vdata );
@@ -68,16 +88,14 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
          if ( fulltest( lane_hash, ptarget ) )
          {
             pdata[19] = n + lane;
-             nonces[ num_found++ ] = n + lane;
+             submit_solution( work, lane_hash, mythr, lane );
             work_set_target_ratio( work, lane_hash );
          }
       }
       n += 4;
-    } while ( (num_found == 0) && (n < max_nonce)
+    } while ( (n < max_nonce) && !work_restart[thr_id].restart );
               && !work_restart[thr_id].restart );
    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }
 #endif
--- a/algo/skein/skein-gate.h
+++ b/algo/skein/skein-gate.h
@@ -3,24 +3,21 @@
 #include <stdint.h>
 #include "algo-gate-api.h"
 // Override multi way on ryzen, SHA is better.
 #if !defined(RYZEN_)
 #if defined(__AVX2__)
  #define SKEIN_4WAY
 #endif
 #endif
 #if defined(SKEIN_4WAY)
 void skeinhash_4way( void *output, const void *input );
 int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+                         uint64_t *hashes_done, struct thr_info *mythr );
 #endif
 void skeinhash( void *output, const void *input );
 int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+                    uint64_t *hashes_done, struct thr_info *mythr );
 #endif
--- a/algo/skein/skein-hash-4way.h
+++ b/algo/skein/skein-hash-4way.h
@@ -49,7 +49,7 @@ extern "C"{
 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 // Output size in bits
 #define SPH_SIZE_skein256   256
--- a/algo/skein/skein.c
+++ b/algo/skein/skein.c
@@ -21,8 +21,8 @@ void skeinhash(void *state, const void *input)
     memcpy(state, hash, 32);
 }
-int scanhash_skein(int thr_id, struct work *work,
+int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
-	uint32_t max_nonce, uint64_t *hashes_done)
+                    uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -31,7 +31,8 @@ int scanhash_skein(int thr_id, struct work *work,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t n = first_nonce;
-	
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
        swab32_array( endiandata, pdata, 20 );
 	do {
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -20,55 +20,43 @@ void skein2hash_4way( void *output, const void *input )
 }
 int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done )
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[8*4] __attribute__ ((aligned (64)));
    uint32_t *hash7 = &(hash[25]);
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__ ((aligned (64)));
    uint64_t *edata = (uint64_t*)endiandata;
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
-    uint32_t *nonces = work->nonces;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int num_found = 0;
+    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
    swab32_array( endiandata, pdata, 20 );
    mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
    uint32_t *noncep = vdata + 73;   // 9*8 + 1
    mm256_bswap_intrlv80_4x64( vdata, pdata );
    do 
    {
-       be32enc( noncep,   n   );
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-       be32enc( noncep+2, n+1 );
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
       be32enc( noncep+4, n+2 );
       be32enc( noncep+6, n+3 );
-       skein2hash( hash, vdata );
+       skein2hash_4way( hash, vdata );
       for ( int lane = 0; lane < 4; lane++ )
-       if ( hash7[ lane ] <= Htarg )
+       if ( hash7[ lane<<1 ] <= Htarg )
       {
          // deinterleave hash for lane
          uint32_t lane_hash[8];
          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) )
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          {
             pdata[19] = n + lane;
-             nonces[ num_found++ ] = n + lane;
+             submit_solution( work, lane_hash, mythr, lane );
             work_set_target_ratio( work, lane_hash );
          }
       }
       n += 4;
-    } while ( (num_found == 0) && (n < max_nonce)
+    } while ( (n < max_nonce) && !work_restart[thr_id].restart );
             &&  !work_restart[thr_id].restart );
    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }
 #endif
--- a/algo/skein/skein2-gate.c
+++ b/algo/skein/skein2-gate.c
@@ -10,10 +10,9 @@ int64_t skein2_get_max64 ()
 bool register_skein2_algo( algo_gate_t* gate )
 {
  gate->optimizations = AVX2_OPT;
-#if defined (FOUR_WAY) && defined (__AVX2__)
+#if defined (SKEIN2_4WAY)
  gate->scanhash  = (void*)&scanhash_skein2_4way;
  gate->hash      = (void*)&skein2hash_4way;
  four_way_not_tested();
 #else
  gate->scanhash  = (void*)&scanhash_skein2;
  gate->hash      = (void*)&skein2hash;
--- a/algo/skein/skein2-gate.h
+++ b/algo/skein/skein2-gate.h
@@ -10,11 +10,11 @@
 #if defined(SKEIN2_4WAY)
 void skein2hash_4way( void *output, const void *input );
 int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t* hashes_done );
+                          uint64_t* hashes_done, struct thr_info *mythr );
 #endif
 void skein2hash( void *output, const void *input );
 int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
+                     uint64_t *hashes_done, struct thr_info *mythr );
 #endif
--- a/algo/skein/skein2.c
+++ b/algo/skein/skein2.c
@@ -34,8 +34,8 @@ void skein2hash(void *output, const void *input)
 }
-int scanhash_skein2(int thr_id, struct work *work,
+int scanhash_skein2( int thr_id, struct work *work,	uint32_t max_nonce,
-	uint32_t max_nonce, uint64_t *hashes_done)
+                     uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -44,6 +44,7 @@ int scanhash_skein2(int thr_id, struct work *work,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t n = first_nonce;
    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
        swab32_array( endiandata, pdata, 20 );
--- a/algo/sm3/sm3-hash-4way.h
+++ b/algo/sm3/sm3-hash-4way.h
@@ -59,7 +59,7 @@
 #include <sys/types.h>
 #include <stdint.h>
 #include <string.h>
-#include "avxdefs.h"
+#include "simd-utils.h"
 #ifdef __cplusplus
 extern "C" {
--- a/algo/whirlpool/whirlpool-hash-4way.h
+++ b/algo/whirlpool/whirlpool-hash-4way.h
@@ -52,7 +52,7 @@
 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 /**
 * Output size (in bits) for WHIRLPOOL.
--- a/algo/x14/axiom.c
+++ b/algo/x14/axiom.c
@@ -44,12 +44,13 @@ void axiomhash(void *output, const void *input)
 }
 int scanhash_axiom(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
 	uint32_t _ALIGN(64) hash64[8];
 	uint32_t _ALIGN(64) endiandata[20];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/x14/polytimos-4way.c
+++ b/algo/x14/polytimos-4way.c
@@ -13,26 +13,16 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/echo/aes_ni/hash_api.h"
-typedef struct {
+union _poly_4way_context_overlay
 {
   skein512_4way_context   skein;
   shabal512_4way_context  shabal;
   hashState_echo          echo;
   luffa_2way_context      luffa;
   sph_fugue512_context    fugue;
   sph_gost512_context     gost;
-} poly_4way_ctx_holder;
+};
-
+typedef union _poly_4way_context_overlay poly_4way_context_overlay;
 poly_4way_ctx_holder poly_4way_ctx;
 void init_polytimos_4way_ctx()
 {
    skein512_4way_init( &poly_4way_ctx.skein );
    shabal512_4way_init( &poly_4way_ctx.shabal );
    init_echo( &poly_4way_ctx.echo, 512  );
    luffa_2way_init( &poly_4way_ctx.luffa, 512 );
    sph_fugue512_init( &poly_4way_ctx.fugue );
    sph_gost512_init( &poly_4way_ctx.gost );
 }
 void polytimos_4way_hash( void *output, const void *input )
 {
@@ -41,51 +31,57 @@ void polytimos_4way_hash( void *output, const void *input )
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-     poly_4way_ctx_holder ctx __attribute__ ((aligned (64)));
+     poly_4way_context_overlay ctx;
     memcpy( &ctx, &poly_4way_ctx, sizeof(poly_4way_ctx) );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, input, 80 );
     skein512_4way_close( &ctx.skein, vhash );
     // Need to convert from 64 bit interleaved to 32 bit interleaved.
     uint32_t vhash32[16*4];
-     mm256_reinterleave_4x32( vhash32, vhash, 512 );
+     mm256_rintrlv_4x64_4x32( vhash32, vhash, 512 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash32, 64 );
     shabal512_4way_close( &ctx.shabal, vhash32 );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo ( &ctx.echo, (BitSequence *)hash0,
                         (const BitSequence *)hash0, 512 );
-     memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
-     memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
-     memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
-     memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash1, 64 );
     sph_fugue512_close( &ctx.fugue, hash1 );
-     memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash2, 64 );
     sph_fugue512_close( &ctx.fugue, hash2 );
-     memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
     sph_gost512_init( &ctx.gost );
     sph_gost512( &ctx.gost, hash0, 64 );
     sph_gost512_close( &ctx.gost, hash0 );
     sph_gost512_init( &ctx.gost );
@@ -104,51 +100,43 @@ void polytimos_4way_hash( void *output, const void *input )
     memcpy( output+96, hash3, 32 );
 }
-int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,                              uint64_t *hashes_done )
+int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   int num_found = 0;
   uint32_t *noncep = vdata + 73;   // 9*8 + 1
   const uint32_t Htarg = ptarget[7];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   if ( opt_benchmark )
      ptarget[7] = 0x0cff;
-   for ( int i=0; i < 19; i++ )
+   mm256_bswap_intrlv80_4x64( vdata, pdata );
      be32enc( &endiandata[i], pdata[i] );
   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
   do {
-      be32enc( noncep,   n   );
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-      be32enc( noncep+2, n+1 );
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
      be32enc( noncep+4, n+2 );
      be32enc( noncep+6, n+3 );
      polytimos_4way_hash(hash, vdata);
      pdata[19] = n;
-      for ( int i = 0; i < 4; i++ )
+      for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
         pdata[19] = n+i;
-         nonces[ num_found++ ] = n+i;
+         submit_solution( work, hash+(i<<3), mythr, i );
         work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
-   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart));
+   } while ( ( n < max_nonce-4 ) && !(*restart));
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }
 #endif
--- a/algo/x14/polytimos-gate.c
+++ b/algo/x14/polytimos-gate.c
@@ -4,7 +4,6 @@ bool register_polytimos_algo( algo_gate_t* gate )
 {
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
 #ifdef POLYTIMOS_4WAY
  init_polytimos_4way_ctx();
  gate->scanhash  = (void*)&scanhash_polytimos_4way;
  gate->hash      = (void*)&polytimos_4way_hash;
 #else
--- a/algo/x14/polytimos-gate.h
+++ b/algo/x14/polytimos-gate.h
@@ -13,19 +13,14 @@ bool register_polytimos_algo( algo_gate_t* gate );
 #if defined(POLYTIMOS_4WAY)
 void polytimos_4way_hash( void *state, const void *input );
 int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_polytimos_4way_ctx();
 #endif
 void polytimos_hash( void *state, const void *input );
 int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
+                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_polytimos_ctx();
 #endif
--- a/algo/x14/polytimos.c
+++ b/algo/x14/polytimos.c
@@ -76,13 +76,14 @@ void polytimos_hash(void *output, const void *input)
 	memcpy(output, hashA, 32);
 }
-int scanhash_polytimos(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(128) hash[8];
 	uint32_t _ALIGN(128) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
-
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
--- a/algo/x14/veltor-4way.c
+++ b/algo/x14/veltor-4way.c
@@ -40,7 +40,7 @@ void veltor_4way_hash( void *output, const void *input )
     skein512_4way( &ctx.skein, input, 80 );
     skein512_4way_close( &ctx.skein, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
@@ -54,10 +54,10 @@ void veltor_4way_hash( void *output, const void *input )
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_gost512( &ctx.gost, hash0, 64 );
     sph_gost512_close( &ctx.gost, hash0 );
@@ -78,7 +78,7 @@ void veltor_4way_hash( void *output, const void *input )
 }
 int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done )
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -91,6 +91,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t *nonces = work->nonces;
     int num_found = 0;
     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     volatile uint8_t *restart = &(work_restart[thr_id].restart);
     if ( opt_benchmark )
@@ -101,7 +102,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
     }
     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
     do
     {
         be32enc( noncep,   n   );
--- a/algo/x14/veltor-gate.h
+++ b/algo/x14/veltor-gate.h
@@ -15,7 +15,7 @@ bool register_veltor_algo( algo_gate_t* gate );
 void veltor_4way_hash( void *state, const void *input );
 int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_veltor_4way_ctx();
@@ -24,7 +24,7 @@ void init_veltor_4way_ctx();
 void veltor_hash( void *state, const void *input );
 int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
+                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_veltor_ctx();
--- a/algo/x14/veltor.c
+++ b/algo/x14/veltor.c
@@ -61,12 +61,14 @@ void veltor_hash(void *output, const void *input)
 	memcpy(output, hashB, 32);
 }
-int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(128) hash[8];
 	uint32_t _ALIGN(128) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -78,7 +78,7 @@ void x14_4way_hash( void *state, const void *input )
     bmw512_4way_close( &ctx.bmw, vhash );
     // Serial
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 3 Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -90,7 +90,7 @@ void x14_4way_hash( void *state, const void *input )
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     // Parallel 4way
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     // 4 Skein
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -105,16 +105,16 @@ void x14_4way_hash( void *state, const void *input )
     keccak512_4way_close( &ctx.keccak, vhash );
     // Serial
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 7 Luffa
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -142,13 +142,13 @@ void x14_4way_hash( void *state, const void *input )
     sph_shavite512_close( &ctx.shavite, hash3 );
     // 10 Simd
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
     // 11 Echo
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -164,10 +164,10 @@ void x14_4way_hash( void *state, const void *input )
                       (const BitSequence *) hash3, 512 );
     // 12 Hamsi parallel 4way 32 bit
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 13 Fugue serial
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -183,14 +183,14 @@ void x14_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );
     // 14 Shabal, parallel 32 bit
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, state );
 }
 int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done )
+                       uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -203,6 +203,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
     int num_found = 0;
     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
@@ -212,7 +213,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
     swab32_array( endiandata, pdata, 20 );
     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
--- a/algo/x14/x14-gate.h
+++ b/algo/x14/x14-gate.h
@@ -13,19 +13,15 @@ bool register_x14_algo( algo_gate_t* gate );
 #if defined(X14_4WAY)
 void x14_4way_hash( void *state, const void *input );
 int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x14_4way_ctx();
 #endif
 void x14hash( void *state, const void *input );
 int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
+                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x14_ctx();
 #endif
--- a/algo/x14/x14.c
+++ b/algo/x14/x14.c
@@ -180,8 +180,8 @@ void x14hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
-int scanhash_x14(int thr_id, struct work *work,
+int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
-				uint32_t max_nonce, uint64_t *hashes_done)
+                  uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(64)));
@@ -190,6 +190,7 @@ int scanhash_x14(int thr_id, struct work *work,
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	uint64_t htmax[] = {
 		0,
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -81,7 +81,7 @@ void x15_4way_hash( void *state, const void *input )
     bmw512_4way_close( &ctx.bmw, vhash );
     // Serial
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 3 Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -93,7 +93,7 @@ void x15_4way_hash( void *state, const void *input )
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     // Parallel 4way
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     // 4 Skein
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -108,16 +108,16 @@ void x15_4way_hash( void *state, const void *input )
     keccak512_4way_close( &ctx.keccak, vhash );
     // Serial to the end
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 7 Luffa
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -145,13 +145,13 @@ void x15_4way_hash( void *state, const void *input )
     sph_shavite512_close( &ctx.shavite, hash3 );
     // 10 Simd
-     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-     mm256_interleave_2x128( vhash, hash2, hash3, 512 );
+     mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
     // 11 Echo
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -167,10 +167,10 @@ void x15_4way_hash( void *state, const void *input )
                       (const BitSequence *) hash3, 512 );
     // 12 Hamsi parallel 4way 32 bit
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 13 Fugue
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -186,10 +186,10 @@ void x15_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );
     // 14 Shabal, parallel 32 bit
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
     // 15 Whirlpool
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -214,7 +214,7 @@ void x15_4way_hash( void *state, const void *input )
 }
 int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done )
+                       uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -227,6 +227,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
     int num_found = 0;
     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
@@ -236,7 +237,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
     swab32_array( endiandata, pdata, 20 );
     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
--- a/algo/x15/x15-gate.h
+++ b/algo/x15/x15-gate.h
@@ -13,19 +13,15 @@ bool register_x15_algo( algo_gate_t* gate );
 #if defined(X15_4WAY)
 void x15_4way_hash( void *state, const void *input );
 int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x15_4way_ctx();
 #endif
 void x15hash( void *state, const void *input );
 int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
+                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x15_ctx();
 #endif
--- a/algo/x15/x15.c
+++ b/algo/x15/x15.c
@@ -186,8 +186,8 @@ void x15hash(void *output, const void *input)
 	memcpy(output, hashB, 32);
 }
-int scanhash_x15(int thr_id, struct work *work,
+int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint32_t max_nonce, uint64_t *hashes_done)
+                  uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(64)));
@@ -196,6 +196,7 @@ int scanhash_x15(int thr_id, struct work *work,
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
 	uint64_t htmax[] = {
 		0,
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -32,8 +32,8 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-
+union _x16r_4way_context_overlay
-typedef struct {
+{
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_echo          echo;
@@ -50,16 +50,8 @@ typedef struct {
    shabal512_4way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
 } x16r_4way_ctx_holder;
 x16r_4way_ctx_holder x16r_4way_ctx __attribute__ ((aligned (64)));
 // Cube needs one full init so fast reinits can be done in the hash loop.
 void init_x16r_4way_ctx()
 {
   cubehashInit( &x16r_4way_ctx.cube, 512, 16, 32 );
 };
-
+typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
 void x16r_4way_hash( void* output, const void* input )
 {
@@ -68,14 +60,14 @@ void x16r_4way_hash( void* output, const void* input )
   uint32_t hash2[24] __attribute__ ((aligned (64)));
   uint32_t hash3[24] __attribute__ ((aligned (64)));
   uint32_t vhash[24*4] __attribute__ ((aligned (64)));
-   x16r_4way_ctx_holder ctx;
+   x16r_4way_context_overlay ctx;
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
   void *in2 = (void*) hash2;
   void *in3 = (void*) hash3;
   int size = 80;
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 );
+   mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
   if ( s_ntime == UINT32_MAX )
   {
@@ -104,11 +96,11 @@ void x16r_4way_hash( void* output, const void* input )
               blake512_4way( &ctx.blake, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               blake512_4way( &ctx.blake, vhash, size );
            }
            blake512_4way_close( &ctx.blake, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case BMW:
            bmw512_4way_init( &ctx.bmw );
@@ -116,11 +108,11 @@ void x16r_4way_hash( void* output, const void* input )
               bmw512_4way( &ctx.bmw, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               bmw512_4way( &ctx.bmw, vhash, size );
            }
            bmw512_4way_close( &ctx.bmw, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case GROESTL:
               init_groestl( &ctx.groestl, 64 );
@@ -142,11 +134,11 @@ void x16r_4way_hash( void* output, const void* input )
               skein512_4way( &ctx.skein, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               skein512_4way( &ctx.skein, vhash, size );
            }
            skein512_4way_close( &ctx.skein, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case JH:
            jh512_4way_init( &ctx.jh );
@@ -154,11 +146,11 @@ void x16r_4way_hash( void* output, const void* input )
               jh512_4way( &ctx.jh, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               jh512_4way( &ctx.jh, vhash, size );
            }
            jh512_4way_close( &ctx.jh, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case KECCAK:
            keccak512_4way_init( &ctx.keccak );
@@ -166,21 +158,21 @@ void x16r_4way_hash( void* output, const void* input )
               keccak512_4way( &ctx.keccak, input, size );
            else
            {
-               mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               keccak512_4way( &ctx.keccak, vhash, size );
            }
            keccak512_4way_close( &ctx.keccak, vhash );
-            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case LUFFA:
-            mm256_interleave_2x128( vhash, in0, in1, size<<3 );
+            mm256_intrlv_2x128( vhash, in0, in1, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
-            mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+            mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-            mm256_interleave_2x128( vhash, in2, in3, size<<3 );
+            mm256_intrlv_2x128( vhash, in2, in3, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
-            mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+            mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
         break;
         case CUBEHASH:
            cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -194,7 +186,7 @@ void x16r_4way_hash( void* output, const void* input )
                                  (const byte*)in2, size );
            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
-                                        (const byte*)in3, size );
+                                  (const byte*)in3, size );
         break;
         case SHAVITE:
            sph_shavite512_init( &ctx.shavite );
@@ -211,14 +203,14 @@ void x16r_4way_hash( void* output, const void* input )
            sph_shavite512_close( &ctx.shavite, hash3 );
         break;
         case SIMD:
-            mm256_interleave_2x128( vhash, in0, in1, size<<3 );
+            mm256_intrlv_2x128( vhash, in0, in1, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+            mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
-            mm256_interleave_2x128( vhash, in2, in3, size<<3 );
+            mm256_intrlv_2x128( vhash, in2, in3, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
+            mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
         break;
         case ECHO:
             init_echo( &ctx.echo, 512 );
@@ -235,11 +227,11 @@ void x16r_4way_hash( void* output, const void* input )
                                (const BitSequence*)in3, size<<3 );
         break;
         case HAMSI:
-             mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
             hamsi512_4way_init( &ctx.hamsi );
             hamsi512_4way( &ctx.hamsi, vhash, size );
             hamsi512_4way_close( &ctx.hamsi, vhash );
-             mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
@@ -256,11 +248,11 @@ void x16r_4way_hash( void* output, const void* input )
             sph_fugue512_close( &ctx.fugue, hash3 );
         break;
         case SHABAL:
-             mm128_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 );
+             mm128_intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
             shabal512_4way_init( &ctx.shabal );
             shabal512_4way( &ctx.shabal, vhash, size );
             shabal512_4way_close( &ctx.shabal, vhash );
-             mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+             mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
@@ -277,11 +269,11 @@ void x16r_4way_hash( void* output, const void* input )
             sph_whirlpool_close( &ctx.whirlpool, hash3 );
         break;
         case SHA_512:
-             mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
             sha512_4way_init( &ctx.sha512 );
             sha512_4way( &ctx.sha512, vhash, size );
             sha512_4way_close( &ctx.sha512, vhash );
-             mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
      }
      size = 64;
@@ -293,7 +285,7 @@ void x16r_4way_hash( void* output, const void* input )
 }
 int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done )
+                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[4*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -303,15 +295,14 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
-   int num_found = 0;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   uint32_t *noncep = vdata + 73;   // 9*8 + 1
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-   for ( int k=0; k < 19; k++ )
+   casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
-      be32enc( &endiandata[k], pdata[k] );
+   casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
 //   if ( s_ntime != pdata[17] )
   if ( s_ntime != endiandata[17] )
   {
      uint32_t ntime = swab32(pdata[17]);
@@ -325,30 +316,27 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
      ptarget[7] = 0x0cff;
   uint64_t *edata = (uint64_t*)endiandata;
-   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
   do
   {
-      be32enc( noncep,   n   );
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-      be32enc( noncep+2, n+1 );
+               _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
      be32enc( noncep+4, n+2 );
      be32enc( noncep+6, n+3 );
      x16r_4way_hash( hash, vdata );
      pdata[19] = n;
-      for ( int i = 0; i < 4; i++ )
+      for ( int i = 0; i < 4; i++ )  if ( (hash+(i<<3))[7] <= Htarg )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
         pdata[19] = n+i;
-         nonces[ num_found++ ] = n+i;
+         submit_solution( work, hash+(i<<3), mythr, i );
         work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
-   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
+   } while ( (  n < max_nonce ) && !(*restart) );
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }
 #endif
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -35,11 +35,9 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
 bool register_x16r_algo( algo_gate_t* gate )
 {
 #if defined (X16R_4WAY)
  init_x16r_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  init_x16r_ctx();
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
@@ -52,11 +50,9 @@ bool register_x16r_algo( algo_gate_t* gate )
 bool register_x16s_algo( algo_gate_t* gate )
 {
 #if defined (X16R_4WAY)
  init_x16r_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  init_x16r_ctx();
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -2,7 +2,7 @@
 #define X16R_GATE_H__ 1
 #include "algo-gate-api.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #include <stdint.h>
 #if defined(__AVX2__) && defined(__AES__)
@@ -39,20 +39,14 @@ bool register_x16s_algo( algo_gate_t* gate );
 #if defined(X16R_4WAY)
 void x16r_4way_hash( void *state, const void *input );
 int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done );
+                        uint64_t *hashes_done, struct thr_info *mythr );
 void init_x16r_4way_ctx();
 #endif
 void x16r_hash( void *state, const void *input );
 int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done );
+                   uint64_t *hashes_done, struct thr_info *mythr );
 void init_x16r_ctx();
 #endif
--- a/algo/x16/x16r.c
+++ b/algo/x16/x16r.c
@@ -33,7 +33,8 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-typedef struct {
+union _x16r_context_overlay
 {
 #if defined(__AES__)
        hashState_echo          echo;
        hashState_groestl       groestl;
@@ -55,19 +56,13 @@ typedef struct {
        sph_shabal512_context   shabal;
        sph_whirlpool_context   whirlpool;
        SHA512_CTX              sha512;
 } x16r_ctx_holder;
 x16r_ctx_holder x16r_ctx __attribute__ ((aligned (64)));
 void init_x16r_ctx()
 {
   cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
 };
 typedef union _x16r_context_overlay x16r_context_overlay;
 void x16r_hash( void* output, const void* input )
 {
   uint32_t _ALIGN(128) hash[16];
-   x16r_ctx_holder ctx;
+   x16r_context_overlay ctx;
   void *in = (void*) input;
   int size = 80;
@@ -126,7 +121,7 @@ void x16r_hash( void* output, const void* input )
                                    (const BitSequence*)in, size );
         break;
         case CUBEHASH:
-            memcpy( &ctx.cube, &x16r_ctx.cube, sizeof(cubehashParam) );
+            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*) hash,
                                  (const byte*)in, size );
         break;
@@ -184,7 +179,7 @@ void x16r_hash( void* output, const void* input )
 }
 int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done )
+                   uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(128) hash32[8];
   uint32_t _ALIGN(128) endiandata[20];
@@ -192,16 +187,16 @@ int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-   for ( int k=0; k < 19; k++ )
+   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-      be32enc( &endiandata[k], pdata[k] );
+   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
 // This code is suspicious. s_ntime is saved after byteswapping pdata[17]
 // but is tested vs unswapped pdata[17]. This should result in calling
 // getAlgoString every pass, but that doesn't seem to be the case.
 // It appears to be working correctly as is.
   if ( s_ntime != pdata[17] )
   {
      uint32_t ntime = swab32(pdata[17]);
--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
@@ -47,30 +47,6 @@ union _sonoa_4way_context_overlay
 };
 typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
 /*
 sonoa_4way_ctx_holder sonoa_4way_ctx __attribute__ ((aligned (64)));
 void init_sonoa_4way_ctx()
 {
     blake512_4way_init( &sonoa_4way_ctx.blake );
     bmw512_4way_init( &sonoa_4way_ctx.bmw );
     init_groestl( &sonoa_4way_ctx.groestl, 64 );
     skein512_4way_init( &sonoa_4way_ctx.skein );
     jh512_4way_init( &sonoa_4way_ctx.jh );
     keccak512_4way_init( &sonoa_4way_ctx.keccak );
     luffa_2way_init( &sonoa_4way_ctx.luffa, 512 );
     cube_2way_init( &sonoa_4way_ctx.cube, 512, 16, 32 );
     shavite512_2way_init( &sonoa_4way_ctx.shavite );
     simd_2way_init( &sonoa_4way_ctx.simd, 512 );
     init_echo( &sonoa_4way_ctx.echo, 512 );
     hamsi512_4way_init( &sonoa_4way_ctx.hamsi );
     sph_fugue512_init( &sonoa_4way_ctx.fugue );
     shabal512_4way_init( &sonoa_4way_ctx.shabal );
     sph_whirlpool_init( &sonoa_4way_ctx.whirlpool );
     sha512_4way_init( &sonoa_4way_ctx.sha512 );
     haval256_5_4way_init( &sonoa_4way_ctx.haval );
 };
 */
 void sonoa_4way_hash( void *state, const void *input )
 {
@@ -82,8 +58,6 @@ void sonoa_4way_hash( void *state, const void *input )
     uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
     sonoa_4way_context_overlay ctx;
 //     sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64)));
 //        memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) );
 // 1
@@ -95,7 +69,7 @@ void sonoa_4way_hash( void *state, const void *input )
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -106,7 +80,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -120,7 +94,7 @@ void sonoa_4way_hash( void *state, const void *input )
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
-     mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
+     mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -142,8 +116,8 @@ void sonoa_4way_hash( void *state, const void *input )
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -160,13 +134,13 @@ void sonoa_4way_hash( void *state, const void *input )
 // 2
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -177,7 +151,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -191,7 +165,7 @@ void sonoa_4way_hash( void *state, const void *input )
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
-     mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
+     mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -213,8 +187,8 @@ void sonoa_4way_hash( void *state, const void *input )
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -229,7 +203,7 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
@@ -241,7 +215,7 @@ void sonoa_4way_hash( void *state, const void *input )
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -252,7 +226,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -266,7 +240,7 @@ void sonoa_4way_hash( void *state, const void *input )
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
-     mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
+     mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -288,8 +262,8 @@ void sonoa_4way_hash( void *state, const void *input )
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -304,13 +278,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -326,13 +300,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );
 // 4
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -343,7 +317,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -357,7 +331,7 @@ void sonoa_4way_hash( void *state, const void *input )
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
-     mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
+     mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -379,8 +353,8 @@ void sonoa_4way_hash( void *state, const void *input )
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -395,13 +369,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -416,19 +390,19 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm256_reinterleave_4x32_4x64( vhashB, vhash, 512 ); 
+     mm256_rintrlv_4x32_4x64( vhashB, vhash, 512 ); 
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhashB, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -443,8 +417,8 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     mm256_interleave_2x128( vhashA, hash0, hash1, 512 );
+     mm256_intrlv_2x128( vhashA, hash0, hash1, 512 );
-     mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
+     mm256_intrlv_2x128( vhashB, hash2, hash3, 512 );
     shavite512_2way_init( &ctx.shavite );
     shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
@@ -452,19 +426,19 @@ void sonoa_4way_hash( void *state, const void *input )
     shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
 // 5
-     mm256_reinterleave_2x128_4x64( vhash, vhashA, vhashB, 512 );
+     mm256_rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     mm256_reinterleave_4x64_4x32( vhashB, vhash,  512 );
+     mm256_rintrlv_4x64_4x32( vhashB, vhash,  512 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhashB, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -475,7 +449,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -489,7 +463,7 @@ void sonoa_4way_hash( void *state, const void *input )
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
-     mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
+     mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -511,8 +485,8 @@ void sonoa_4way_hash( void *state, const void *input )
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -527,13 +501,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -548,13 +522,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -571,13 +545,13 @@ void sonoa_4way_hash( void *state, const void *input )
 // 6
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -588,7 +562,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -602,7 +576,7 @@ void sonoa_4way_hash( void *state, const void *input )
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
-     mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
+     mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -624,8 +598,8 @@ void sonoa_4way_hash( void *state, const void *input )
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -640,13 +614,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -661,13 +635,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -682,13 +656,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     sha512_4way_init( &ctx.sha512 );
     sha512_4way( &ctx.sha512, vhash, 64 );
     sha512_4way_close( &ctx.sha512, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -705,13 +679,13 @@ void sonoa_4way_hash( void *state, const void *input )
 // 7
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -722,7 +696,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -736,7 +710,7 @@ void sonoa_4way_hash( void *state, const void *input )
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
-     mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
+     mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -758,8 +732,8 @@ void sonoa_4way_hash( void *state, const void *input )
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
-     mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
+     mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
-     mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
+     mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -774,13 +748,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -795,13 +769,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -816,13 +790,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );
-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     sha512_4way_init( &ctx.sha512 );
     sha512_4way( &ctx.sha512, vhash, 64 );
     sha512_4way_close( &ctx.sha512, vhash );
-     mm256_reinterleave_4x64_4x32( vhashB, vhash,  512 );
+     mm256_rintrlv_4x64_4x32( vhashB, vhash,  512 );
     haval256_5_4way_init( &ctx.haval );
     haval256_5_4way( &ctx.haval, vhashB, 64 );
@@ -836,7 +810,6 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t *hash7 = &(hash[7<<2]);
     uint32_t lane_hash[8];
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
@@ -850,19 +823,13 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          0xFFFFF000, 0xFFFF0000,          0  };
     // Need big endian data
-     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+     mm256_bswap_intrlv80_4x64( vdata, pdata );
     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
     casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
     uint64_t *edata = (uint64_t*)endiandata;
     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
     for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
     {
        uint32_t mask = masks[m];
        do
        {
-           *noncev = mm256_interleave_blend_32( mm256_bswap_32(
+           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                             _mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
                                                *noncev );
           sonoa_4way_hash( hash, vdata );
@@ -871,17 +838,10 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
           if ( ( ( hash7[ lane ] & mask ) == 0 ) )
           {
              mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
-              if ( fulltest( lane_hash, ptarget ) )
+              if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
              {
                 pdata[19] = n + lane;
-                 work_set_target_ratio( work, lane_hash );
+                 submit_solution( work, lane_hash, mythr, lane );
                 if ( submit_work( mythr, work ) )
                    applog( LOG_NOTICE,
                             "Share %d submitted by thread %d, lane %d.",
                             accepted_share_count + rejected_share_count + 1,
                             thr_id, lane );
                 else
                    applog( LOG_WARNING, "Failed to submit share." );
              }
           }
           n += 4;
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	d6e8d7a46e	v3.9.4	2019-06-18 13:15:45 -04:00
Jay D Dee	71d6b97ee8	v3.9.3.1	2019-06-13 21:15:58 -04:00
Jay D Dee	b2331375a3	v3.9.2.5	2019-06-13 11:20:27 -04:00