v3.9.6

2026-02-22 16:33:08 +00:00 · 2019-07-17 17:54:38 -04:00
parent e2d5762ef2
commit 6f49ba09b7
34 changed files with 1930 additions and 382 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -71,6 +71,9 @@ cpuminer_SOURCES = \
  algo/bmw/bmw256-hash-4way.c \
  algo/bmw/bmw512-hash-4way.c \
  algo/bmw/bmw256.c \
  algo/bmw/bmw512-gate.c \
  algo/bmw/bmw512.c \
  algo/bmw/bmw512-4way.c \
  algo/cryptonight/cryptolight.c \
  algo/cryptonight/cryptonight-common.c\
  algo/cryptonight/cryptonight-aesni.c\
@@ -238,6 +241,8 @@ cpuminer_SOURCES = \
  algo/x13/skunk-4way.c \
  algo/x13/skunk.c \
  algo/x13/drop.c \
  algo/x13/x13bcd-4way.c \
  algo/x13/x13bcd.c \
  algo/x14/x14-gate.c \
  algo/x14/x14.c \
  algo/x14/x14-4way.c \
@@ -254,6 +259,8 @@ cpuminer_SOURCES = \
  algo/x16/x16r-gate.c \
  algo/x16/x16r.c \
  algo/x16/x16r-4way.c \
  algo/x16/x16rt.c \
  algo/x16/x16rt-4way.c \
  algo/x17/x17-gate.c \
  algo/x17/x17.c \
  algo/x17/x17-4way.c \
--- a/README.md
+++ b/README.md
@@ -58,6 +58,7 @@ Supported Algorithms
                          blakecoin     blake256r8
                          blake2s       Blake-2 S
                          bmw           BMW 256
                          bmw512        BMW 512
                          c11           Chaincoin
                          decred
                          deep          Deepcoin (DCN)
@@ -113,11 +114,14 @@ Supported Algorithms
                          x11gost       sib (SibCoin)
                          x12           Galaxie Cash (GCH)
                          x13           X13
                          x13bcd        bcd
                          x13sm3        hsr (Hshare)
                          x14           X14
                          x15           X15
                          x16r          Ravencoin (RVN)
-                          x16s          pigeoncoin (PGN)
+                          x16rt         Gincoin (GIN)
                          x16rt-veil    Veil (VEIL)
                          x16s          Pigeoncoin (PGN)
                          x17
                          xevan         Bitsend (BSD)
                          yescrypt      Globalboost-Y (BSTY)
--- a/4
+++ b/4
@@ -38,6 +38,10 @@ supported.
 Change Log
 ----------
 v3.9.6
 New algos: bmw512, x16rt, x16rt-veil (alias veil), x13bcd (alias bcd).
 v3.9.5.4
 Fixed sha256q AVX2 poor performance.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -170,6 +170,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_BLAKECOIN:     register_blakecoin_algo     ( gate ); break;
 //    case ALGO_BLAKE2B:      register_blake2b_algo     ( gate ); break;
    case ALGO_BLAKE2S:       register_blake2s_algo       ( gate ); break;
    case ALGO_BMW512:        register_bmw512_algo        ( gate ); break;
    case ALGO_C11:           register_c11_algo           ( gate ); break;
    case ALGO_CRYPTOLIGHT:   register_cryptolight_algo   ( gate ); break;
    case ALGO_CRYPTONIGHT:   register_cryptonight_algo   ( gate ); break;
@@ -227,10 +228,13 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_X11GOST:       register_x11gost_algo       ( gate ); break;
    case ALGO_X12:           register_x12_algo           ( gate ); break;
    case ALGO_X13:           register_x13_algo           ( gate ); break;
    case ALGO_X13BCD:        register_x13bcd_algo        ( gate ); break;
    case ALGO_X13SM3:        register_x13sm3_algo        ( gate ); break;
    case ALGO_X14:           register_x14_algo           ( gate ); break;
    case ALGO_X15:           register_x15_algo           ( gate ); break;
    case ALGO_X16R:          register_x16r_algo          ( gate ); break;
    case ALGO_X16RT:         register_x16rt_algo         ( gate ); break;
    case ALGO_X16RT_VEIL:    register_x16rt_veil_algo    ( gate ); break;
    case ALGO_X16S:          register_x16s_algo          ( gate ); break;
    case ALGO_X17:           register_x17_algo           ( gate ); break;
    case ALGO_XEVAN:         register_xevan_algo         ( gate ); break;
@@ -327,7 +331,6 @@ const char* const algo_alias_map[][2] =
  { "lyra2",             "lyra2re"      },
  { "lyra2v2",           "lyra2rev2"    },
  { "lyra2v3",           "lyra2rev3"    },
  { "lyra2zoin",         "lyra2z330"    },
  { "myrgr",             "myr-gr"       },
  { "myriad",            "myr-gr"       },
  { "neo",               "neoscrypt"    },
@@ -335,11 +338,9 @@ const char* const algo_alias_map[][2] =
 //  { "sia",               "blake2b"      },
  { "sib",               "x11gost"      },
  { "timetravel8",       "timetravel"   },
-  { "ziftr",             "zr5"          },
+  { "veil",              "x16rt-veil"   },
  { "yenten",            "yescryptr16"  },
-  { "yescryptr8k",       "yescrypt"     },
+  { "ziftr",             "zr5"          },
  { "zcoin",             "lyra2z"       },
  { "zoin",              "lyra2z330"    },
  { NULL,                NULL           }   
 };
--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -36,35 +36,31 @@ void argon2d_crds_hash( void *output, const void *input )
 int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) endiandata[20];
-        uint32_t _ALIGN(64) hash[8];
+   uint32_t _ALIGN(64) hash[8];
-        uint32_t *pdata = work->data;
+   uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
-        int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id arg is deprecated
   const uint32_t first_nonce = pdata[19];
   const uint32_t Htarg = ptarget[7];
   uint32_t nonce = first_nonce;
-        const uint32_t first_nonce = pdata[19];
+   swab32_array( endiandata, pdata, 20 );
        const uint32_t Htarg = ptarget[7];
-        uint32_t nonce = first_nonce;
+   do {
      be32enc(&endiandata[19], nonce);
      argon2d_crds_hash( hash, endiandata );
      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
      {
          pdata[19] = nonce;
          submit_solution( work, hash, mythr );
      }
      nonce++;
   } while (nonce < max_nonce && !work_restart[thr_id].restart);
-        swab32_array( endiandata, pdata, 20 );
+   pdata[19] = nonce;
-
+   *hashes_done = pdata[19] - first_nonce + 1;
-        do {
+   return 0;
                be32enc(&endiandata[19], nonce);
                argon2d_crds_hash( hash, endiandata );
                if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
                {
                        pdata[19] = nonce;
                        *hashes_done = pdata[19] - first_nonce;
                        work_set_target_ratio(work, hash);
                        return 1;
                }
                nonce++;
        } while (nonce < max_nonce && !work_restart[thr_id].restart);
        pdata[19] = nonce;
        *hashes_done = pdata[19] - first_nonce + 1;
        return 0;
 }
 bool register_argon2d_crds_algo( algo_gate_t* gate )
@@ -107,35 +103,32 @@ void argon2d_dyn_hash( void *output, const void *input )
 int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) endiandata[20];
-        uint32_t _ALIGN(64) hash[8];
+   uint32_t _ALIGN(64) hash[8];
-        uint32_t *pdata = work->data;
+   uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
-        int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id arg is deprecated
   const uint32_t first_nonce = pdata[19];
   const uint32_t Htarg = ptarget[7];
   uint32_t nonce = first_nonce;
-        const uint32_t first_nonce = pdata[19];
+   swab32_array( endiandata, pdata, 20 );
        const uint32_t Htarg = ptarget[7];
-        uint32_t nonce = first_nonce;
+   do
   {
      be32enc(&endiandata[19], nonce);
      argon2d_dyn_hash( hash, endiandata );
      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
      {
          pdata[19] = nonce;
          submit_solution( work, hash, mythr );
      }
      nonce++;
  } while (nonce < max_nonce && !work_restart[thr_id].restart);
-        swab32_array( endiandata, pdata, 20 );
+   pdata[19] = nonce;
-
+   *hashes_done = pdata[19] - first_nonce + 1;
-        do {
+   return 0;
                be32enc(&endiandata[19], nonce);
                argon2d_dyn_hash( hash, endiandata );
                if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
                {
                        pdata[19] = nonce;
                        *hashes_done = pdata[19] - first_nonce;
                        work_set_target_ratio(work, hash);
                        return 1;
                }
                nonce++;
        } while (nonce < max_nonce && !work_restart[thr_id].restart);
        pdata[19] = nonce;
        *hashes_done = pdata[19] - first_nonce + 1;
        return 0;
 }
 bool register_argon2d_dyn_algo( algo_gate_t* gate )
@@ -171,11 +164,10 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
      be32enc( &endiandata[19], n );
      argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
                 (char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
-      if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) )
+      if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) && !opt_benchmark )
      {
         *hashes_done = n - first_nonce + 1;
         pdata[19] = n;
-         return true;
+         submit_solution( work, vhash, mythr );
      }
      n++;
--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -0,0 +1,59 @@
 #include "bmw512-gate.h"
 #ifdef BMW512_4WAY
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
 //#include "sph_keccak.h"
 #include "bmw-hash-4way.h"
 // One-shot 4-way BMW-512: start from a fresh context, absorb 80 bytes
 // (the length handed to bmw512_4way) from `input`, and let
 // bmw512_4way_close write the result to `state`.
 // NOTE(review): `input` and `state` are presumably 4x64 interleaved
 // buffers, as used by scanhash_bmw512_4way -- confirm.
 void bmw512hash_4way(void *state, const void *input)
 {
    bmw512_4way_context bmw_ctx;

    bmw512_4way_init( &bmw_ctx );
    bmw512_4way( &bmw_ctx, input, 80 );
    bmw512_4way_close( &bmw_ctx, state );
 }
 // Scan nonces four at a time with the 4-way BMW-512 hash.
 //
 // Starts at work->data[19] and advances by 4 per iteration until
 // n >= max_nonce-4 or work_restart[thr_id].restart becomes set.
 // Every lane whose selected hash word has its upper 24 bits clear is
 // extracted and checked with fulltest(); lanes that pass are reported
 // through submit_lane_solution() after storing the nonce in pdata[19].
 //
 // On exit *hashes_done = n - first_nonce + 1.  The function always
 // returns 0; solutions are only reported via submit_lane_solution().
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t hash[16*4] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   // Index 25 = 3*8+1; lanes are then addressed as hash7[ lane<<1 ].
   // NOTE(review): presumably 32-bit word 7 of lane 0 in the 4x64
   // interleaved output -- confirm against the interleave layout.
   uint32_t *hash7 = &(hash[25]);   // 3*8+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   // Tenth 256-bit vector of vdata; it is rewritten with the four nonces
   // on every iteration.
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
 //   const uint32_t Htarg = ptarget[7];
    int thr_id = mythr->id;  // thr_id arg is deprecated
   // Byte-swap and interleave the block header once; only the nonce
   // vector changes inside the loop.
   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do {
       // Blend byte-swapped nonces n..n+3 into the nonce vector, keeping
       // the other 32-bit elements of *noncev.
       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
      bmw512hash_4way( hash, vdata );
      for ( int lane = 0; lane < 4; lane++ )
      // Cheap pre-filter before the full target comparison.
      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
          {
              pdata[19] = n + lane;
              submit_lane_solution( work, lane_hash, mythr, lane );
          }
      }
      n += 4;
   // NOTE(review): max_nonce-4 is unsigned arithmetic and wraps if
   // max_nonce < 4 -- confirm callers never pass such a value.
   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
 #endif
--- a/algo/bmw/bmw512-gate.c
+++ b/algo/bmw/bmw512-gate.c
@@ -0,0 +1,20 @@
 #include "bmw512-gate.h"
 // Returns the constant 0x7ffff.  register_bmw512_algo installs this
 // function as gate->get_max64.
 int64_t bmw512_get_max64(void) { return 0x7ffffLL; }
 // Fill in the algo gate for bmw512.
 //
 // Advertises AVX2_OPT and installs alt_set_target and bmw512_get_max64.
 // The scanhash/hash pair is chosen at compile time: the 4-way versions
 // when BMW512_4WAY is defined, otherwise the scalar ones.
 // Always returns true.
 bool register_bmw512_algo( algo_gate_t* gate )
 {
   gate->optimizations = AVX2_OPT;
   gate->set_target    = (void*)&alt_set_target;
   gate->get_max64     = (void*)&bmw512_get_max64;
 #if defined (BMW512_4WAY)
   gate->scanhash      = (void*)&scanhash_bmw512_4way;
   gate->hash          = (void*)&bmw512hash_4way;
 #else
   gate->scanhash      = (void*)&scanhash_bmw512;
   gate->hash          = (void*)&bmw512hash;
 #endif
   return true;
 }
--- a/algo/bmw/bmw512-gate.h
+++ b/algo/bmw/bmw512-gate.h
@@ -0,0 +1,23 @@
 // Declarations shared by the bmw512 gate and its scalar / 4-way
 // implementations.
 #ifndef BMW512_GATE_H__
 #define BMW512_GATE_H__
 #include "algo-gate-api.h"
 #include <stdint.h>
 // The 4-way implementation is enabled whenever the compiler targets AVX2.
 #if defined(__AVX2__)
  #define BMW512_4WAY 1
 #endif
 #if defined(BMW512_4WAY)
 // 4-way hash and nonce scanner; only declared when BMW512_4WAY is set.
 void bmw512hash_4way( void *state, const void *input );
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
 #endif
 // Scalar hash and nonce scanner; always declared.
 void bmw512hash( void *state, const void *input );
 int scanhash_bmw512( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );
 #endif
--- a/algo/bmw/bmw512.c
+++ b/algo/bmw/bmw512.c
@@ -0,0 +1,53 @@
 #include "algo-gate-api.h"
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
 #include "sph_bmw.h"
 // Scalar BMW-512 of an 80-byte input.  The digest is produced in a local
 // buffer and only its first 32 bytes are copied to `state`.
 void bmw512hash(void *state, const void *input)
 {
    uint32_t digest[32];
    sph_bmw512_context bmw_ctx;

    sph_bmw512_init( &bmw_ctx );
    sph_bmw512( &bmw_ctx, input, 80 );
    sph_bmw512_close( &bmw_ctx, digest );

    memcpy( state, digest, 32 );
 }
 // Scalar nonce scan for bmw512.
 //
 // Hashes the header for successive nonces starting at work->data[19].
 // Returns true at the first nonce whose hash passes the quick filter and
 // fulltest(), leaving that nonce in pdata[19]; otherwise returns 0 once
 // n reaches max_nonce or a restart is requested.  In both cases
 // *hashes_done is set to n - first_nonce + 1.
 int scanhash_bmw512( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t _ALIGN(32) hash64[8];
    uint32_t endiandata[32];
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t first_nonce = pdata[19];
    //const uint32_t Htarg = ptarget[7];
    int thr_id = mythr->id;  // thr_id arg is deprecated
    // Start one below the first nonce because the loop pre-increments.
    uint32_t n = first_nonce - 1;
    int i;

    // Big-endian encode the 19 header words that precede the nonce.
    for ( i = 0; i < 19; i++ )
       be32enc( &endiandata[i], pdata[i] );

    do
    {
       n++;
       pdata[19] = n;
       be32enc( &endiandata[19], n );
       bmw512hash( hash64, endiandata );

       // Cheap pre-filter: skip the full comparison unless the upper
       // 24 bits of hash word 7 are clear.
       if ( ( hash64[7] & 0xFFFFFF00 ) != 0 )
          continue;

       if ( fulltest( hash64, ptarget ) )
       {
          *hashes_done = n - first_nonce + 1;
          return true;
       }
    } while ( n < max_nonce && !work_restart[thr_id].restart );

    *hashes_done = n - first_nonce + 1;
    pdata[19] = n;
    return 0;
 }
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -7,6 +7,7 @@
 // 2x128
 /*
 // The result of hashing 10 rounds of initial data which consists of params
 // zero padded.
 static const uint64_t IV256[] =
@@ -24,13 +25,14 @@ static const uint64_t IV512[] =
 0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
 };
 */
 static void transform_2way( cube_2way_context *sp )
 {
    int r;
    const int rounds = sp->rounds;
-    __m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
+    __m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
    x0 = _mm256_load_si256( (__m256i*)sp->h     );
    x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
@@ -47,18 +49,12 @@ static void transform_2way( cube_2way_context *sp )
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
        x7 = _mm256_add_epi32( x3, x7 );
-        y0 = x2;
+        y0 = x0;
-        y1 = x3;
+        y1 = x1;
-        y2 = x0;
+        x0 = mm256_rol_32( x2, 7 );
-        y3 = x1;
+        x1 = mm256_rol_32( x3, 7 );
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0,  7 ),
+        x2 = mm256_rol_32( y0, 7 );
-                               _mm256_srli_epi32( y0, 25 ) );
+        x3 = mm256_rol_32( y1, 7 );
        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1,  7 ),
                               _mm256_srli_epi32( y1, 25 ) );
        x2 = _mm256_xor_si256( _mm256_slli_epi32( y2,  7 ),
                               _mm256_srli_epi32( y2, 25 ) );
        x3 = _mm256_xor_si256( _mm256_slli_epi32( y3,  7 ),
                               _mm256_srli_epi32( y3, 25 ) );
        x0 = _mm256_xor_si256( x0, x4 );
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
@@ -71,18 +67,12 @@ static void transform_2way( cube_2way_context *sp )
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
        x7 = _mm256_add_epi32( x3, x7 );
-        y0 = x1;
+        y0 = x0;
-        y1 = x0;
+        y1 = x2;
-        y2 = x3;
+        x0 = mm256_rol_32( x1, 11 );
-        y3 = x2;
+        x1 = mm256_rol_32( y0, 11 );
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
+        x2 = mm256_rol_32( x3, 11 );
-                               _mm256_srli_epi32( y0, 21 ) );
+        x3 = mm256_rol_32( y1, 11 );
        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
                               _mm256_srli_epi32( y1, 21 ) );
        x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 11 ),
                               _mm256_srli_epi32( y2, 21 ) );
        x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 11 ),
                               _mm256_srli_epi32( y3, 21 ) );
        x0 = _mm256_xor_si256( x0, x4 );
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
@@ -107,23 +97,40 @@ static void transform_2way( cube_2way_context *sp )
 int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
                    int blockbytes )
 {
-    const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
+    __m128i* h = (__m128i*)sp->h;
    sp->hashlen   = hashbitlen/128;
    sp->blocksize = blockbytes/16;
    sp->rounds    = rounds;
    sp->pos       = 0;
-    __m256i* h = (__m256i*)sp->h;
+    if ( hashbitlen == 512 )
-
+    {
    h[0] = _mm256_set_epi64x( iv[ 1], iv[ 0], iv[ 1], iv[ 0] );
    h[1] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 3], iv[ 2] );
    h[2] = _mm256_set_epi64x( iv[ 5], iv[ 4], iv[ 5], iv[ 4] );
    h[3] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 7], iv[ 6] );
    h[4] = _mm256_set_epi64x( iv[ 9], iv[ 8], iv[ 9], iv[ 8] );
    h[5] = _mm256_set_epi64x( iv[11], iv[10], iv[11], iv[10] );
    h[6] = _mm256_set_epi64x( iv[13], iv[12], iv[13], iv[12] );
    h[7] = _mm256_set_epi64x( iv[15], iv[14], iv[15], iv[14] );
       h[ 0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
       h[ 2] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
       h[ 4] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
       h[ 6] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
       h[ 8] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
       h[10] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
       h[12] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
       h[14] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
       h[1] = h[ 0];  h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
       h[9] = h[ 8];  h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
    }
    else
    {
       h[ 0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
       h[ 2] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
       h[ 4] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
       h[ 6] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
       h[ 8] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
       h[10] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
       h[12] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
       h[14] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
       h[1] = h[ 0];  h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
       h[9] = h[ 8];  h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
    }
    return 0;
 }
@@ -165,7 +172,7 @@ int cube_2way_close( cube_2way_context *sp, void *output )
    for ( i = 0; i < 10; ++i )           transform_2way( sp );
-    for ( i = 0; i < sp->hashlen; i++ )  hash[i] = sp->h[i];
+    memcpy( hash, sp->h, sp->hashlen<<5 );
    return 0;
 }
@@ -198,7 +205,7 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
    for ( i = 0; i < 10; ++i )            transform_2way( sp );
-    for ( i = 0; i < sp->hashlen; i++ )   hash[i] = sp->h[i];
+    memcpy( hash, sp->h, sp->hashlen<<5 );
    return 0;
 }
--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -16,24 +16,6 @@
 #include "simd-utils.h"
 #include <stdio.h>
 // The result of hashing 10 rounds of initial data which is params and 
 // mostly zeros.
 static const uint64_t IV256[] =
 {
 0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
 0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
 0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
 0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
 };
 static const uint64_t IV512[] =
 {
 0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
 0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
 0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
 };
 static void transform( cubehashParam *sp )
 {
    int r;
@@ -53,26 +35,22 @@ static void transform( cubehashParam *sp )
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
        y0 = x0;
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( x1, 7 ),
+        x0 = mm256_rol_32( x1, 7 );
-                               _mm256_srli_epi32( x1, 25 ) );
+        x1 = mm256_rol_32( y0, 7 );
        x1 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
                               _mm256_srli_epi32( y0, 25 ) );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = _mm256_shuffle_epi32( x2, 0x4e );
+        x2 = mm256_swap64_128( x2 );
-        x3 = _mm256_shuffle_epi32( x3, 0x4e );
+        x3 = mm256_swap64_128( x3 );
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
-        y0 = _mm256_permute4x64_epi64( x0, 0x4e );
+        y0 = mm256_swap_128( x0 );
-        y1 = _mm256_permute4x64_epi64( x1, 0x4e );
+        y1 = mm256_swap_128( x1 );
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
+        x0 = mm256_rol_32( y0, 11 );
-                               _mm256_srli_epi32( y0, 21 ) );
+        x1 = mm256_rol_32( y1, 11 );
        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ), 
                               _mm256_srli_epi32( y1, 21 ) );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = _mm256_shuffle_epi32( x2, 0xb1 );
+        x2 = mm256_swap32_64( x2 );
-        x3 = _mm256_shuffle_epi32( x3, 0xb1 );
+        x3 = mm256_swap32_64( x3 );
    }
    _mm256_store_si256( (__m256i*)sp->x,     x0 );
@@ -147,37 +125,58 @@ static void transform( cubehashParam *sp )
 #endif
 }  // transform
 /*
 // The result of hashing 10 rounds of initial data which is params and
 // mostly zeros.
 static const uint64_t IV256[] =
 {
 0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
 0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
 0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
 0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
 };
 static const uint64_t IV512[] =
 {
 0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
 0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
 0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
 };
 */
 int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
 {
-    const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
+    __m128i *x = (__m128i*)sp->x;
    sp->hashlen   = hashbitlen/128;
    sp->blocksize = blockbytes/16;
    sp->rounds    = rounds;
    sp->pos       = 0;
 #if defined(__AVX2__)
-    __m256i* x = (__m256i*)sp->x;
+    if ( hashbitlen == 512 )
    {
-    x[0] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 1], iv[ 0] );
+       x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
-    x[1] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 5], iv[ 4] );
+       x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
-    x[2] = _mm256_set_epi64x( iv[11], iv[10], iv[ 9], iv[ 8] );
+       x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
-    x[3] = _mm256_set_epi64x( iv[15], iv[14], iv[13], iv[12] );
+       x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
       x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
       x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
       x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
       x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
    }
    else
    {
       x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
       x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
       x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
       x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
       x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
       x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
       x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
       x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
    }   
 #else
    __m128i* x = (__m128i*)sp->x;
     x[0] = _mm_set_epi64x( iv[ 1], iv[ 0] );
     x[1] = _mm_set_epi64x( iv[ 3], iv[ 2] );
     x[2] = _mm_set_epi64x( iv[ 5], iv[ 4] );
     x[3] = _mm_set_epi64x( iv[ 7], iv[ 6] );
     x[4] = _mm_set_epi64x( iv[ 9], iv[ 8] );
     x[5] = _mm_set_epi64x( iv[11], iv[10] );
     x[6] = _mm_set_epi64x( iv[13], iv[12] );
     x[7] = _mm_set_epi64x( iv[15], iv[14] );
 #endif
    return SUCCESS;
 }
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -323,7 +323,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
     mpz_clears(magipi, magisw, product, bns0, bns1, NULL);
    *hashes_done = n - first_nonce + 1;
-    return rc;
+    return 0;
 }
 bool register_m7m_algo( algo_gate_t *gate )
--- a/algo/x13/x13bcd-4way.c
+++ b/algo/x13/x13bcd-4way.c
@@ -0,0 +1,283 @@
 #include "x13sm3-gate.h"
 #if defined(X13SM3_4WAY)
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 //#include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/sm3/sm3-hash-4way.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"
 // One context per hash stage used by x13bcd_4way_hash.
 // In the visible part of that function the blake, bmw, skein, jh, keccak
 // and sm3 contexts operate on the interleaved 4-lane buffer, simd handles
 // two lanes per call, and groestl, cube, shavite and echo are run once per
 // lane.
 // NOTE(review): hamsi and fugue are initialised here but their use lies
 // outside the part of the file reviewed -- confirm.
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
    jh512_4way_context      jh;
    keccak512_4way_context  keccak;
 //    luffa_2way_context      luffa;
    cubehashParam           cube;
    sph_shavite512_context  shavite;
    simd_2way_context       simd;
    hashState_echo          echo;
    sm3_4way_ctx_t          sm3;
    hamsi512_4way_context   hamsi;
    sph_fugue512_context    fugue;
 } x13bcd_4way_ctx_holder;
 // Template set of initialised contexts: filled by init_x13bcd_4way_ctx and
 // copied into a local holder at the start of every x13bcd_4way_hash call.
 x13bcd_4way_ctx_holder x13bcd_4way_ctx __attribute__ ((aligned (64)));
 // Per-thread blake context that x13bcd_4way_hash copies over ctx.blake
 // before hashing the last 16 bytes of each lane.
 // NOTE(review): presumably a midstate over the first 64 bytes prepared by
 // the scanhash routine -- confirm where it is written.
 static __thread blake512_4way_context x13bcd_ctx_mid;
 // Initialise every stage context in the global x13bcd_4way_ctx template.
 // x13bcd_4way_hash copies this template (or individual members of it)
 // instead of re-running the init routines for each hash.
 void init_x13bcd_4way_ctx(void)
 {
     blake512_4way_init( &x13bcd_4way_ctx.blake );
     bmw512_4way_init( &x13bcd_4way_ctx.bmw );
     init_groestl( &x13bcd_4way_ctx.groestl, 64 );
     skein512_4way_init( &x13bcd_4way_ctx.skein );
     jh512_4way_init( &x13bcd_4way_ctx.jh );
     keccak512_4way_init( &x13bcd_4way_ctx.keccak );
 //     luffa_2way_init( &x13bcd_4way_ctx.luffa, 512 );
     cubehashInit( &x13bcd_4way_ctx.cube, 512, 16, 32 );
     sph_shavite512_init( &x13bcd_4way_ctx.shavite );
     simd_2way_init( &x13bcd_4way_ctx.simd, 512 );
     init_echo( &x13bcd_4way_ctx.echo, 512 );
     sm3_4way_init( &x13bcd_4way_ctx.sm3 );
     hamsi512_4way_init( &x13bcd_4way_ctx.hamsi );
     sph_fugue512_init( &x13bcd_4way_ctx.fugue );
 }
 void x13bcd_4way_hash( void *state, const void *input )
 {
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     x13bcd_4way_ctx_holder ctx;
     memcpy( &ctx, &x13bcd_4way_ctx, sizeof(x13bcd_4way_ctx) );
     // Blake
     memcpy( &ctx.blake, &x13bcd_ctx_mid, sizeof(x13bcd_ctx_mid) );
     blake512_4way( &ctx.blake, input + (64<<2), 16 );
 //     blake512_4way( &ctx.blake, input, 80 );
     blake512_4way_close( &ctx.blake, vhash );
     // Bmw
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
     // Serial
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     reinit_groestl( &ctx.groestl );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     reinit_groestl( &ctx.groestl );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     reinit_groestl( &ctx.groestl );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     // Parallel 4way
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     // Skein
     skein512_4way( &ctx.skein, vhash, 64 );
     skein512_4way_close( &ctx.skein, vhash );
     // JH
     jh512_4way( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );
     // Keccak
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     // SM3 parallel 32 bit
     uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
     memset( sm3_vhash, 0, sizeof sm3_vhash );
     uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
     memset( sm3_hash0, 0, sizeof sm3_hash0 );
     uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
     memset( sm3_hash1, 0, sizeof sm3_hash1 );
     uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
     memset( sm3_hash2, 0, sizeof sm3_hash2 );
     uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
     memset( sm3_hash3, 0, sizeof sm3_hash3 );
     sm3_4way( &ctx.sm3, vhash, 64 );
     sm3_4way_close( &ctx.sm3, sm3_vhash );
     dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
 /*     
     // Luffa
     intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     dintrlv_2x128( hash0, hash1, vhash, 512 );
     intrlv_2x128( vhash, hash2, hash3, 512 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     dintrlv_2x128( hash2, hash3, vhash, 512 );
 */
     // Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
     memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
     memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
     memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
     // Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     // Simd
     intrlv_2x128( vhash, hash0, hash1, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_2x128( hash0, hash1, vhash, 512 );
     intrlv_2x128( vhash, hash2, hash3, 512 );
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_2x128( hash2, hash3, vhash, 512 );
     // Echo
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
     memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
 /*
     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     // SM3 parallel 32 bit
     uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
     memset( sm3_vhash, 0, sizeof sm3_vhash );
     uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
     memset( sm3_hash0, 0, sizeof sm3_hash0 );
     uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
     memset( sm3_hash1, 0, sizeof sm3_hash1 );
     uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
     memset( sm3_hash2, 0, sizeof sm3_hash2 );
     uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
     memset( sm3_hash3, 0, sizeof sm3_hash3 );
     sm3_4way( &ctx.sm3, vhash, 64 );
     sm3_4way_close( &ctx.sm3, sm3_vhash );
     dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
 */
     // Hamsi parallel 4x32x2
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // Fugue serial
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash1, 64 );
     sph_fugue512_close( &ctx.fugue, hash1 );
     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash2, 64 );
     sph_fugue512_close( &ctx.fugue, hash2 );
     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
     memcpy( state,    hash0, 32 );
     memcpy( state+32, hash1, 32 );
     memcpy( state+64, hash2, 32 );
     memcpy( state+96, hash3, 32 );
 }
 // Scan nonces four at a time for the X13BCD algorithm.
 // Starts at work->data[19] and stops when the nonce reaches max_nonce or a
 // work restart is flagged for this thread. Candidate lanes that pass the
 // cheap mask test and fulltest() are handed to submit_lane_solution().
 // Always returns 0; *hashes_done receives (last nonce - first nonce + 1).
 int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
     // Target tiers: the first limit that is >= ptarget[7] selects the mask
     // applied to word 7 of each lane hash as a quick pre-filter.
     static const uint64_t tier_limit[6] = {          0,        0xF,       0xFF,
                                                  0xFFF,     0xFFFF, 0x10000000  };
     static const uint32_t tier_mask[6]  = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                                             0xFFFFF000, 0xFFFF0000,          0  };
     uint32_t lane_hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     const uint32_t first_nonce = pdata[19];
     const uint32_t Htarg = ptarget[7];
     const int thr_id = mythr->id;  // thr_id arg is deprecated
     __m256i *noncev = (__m256i*)vdata + 9;   // aligned
     uint32_t nonce = first_nonce;
     int tier = 0;
     // Byte-swap and 4x64 interleave the header, then run Blake over the
     // leading 64 bytes once into the shared midstate context.
     mm256_bswap32_intrlv80_4x64( vdata, pdata );
     blake512_4way_init( &x13bcd_ctx_mid );
     blake512_4way( &x13bcd_ctx_mid, vdata, 64 );
     while ( ( tier < 6 ) && ( Htarg > tier_limit[tier] ) )
        tier++;
     // No matching tier means no scanning at all.
     if ( tier < 6 )
     {
        const uint32_t mask = tier_mask[tier];
        do
        {
           // Insert nonce .. nonce+3 into the four lanes.
           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                 _mm256_set_epi32( nonce+3, 0, nonce+2, 0,
                                   nonce+1, 0, nonce,   0 ) ), *noncev );
           x13bcd_4way_hash( lane_hash, vdata );
           pdata[19] = nonce;
           for ( int lane = 0; lane < 4; lane++ )
           {
              uint32_t *candidate = lane_hash + ( lane << 3 );
              if ( ( candidate[7] & mask ) != 0 )
                 continue;
              if ( !fulltest( candidate, ptarget ) || opt_benchmark )
                 continue;
              pdata[19] = nonce + lane;
              submit_lane_solution( work, candidate, mythr, lane );
           }
           nonce += 4;
        } while ( ( nonce < max_nonce ) && !work_restart[thr_id].restart );
     }
     *hashes_done = nonce - first_nonce + 1;
     return 0;
 }
 #endif
--- a/algo/x13/x13bcd.c
+++ b/algo/x13/x13bcd.c
@@ -0,0 +1,258 @@
 #include "x13sm3-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
 #include "algo/groestl/sph_groestl.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/sph_luffa.h"
 #include "algo/cubehash/sph_cubehash.h"
 #include "algo/simd/sph_simd.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/sm3/sph_sm3.h"
 //#include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/blake/sse2/blake.c"
 #include "algo/bmw/sse2/bmw.c"
 #include "algo/keccak/sse2/keccak.c"
 #include "algo/skein/sse2/skein.c"
 #include "algo/jh/sse2/jh_sse2_opt64.h"
 #ifndef NO_AES_NI
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #endif
 // Hash contexts used by the serial X13BCD chain for the stages that are
 // driven through explicit context objects (groestl, echo, cubehash, shavite,
 // simd, sm3, hamsi, fugue). Groestl and echo use the sph implementations
 // when NO_AES_NI is defined and the AES-NI implementations otherwise.
 typedef struct {
 #ifdef NO_AES_NI
        sph_groestl512_context  groestl;
        sph_echo512_context     echo;
 #else
        hashState_echo          echo;
        hashState_groestl       groestl;
 #endif
 // Luffa context is disabled in this variant.
 //        hashState_luffa         luffa;
        cubehashParam           cube;
        sph_shavite512_context  shavite;
        hashState_sd            simd;
        sm3_ctx_t               sm3;
        sph_hamsi512_context    hamsi;
        sph_fugue512_context    fugue;
 } x13bcd_ctx_holder;
 // Initialized template filled in by init_x13bcd_ctx(); x13bcd_hash() copies
 // it into a local context at the start of every hash.
 x13bcd_ctx_holder x13bcd_ctx;
 void init_x13bcd_ctx()
 {
 #ifdef NO_AES_NI
        sph_groestl512_init(&x13bcd_ctx.groestl);
        sph_echo512_init(&x13bcd_ctx.echo);
 #else
        init_echo(&x13bcd_ctx.echo, 512);
        init_groestl(&x13bcd_ctx.groestl, 64 );
 #endif
 //        init_luffa(&x13bcd_ctx.luffa,512);
        cubehashInit(&x13bcd_ctx.cube,512,16,32);
        sph_shavite512_init(&x13bcd_ctx.shavite);
        init_sd(&x13bcd_ctx.simd,512);
        sm3_init( &x13bcd_ctx.sm3 );
        sph_hamsi512_init(&x13bcd_ctx.hamsi);
        sph_fugue512_init(&x13bcd_ctx.fugue);
 };
 // Serial X13BCD hash of one block header.
 //   output: receives the first 32 bytes of the final digest.
 //   input:  header data; consumed by the macro-expanded Blake stage
 //           (presumably 80 bytes, as prepared by scanhash_x13bcd - TODO
 //           confirm against the BLK_* macros).
 // Stage order as written below: blake, bmw, groestl, skein, jh, keccak,
 // sm3, cubehash, shavite, simd, echo, hamsi, fugue. Luffa is disabled.
 void x13bcd_hash(void *output, const void *input)
 {
 	unsigned char hash[128] __attribute__ ((aligned (32)));
        // Work on a private copy of the pre-initialized context template.
        x13bcd_ctx_holder ctx;
        memcpy(&ctx, &x13bcd_ctx, sizeof(x13bcd_ctx));
        // The locals below are not referenced directly in this function;
        // presumably they are the scratch state the DECL_*/..._I/_U/_C macros
        // from the included sse2 sources expect to find by name.
        // NOTE(review): do not rename without checking those macros.
        unsigned char hashbuf[128];
        size_t hashptr;
        sph_u64 hashctA;
        sph_u64 hashctB;
        //---blake1---
        DECL_BLK;
        BLK_I;
        BLK_W;
        BLK_C;
        //---bmw2---
        DECL_BMW;
        BMW_I;
        BMW_U;
        // Accessor macros defined only for the duration of BMW_C.
        #define M(x)    sph_dec64le_aligned(data + 8 * (x))
        #define H(x)    (h[x])
        #define dH(x)   (dh[x])
        BMW_C;
        #undef M
        #undef H
        #undef dH
        //---groestl----
 #ifdef NO_AES_NI
        sph_groestl512 (&ctx.groestl, hash, 64);
        sph_groestl512_close(&ctx.groestl, hash);
 #else
        update_and_final_groestl( &ctx.groestl, (char*)hash,
                                  (const char*)hash, 512 );
 #endif
        //---skein4---
        DECL_SKN;
        SKN_I;
        SKN_U;
        SKN_C;
        //---jh5------
        DECL_JH;
        JH_H;
        //---keccak6---
        DECL_KEC;
        KEC_I;
        KEC_U;
        KEC_C;
        //---sm3---
        // The SM3 result is written into a zero-filled 128 byte buffer and
        // the next stage (cubehash) reads 64 bytes from that buffer, so any
        // bytes SM3 does not write are fed onward as zeros.
        uint32_t sm3_hash[32] __attribute__ ((aligned (32)));
        memset(sm3_hash, 0, sizeof sm3_hash);
        sph_sm3(&ctx.sm3, hash, 64);
        sph_sm3_close(&ctx.sm3, sm3_hash);
        //---cubehash--- (takes the place of the disabled luffa + cube pair)
        cubehashUpdateDigest( &ctx.cube, (byte*) hash,
                              (const byte*)sm3_hash, 64 );
 /*
        //--- luffa7
        update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
                                (const BitSequence*)hash, 64 );
        // 8 Cube
        cubehashUpdateDigest( &ctx.cube, (byte*) hash,
                              (const byte*)hash, 64 );
 */
        // 9 Shavite
        sph_shavite512( &ctx.shavite, hash, 64);
        sph_shavite512_close( &ctx.shavite, hash);
        // 10 Simd
        update_final_sd( &ctx.simd, (BitSequence *)hash,
                         (const BitSequence *)hash, 512 );
        //11---echo---
 #ifdef NO_AES_NI
        sph_echo512(&ctx.echo, hash, 64);
        sph_echo512_close(&ctx.echo, hash);
 #else
        update_final_echo ( &ctx.echo, (BitSequence *)hash,
                            (const BitSequence *)hash, 512 );
 #endif
        // Disabled: SM3 placed between echo and hamsi instead of before
        // cubehash.
        /*
        uint32_t sm3_hash[32] __attribute__ ((aligned (32)));
        memset(sm3_hash, 0, sizeof sm3_hash);
        sph_sm3(&ctx.sm3, hash, 64);
        sph_sm3_close(&ctx.sm3, sm3_hash);
        sph_hamsi512(&ctx.hamsi, sm3_hash, 64);
 */
        //---hamsi---
        sph_hamsi512(&ctx.hamsi, hash, 64);
        sph_hamsi512_close(&ctx.hamsi, hash);
        //---fugue---
        sph_fugue512(&ctx.fugue, hash, 64);
        sph_fugue512_close(&ctx.fugue, hash);
        // EMMS resets the x87/MMX register state; presumably needed because
        // some macro-expanded stages use MMX - TODO confirm.
        asm volatile ("emms");
        // Only the first 256 bits of the final digest are returned.
 	memcpy(output, hash, 32);
 }
 // Scan nonces one at a time for the X13BCD algorithm.
 // Starts at work->data[19] and runs until a hash passes fulltest(), the
 // nonce reaches max_nonce, or a work restart is flagged for this thread.
 // Returns true with the winning nonce left in work->data[19] when a
 // solution is found, 0 otherwise. *hashes_done receives
 // (last nonce - first nonce + 1) in both cases.
 int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr)
 {
        // Target tiers: the first limit that is >= ptarget[7] selects the
        // mask applied to hash word 7 as a cheap pre-filter before fulltest.
        static const uint64_t tier_limit[6] = {
               0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000
        };
        static const uint32_t tier_mask[6] = {
               0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0
        };
        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(64)));
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
        const uint32_t first_nonce = pdata[19];
        const uint32_t Htarg = ptarget[7];
        const int thr_id = mythr->id;  // thr_id arg is deprecated
        // Pre-decremented because the loop increments before hashing.
        uint32_t nonce = first_nonce - 1;
        int tier = 0;
        // we need bigendian data...
        swab32_array( endiandata, pdata, 20 );
 #ifdef DEBUG_ALGO
        if ( Htarg != 0 )
               printf("[%d] Htarg=%X\n", thr_id, Htarg);
 #endif
        while ( ( tier < 6 ) && ( Htarg > tier_limit[tier] ) )
               tier++;
        // No matching tier means no scanning at all.
        if ( tier < 6 )
        {
               const uint32_t mask = tier_mask[tier];
               do
               {
                      pdata[19] = ++nonce;
                      be32enc( &endiandata[19], nonce );
                      x13bcd_hash( hash64, endiandata );
 #ifndef DEBUG_ALGO
                      if ( ( ( hash64[7] & mask ) == 0 )
                           && fulltest( hash64, ptarget ) )
                      {
                             *hashes_done = nonce - first_nonce + 1;
                             return true;
                      }
 #else
                      if ( ( ( nonce % 0x1000 ) == 0 ) && ( thr_id == 0 ) )
                             printf(".");
                      if ( ( hash64[7] & mask ) == 0 )
                      {
                             printf("[%d]",thr_id);
                             if ( fulltest( hash64, ptarget ) )
                             {
                                    work_set_target_ratio( work, hash64 );
                                    *hashes_done = nonce - first_nonce + 1;
                                    return true;
                             }
                      }
 #endif
               } while ( ( nonce < max_nonce )
                         && !work_restart[thr_id].restart );
        }
        *hashes_done = nonce - first_nonce + 1;
        pdata[19] = nonce;
        return 0;
 }
--- a/algo/x13/x13sm3-gate.c
+++ b/algo/x13/x13sm3-gate.c
@@ -16,3 +16,19 @@ bool register_x13sm3_algo( algo_gate_t* gate )
  return true;
 };
 // Install the X13BCD entry points into the algo gate.
 // When X13SM3_4WAY is defined the 4-way context is initialized and the
 // 4-way scanhash/hash pair is used; otherwise the serial ones are.
 // Always returns true.
 bool register_x13bcd_algo( algo_gate_t* gate )
 {
 #ifdef X13SM3_4WAY
  init_x13bcd_4way_ctx();
  gate->scanhash = (void*)scanhash_x13bcd_4way;
  gate->hash     = (void*)x13bcd_4way_hash;
 #else
  init_x13bcd_ctx();
  gate->scanhash = (void*)scanhash_x13bcd;
  gate->hash     = (void*)x13bcd_hash;
 #endif
  gate->optimizations = AVX2_OPT | AES_OPT | SSE2_OPT;
  gate->get_max64     = (void*)get_max64_0x3ffff;
  return true;
 }
--- a/algo/x13/x13sm3-gate.h
+++ b/algo/x13/x13sm3-gate.h
@@ -10,23 +10,31 @@
 bool register_x13sm3_algo( algo_gate_t* gate );
 bool register_x13bcd_algo( algo_gate_t* gate );
 // 4-way entry points, only declared when X13SM3_4WAY is defined.
 // register_x13bcd_algo() installs the x13bcd 4-way pair in that case.
 #if defined(X13SM3_4WAY)
 void x13sm3_4way_hash( void *state, const void *input );
 int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13sm3_4way_ctx();
 // x13bcd_4way_hash writes four 32 byte digests, one per lane, to state.
 void x13bcd_4way_hash( void *state, const void *input );
 int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13bcd_4way_ctx();
 #endif
 // Serial entry points, always declared.
 void x13sm3_hash( void *state, const void *input );
 int scanhash_x13sm3( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13sm3_ctx();
 // x13bcd_hash writes one 32 byte digest to state.
 void x13bcd_hash( void *state, const void *input );
 int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13bcd_ctx();
 #endif
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -62,3 +62,149 @@ bool register_x16s_algo( algo_gate_t* gate )
  return true;
 };
 ////////////////
 //
 //   X16RT
 // Derive the X16RT time hash: clear the low 7 bits of the timestamp (so
 // every timestamp in the same 128-unit window gives the same result) and
 // write the double SHA-256 of that 4 byte value to timeHash.
 void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
 {
    const int32_t window = timeStamp & ~(uint32_t)0x7f;
    const unsigned char *window_bytes = (const unsigned char*)&window;
    sha256d( (unsigned char*)timeHash, window_bytes, sizeof( window ) );
 }
 // Turn the leading bytes of timeHash into the hash-order string.
 // Character j is the upper-case hex digit of one nibble, taken from byte
 // (15 - j) >> 1: the high nibble for even j, the low nibble for odd j.
 // output must hold X16R_HASH_FUNC_COUNT characters plus a terminating NUL.
 void x16rt_getAlgoString( const uint32_t *timeHash, char *output)
 {
   static const char hex_upper[] = "0123456789ABCDEF";
   const uint8_t *bytes = (const uint8_t*)timeHash;
   uint8_t pos = 0;
   while ( pos < X16R_HASH_FUNC_COUNT )
   {
      // 16 ascii hex chars, reversed
      const uint8_t index  = (uint8_t)( ( 15 - pos ) >> 1 );
      const uint8_t source = bytes[index];
      const uint8_t nibble = ( pos & 1 ) ? ( source & 0xF ) : ( source >> 4 );
      output[pos] = hex_upper[nibble];
      pos++;
   }
   output[pos] = '\0';
 }
 // Build the extra header fields of g_work from a stratum job (installed as
 // gate->build_extraheader by register_x16rt_veil_algo).
 //   g_work: work item whose data[], merkleroothash, witmerkleroothash and
 //           denom* fields are filled in.
 //   sctx:   stratum context supplying the job; its extranonce2 is
 //           incremented as a side effect.
 // The "veildatahash" (double SHA-256 over the merkle roots, the four
 // denomination entries and an all-zero pofn hash) is stored in
 // g_work->data[9..16].
 void x16rt_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 {
   // Size of the binary blob that is hashed: two merkle roots, four
   // denomination hashes and the pofn hash (32 bytes each), one count byte
   // ("04") and four 8 byte amounts.
   enum { VEIL_DATA_BYTES = 7 * 32 + 1 + 4 * 8 };   // 257
   uchar merkle_tree[64] = { 0 };
   size_t t;
   int i;
   algo_gate.gen_merkle_root( merkle_tree, sctx );
   // Increment extranonce2
   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
   // Assemble block header
 //   algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
 //          (uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
 //          le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) );
   memset( g_work->data, 0, sizeof(g_work->data) );
   g_work->data[0] = le32dec( sctx->job.version );
   if ( have_stratum )
      for ( i = 0; i < 8; i++ )
         g_work->data[ 1+i ] = le32dec( (uint32_t*)sctx->job.prevhash + i );
   else
      for (i = 0; i < 8; i++)
         g_work->data[ 8-i ] = le32dec( (uint32_t*)sctx->job.prevhash + i );
   g_work->data[ algo_gate.ntime_index ] = le32dec( sctx->job.ntime );
   g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits );
   g_work->data[20] = 0x80000000;
   g_work->data[31] = 0x00000280;
   // Both merkle root fields are filled from the same computed root, word
   // order reversed.
   for ( i = 0; i < 8; i++ )
      g_work->merkleroothash[7 - i] = be32dec((uint32_t *)merkle_tree + i);
   for ( i = 0; i < 8; i++ )
      g_work->witmerkleroothash[7 - i] = be32dec((uint32_t *)merkle_tree + i);
   for ( i = 0; i < 8; i++ )
      g_work->denom10[i] =    le32dec((uint32_t *)sctx->job.denom10 + i);
   for ( i = 0; i < 8; i++ )
      g_work->denom100[i] =   le32dec((uint32_t *)sctx->job.denom100 + i);
   for ( i = 0; i < 8; i++ )
      g_work->denom1000[i] =  le32dec((uint32_t *)sctx->job.denom1000 + i);
   for ( i = 0; i < 8; i++ )
      g_work->denom10000[i] = le32dec((uint32_t *)sctx->job.denom10000 + i);
   uint32_t pofnhash[8];
   memset(pofnhash, 0x00, 32);
   char denom10_str      [ 2 * sizeof( g_work->denom10 )           + 1 ];
   char denom100_str     [ 2 * sizeof( g_work->denom100 )          + 1 ];
   char denom1000_str    [ 2 * sizeof( g_work->denom1000 )         + 1 ];
   char denom10000_str   [ 2 * sizeof( g_work->denom10000 )        + 1 ];
   char merkleroot_str   [ 2 * sizeof( g_work->merkleroothash )    + 1 ];
   char witmerkleroot_str[ 2 * sizeof( g_work->witmerkleroothash ) + 1 ];
   char pofn_str         [ 2 * sizeof( pofnhash )                  + 1 ];
   // NOTE(review): cbin2hex is assumed to emit two hex characters per input
   // byte plus a terminating NUL, which is what the buffer sizes above
   // provide for.
   cbin2hex( denom10_str,       (char*) g_work->denom10,           32 );
   cbin2hex( denom100_str,      (char*) g_work->denom100,          32 );
   cbin2hex( denom1000_str,     (char*) g_work->denom1000,         32 );
   cbin2hex( denom10000_str,    (char*) g_work->denom10000,        32 );
   cbin2hex( merkleroot_str,    (char*) g_work->merkleroothash,    32 );
   cbin2hex( witmerkleroot_str, (char*) g_work->witmerkleroothash, 32 );
   cbin2hex( pofn_str,          (char*) pofnhash,                  32 );
   // Build the block header veildatahash in hex.
   // The text is 2 * 257 = 514 characters long, so the buffer needs 515
   // bytes including the terminating NUL. The previous heap buffer was one
   // byte short; a fixed stack buffer plus snprintf cannot overflow and
   // needs no allocation-failure handling.
   // Amount literals are little-endian 64 bit values: 10, 100, 1000, 10000.
   char data_hex[ 2 * VEIL_DATA_BYTES + 1 ];
   snprintf( data_hex, sizeof data_hex, "%s%s%s%s%s%s%s%s%s%s%s%s",
                   merkleroot_str, witmerkleroot_str, "04",
                   "0a00000000000000", denom10_str,
                   "6400000000000000", denom100_str,
                   "e803000000000000", denom1000_str,
                   "1027000000000000", denom10000_str, pofn_str );
   // Convert the hex to binary
   uint32_t veildata[100];
   hex2bin( (unsigned char*)veildata, data_hex, VEIL_DATA_BYTES );
   // Compute the sha256d of the binary
   uint32_t _ALIGN(64) hash[8];
   sha256d( (unsigned char*)hash, (unsigned char*)veildata, VEIL_DATA_BYTES );
   // assign the veildatahash in the blockheader
   for ( i = 0; i < 8; i++ )
       g_work->data[16 - i] = le32dec(hash + i);
 }
 // Install the X16RT entry points into the algo gate: the 4-way pair when
 // X16R_4WAY is defined, the serial pair otherwise. Always returns true.
 bool register_x16rt_algo( algo_gate_t* gate )
 {
 #ifdef X16R_4WAY
  gate->scanhash = (void*)scanhash_x16rt_4way;
  gate->hash     = (void*)x16rt_4way_hash;
 #else
  gate->scanhash = (void*)scanhash_x16rt;
  gate->hash     = (void*)x16rt_hash;
 #endif
  gate->optimizations = AVX2_OPT | AES_OPT | SSE2_OPT;
  gate->set_target    = (void*)alt_set_target;
  return true;
 }
 // Install the X16RT entry points for the Veil variant. Identical to the
 // plain X16RT registration except that build_extraheader is replaced by
 // x16rt_build_extraheader. Always returns true.
 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
 #ifdef X16R_4WAY
  gate->scanhash = (void*)scanhash_x16rt_4way;
  gate->hash     = (void*)x16rt_4way_hash;
 #else
  gate->scanhash = (void*)scanhash_x16rt;
  gate->hash     = (void*)x16rt_hash;
 #endif
  gate->optimizations     = AVX2_OPT | AES_OPT | SSE2_OPT;
  gate->set_target        = (void*)alt_set_target;
  gate->build_extraheader = (void*)x16rt_build_extraheader;
  return true;
 }
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -4,6 +4,7 @@
 #include "algo-gate-api.h"
 #include "simd-utils.h"
 #include <stdint.h>
 #include <unistd.h>
 #if defined(__AVX2__) && defined(__AES__)
  #define X16R_4WAY
@@ -30,11 +31,15 @@ enum x16r_Algo {
 };
 void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
-void x16r_getAlgoString( const uint8_t* prevblock, char *output );
+void x16r_getAlgoString( const uint8_t *prevblock, char *output );
-void x16s_getAlgoString( const uint8_t* prevblock, char *output );
+void x16s_getAlgoString( const uint8_t *prevblock, char *output );
 // Writes the X16R_HASH_FUNC_COUNT character hash-order string (upper-case
 // hex digits, NUL terminated) derived from the leading bytes of timeHash.
 void x16rt_getAlgoString( const uint32_t *timeHash, char *output );
 // Writes sha256d of (timeStamp with its low 7 bits cleared) to timeHash.
 void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash );
 // Algo gate registration functions.
 // NOTE(review): register_x16rt_veil_algo is defined in x16r-gate.c but no
 // prototype for it is visible in this hunk - confirm it is declared
 // wherever it is called from.
 bool register_x16r_algo( algo_gate_t* gate );
 bool register_x16s_algo( algo_gate_t* gate );
 bool register_x16rt_algo( algo_gate_t* gate );
 #if defined(X16R_4WAY)
@@ -42,11 +47,18 @@ void x16r_4way_hash( void *state, const void *input );
 int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 // X16RT 4-way: hashes four 64-bit-interleaved 80 byte headers and writes
 // four 32 byte digests, one per lane, to state.
 void x16rt_4way_hash( void *state, const void *input );
 int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 #endif
 // Serial entry points, always declared.
 void x16r_hash( void *state, const void *input );
 int scanhash_x16r( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );
 void x16rt_hash( void *state, const void *input );
 int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );
 #endif
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -0,0 +1,353 @@
 #include "x16r-gate.h"
 #if defined (X16R_4WAY)
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/sha/sha2-hash-4way.h"
 // Per-thread cache of the hash order, maintained by scanhash_x16rt_4way().
 // s_ntime: the ntime value hashOrder was last derived from; UINT32_MAX
 //          until the first derivation.
 static __thread uint32_t s_ntime = UINT32_MAX;
 // s_implemented: set to true once hashOrder has been derived; scanning is
 //                refused while it is false.
 static __thread bool s_implemented = false;
 // hashOrder: NUL terminated string of hex digits; x16rt_4way_hash() maps
 //            each character to the algorithm run at that position.
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
 // Storage shared by all hash contexts used in x16rt_4way_hash(). Every
 // stage initializes its own member immediately before using it, so only
 // one member is live at any time and they can overlap.
 union _x16rt_4way_context_overlay
 {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_echo          echo;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
    jh512_4way_context      jh;
    keccak512_4way_context  keccak;
    luffa_2way_context      luffa;
    cubehashParam           cube;
    sph_shavite512_context  shavite;
    simd_2way_context       simd;
    hamsi512_4way_context   hamsi;
    sph_fugue512_context    fugue;
    shabal512_4way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
 };
 typedef union _x16rt_4way_context_overlay x16rt_4way_context_overlay;
 // Hash four block headers in parallel with the X16RT chain.
 //   input:  four 80 byte headers, 64-bit interleaved (4x64).
 //   output: receives four 32 byte digests at byte offsets 0, 32, 64, 96.
 // The 16 stages are selected by the thread-local hashOrder string, which
 // scanhash_x16rt_4way() derives from the header's ntime. The first stage
 // consumes 80 bytes per lane, every later stage consumes the 64 byte
 // result of the stage before it.
 void x16rt_4way_hash( void* output, const void* input )
 {
   uint32_t hash0[24] __attribute__ ((aligned (64)));
   uint32_t hash1[24] __attribute__ ((aligned (64)));
   uint32_t hash2[24] __attribute__ ((aligned (64)));
   uint32_t hash3[24] __attribute__ ((aligned (64)));
   uint32_t vhash[24*4] __attribute__ ((aligned (64)));
   x16rt_4way_context_overlay ctx;
   // in0-3 always alias hash0-3 in this function.
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
   void *in2 = (void*) hash2;
   void *in3 = (void*) hash3;
   // Bytes per lane consumed by the current stage.
   int size = 80;
   // Deinterleave the 640 bit headers so serial stages can read them.
   dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
 /*
   void *in = (void*) input;
   uint32_t *in32 = (uint32_t*) hash0;
   uint32_t ntime = in32[17];
   if ( s_ntime == UINT32_MAX )
   {
      uint32_t _ALIGN(64) timeHash[8];
      x16rt_getTimeHash(ntime, &timeHash);
      x16rt_getAlgoString(&timeHash[0], hashOrder);
   }
 */
   // Input data is available both 64 bit interleaved (input)
   // and deinterleaved in hash0-3 (aliased by in0-3).
   // If the first function uses 4way 64 bit data it reads the interleaved
   // input directly, so no re-interleaving is needed for stage 0.
   // All other functions assume data is deinterleaved in hash0-3.
   // All functions must exit with data deinterleaved in hash0-3.
   // Size is 80 for the first stage and 64 for every stage after it.
   for ( int i = 0; i < 16; i++ )
   {
      // Map the hex digit at this position to an algorithm index.
      const char elem = hashOrder[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
      // NOTE(review): there is no default case; an unrecognized digit
      // leaves hash0-3 unchanged for this stage.
      switch ( algo )
      {
         case BLAKE:
            blake512_4way_init( &ctx.blake );
            if ( i == 0 )
               blake512_4way( &ctx.blake, input, size );
            else
            {
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               blake512_4way( &ctx.blake, vhash, size );
            }
            blake512_4way_close( &ctx.blake, vhash );
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case BMW:
            bmw512_4way_init( &ctx.bmw );
            if ( i == 0 )
               bmw512_4way( &ctx.bmw, input, size );
            else
            {
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               bmw512_4way( &ctx.bmw, vhash, size );
            }
            bmw512_4way_close( &ctx.bmw, vhash );
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         // Serial, one lane at a time; the length is passed as size<<3.
         case GROESTL:
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                                 (const char*)in0, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                                 (const char*)in1, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                                 (const char*)in2, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                                 (const char*)in3, size<<3 );
         break;
         case SKEIN:
            skein512_4way_init( &ctx.skein );
            if ( i == 0 )
               skein512_4way( &ctx.skein, input, size );
            else
            {
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               skein512_4way( &ctx.skein, vhash, size );
            }
            skein512_4way_close( &ctx.skein, vhash );
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case JH:
            jh512_4way_init( &ctx.jh );
            if ( i == 0 )
               jh512_4way( &ctx.jh, input, size );
            else
            {
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               jh512_4way( &ctx.jh, vhash, size );
            }
            jh512_4way_close( &ctx.jh, vhash );
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case KECCAK:
            keccak512_4way_init( &ctx.keccak );
            if ( i == 0 )
               keccak512_4way( &ctx.keccak, input, size );
            else
            {
               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
               keccak512_4way( &ctx.keccak, vhash, size );
            }
            keccak512_4way_close( &ctx.keccak, vhash );
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         // 2-way: lanes 0/1 together, then lanes 2/3.
         case LUFFA:
            intrlv_2x128( vhash, in0, in1, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
            dintrlv_2x128( hash0, hash1, vhash, 512 );
            intrlv_2x128( vhash, in2, in3, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
            dintrlv_2x128( hash2, hash3, vhash, 512 );
         break;
         // Serial, one lane at a time.
         case CUBEHASH:
            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
                                  (const byte*)in0, size );
            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
                                  (const byte*)in1, size );
            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
                                  (const byte*)in2, size );
            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
                                  (const byte*)in3, size );
         break;
         // Serial, one lane at a time.
         case SHAVITE:
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in0, size );
            sph_shavite512_close( &ctx.shavite, hash0 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in1, size );
            sph_shavite512_close( &ctx.shavite, hash1 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in2, size );
            sph_shavite512_close( &ctx.shavite, hash2 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in3, size );
            sph_shavite512_close( &ctx.shavite, hash3 );
         break;
         // 2-way; unlike luffa, the length is passed as size<<3.
         case SIMD:
            intrlv_2x128( vhash, in0, in1, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
            dintrlv_2x128( hash0, hash1, vhash, 512 );
            intrlv_2x128( vhash, in2, in3, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
            dintrlv_2x128( hash2, hash3, vhash, 512 );
         break;
         // Serial, one lane at a time; the length is passed as size<<3.
         case ECHO:
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
                                (const BitSequence*)in0, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
                                (const BitSequence*)in1, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
                                (const BitSequence*)in2, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
                                (const BitSequence*)in3, size<<3 );
         break;
         // 4-way 64 bit; always re-interleaves, even as the first stage.
         case HAMSI:
             intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
             hamsi512_4way_init( &ctx.hamsi );
             hamsi512_4way( &ctx.hamsi, vhash, size );
             hamsi512_4way_close( &ctx.hamsi, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         // Serial, one lane at a time.
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in0, size );
             sph_fugue512_close( &ctx.fugue, hash0 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in1, size );
             sph_fugue512_close( &ctx.fugue, hash1 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in2, size );
             sph_fugue512_close( &ctx.fugue, hash2 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in3, size );
             sph_fugue512_close( &ctx.fugue, hash3 );
         break;
         // 4-way 32 bit interleave.
         case SHABAL:
             intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
             shabal512_4way_init( &ctx.shabal );
             shabal512_4way( &ctx.shabal, vhash, size );
             shabal512_4way_close( &ctx.shabal, vhash );
             dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         // Serial, one lane at a time.
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in0, size );
             sph_whirlpool_close( &ctx.whirlpool, hash0 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in1, size );
             sph_whirlpool_close( &ctx.whirlpool, hash1 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in2, size );
             sph_whirlpool_close( &ctx.whirlpool, hash2 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in3, size );
             sph_whirlpool_close( &ctx.whirlpool, hash3 );
         break;
         // 4-way 64 bit; always re-interleaves, even as the first stage.
         case SHA_512:
             intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
             sha512_4way_init( &ctx.sha512 );
             sha512_4way( &ctx.sha512, vhash, size );
             sha512_4way_close( &ctx.sha512, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
      }
      // Every stage after the first consumes a 64 byte digest.
      size = 64;
   }
   // Return the first 256 bits of each lane's final digest.
   // NOTE(review): arithmetic on void* is a GNU C extension.
   memcpy( output,    hash0, 32 );
   memcpy( output+32, hash1, 32 );
   memcpy( output+64, hash2, 32 );
   memcpy( output+96, hash3, 32 );
 }
 // Scan nonces four at a time for the X16RT algorithm.
 //   work:        block header in work->data, target in work->target;
 //                work->data[19] is the starting nonce and is updated.
 //   max_nonce:   scanning stops once the nonce reaches this value.
 //   hashes_done: receives (last nonce - first nonce + 1).
 //   mythr:       calling thread; its id selects the restart flag.
 // The hash order is re-derived whenever the header's ntime differs from
 // the one cached for this thread. Solutions are reported through
 // submit_lane_solution(); the function itself always returns 0.
 int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[4*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t _ALIGN(64) timeHash[4*8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   int thr_id = mythr->id;  // thr_id arg is deprecated
    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   // Byte-swap all 20 header words (8 + 8 + 4).
   casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
   casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   uint32_t ntime = swab32( pdata[17] );
   if ( s_ntime != ntime )
   {
      x16rt_getTimeHash( ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], hashOrder );
      s_ntime = ntime;
      s_implemented = true;
      // Log the first word of the time hash. Passing the array itself
      // would hand a pointer to a %08x conversion, which is a
      // format/argument type mismatch.
      if ( opt_debug && !thr_id )
          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
                               hashOrder, ntime, timeHash[0] );
   }
   if ( !s_implemented )
   {
      applog( LOG_WARNING, "s not implemented");
      sleep(1);
      return 0;
   }
   if ( opt_benchmark )
      ptarget[7] = 0x0cff;
   // Replicate the same header into all four lanes; the nonce word of each
   // lane is overwritten inside the loop.
   uint64_t *edata = (uint64_t*)endiandata;
   intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
   do
   {
      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
               _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
      x16rt_4way_hash( hash, vdata );
      pdata[19] = n;
      for ( int i = 0; i < 4; i++ )
      {
         // Cheap check on the top word first, then the full comparison.
         if ( (hash+(i<<3))[7] <= Htarg )
         if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
         {
            pdata[19] = n+i;
            submit_lane_solution( work, hash+(i<<3), mythr, i );
         }
      }
      n += 4;
   } while ( (  n < max_nonce ) && !(*restart) );
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
 #endif
--- a/algo/x16/x16rt.c
+++ b/algo/x16/x16rt.c
@@ -0,0 +1,239 @@
 #include "x16r-gate.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/groestl/sph_groestl.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include <openssl/sha.h>
 #if defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
 #endif
 // Per-thread cache of the hash order used by x16rt_hash.
 // s_ntime: the byte-swapped header time word (pdata[17]) that hashOrder
 // was last derived from; starts at UINT32_MAX and is updated by
 // scanhash_x16rt whenever the time word changes.
 static __thread uint32_t s_ntime = UINT32_MAX;
 // Set to true by scanhash_x16rt once a hash order has been computed.
 static __thread bool s_implemented = false;
 // Order string filled by x16rt_getAlgoString. x16rt_hash reads the first
 // 16 characters and decodes each one ('0'-'9', then 'A' and up for 10+)
 // into the index of the hash function to run for that round.
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
 // Scratch state for x16rt_hash. Every round initialises the member it
 // needs before using it, so all hash contexts can share the same storage.
 typedef union _x16rt_context_overlay
 {
        sph_blake512_context    blake;
        sph_bmw512_context      bmw;
        sph_skein512_context    skein;
        sph_jh512_context       jh;
        sph_keccak512_context   keccak;
        hashState_luffa         luffa;
        cubehashParam           cube;
        sph_shavite512_context  shavite;
        hashState_sd            simd;
        sph_hamsi512_context    hamsi;
        sph_fugue512_context    fugue;
        sph_shabal512_context   shabal;
        sph_whirlpool_context   whirlpool;
        SHA512_CTX              sha512;
        // Groestl and Echo use the AES-NI implementations when available,
        // otherwise the portable sph ones.
 #if defined(__AES__)
        hashState_groestl       groestl;
        hashState_echo          echo;
 #else
        sph_groestl512_context  groestl;
        sph_echo512_context     echo;
 #endif
 } x16rt_context_overlay;
 // Run the 16-round x16rt hash chain.
 //   output - receives the first 32 bytes of the final 64-byte digest
 //   input  - 80 bytes of block header data
 // The function run in each round is selected by the thread-local
 // hashOrder string. Round 0 hashes the 80 input bytes; every later
 // round hashes the 64-byte digest produced by the round before it.
 void x16rt_hash( void* output, const void* input )
 {
   uint32_t _ALIGN(128) hash[16];
   x16rt_context_overlay ctx;
   const void *data = input;
   int len = 80;
   for ( int step = 0; step < 16; step++ )
   {
      // Decode one order character: '0'-'9' -> 0-9, 'A' and up -> 10+.
      const char c = hashOrder[step];
      uint8_t algo;
      if ( c >= 'A' )
         algo = c - 'A' + 10;
      else
         algo = c - '0';
      switch ( algo )
      {
         case BLAKE:
            sph_blake512_init( &ctx.blake );
            sph_blake512( &ctx.blake, data, len );
            sph_blake512_close( &ctx.blake, hash );
            break;
         case BMW:
            sph_bmw512_init( &ctx.bmw );
            sph_bmw512( &ctx.bmw, data, len );
            sph_bmw512_close( &ctx.bmw, hash );
            break;
         case GROESTL:
 #if defined(__AES__)
            init_groestl( &ctx.groestl, 64 );
            // length is passed in bits
            update_and_final_groestl( &ctx.groestl, (char*)hash,
                                      (const char*)data, len << 3 );
 #else
            sph_groestl512_init( &ctx.groestl );
            sph_groestl512( &ctx.groestl, data, len );
            sph_groestl512_close( &ctx.groestl, hash );
 #endif
            break;
         case SKEIN:
            sph_skein512_init( &ctx.skein );
            sph_skein512( &ctx.skein, data, len );
            sph_skein512_close( &ctx.skein, hash );
            break;
         case JH:
            sph_jh512_init( &ctx.jh );
            sph_jh512( &ctx.jh, data, len );
            sph_jh512_close( &ctx.jh, hash );
            break;
         case KECCAK:
            sph_keccak512_init( &ctx.keccak );
            sph_keccak512( &ctx.keccak, data, len );
            sph_keccak512_close( &ctx.keccak, hash );
            break;
         case LUFFA:
            init_luffa( &ctx.luffa, 512 );
            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
                                    (const BitSequence*)data, len );
            break;
         case CUBEHASH:
            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*)hash,
                                  (const byte*)data, len );
            break;
         case SHAVITE:
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, data, len );
            sph_shavite512_close( &ctx.shavite, hash );
            break;
         case SIMD:
            init_sd( &ctx.simd, 512 );
            // length is passed in bits
            update_final_sd( &ctx.simd, (BitSequence*)hash,
                             (const BitSequence*)data, len << 3 );
            break;
         case ECHO:
 #if defined(__AES__)
            init_echo( &ctx.echo, 512 );
            // length is passed in bits
            update_final_echo( &ctx.echo, (BitSequence*)hash,
                               (const BitSequence*)data, len << 3 );
 #else
            sph_echo512_init( &ctx.echo );
            sph_echo512( &ctx.echo, data, len );
            sph_echo512_close( &ctx.echo, hash );
 #endif
            break;
         case HAMSI:
            sph_hamsi512_init( &ctx.hamsi );
            sph_hamsi512( &ctx.hamsi, data, len );
            sph_hamsi512_close( &ctx.hamsi, hash );
            break;
         case FUGUE:
            sph_fugue512_init( &ctx.fugue );
            sph_fugue512( &ctx.fugue, data, len );
            sph_fugue512_close( &ctx.fugue, hash );
            break;
         case SHABAL:
            sph_shabal512_init( &ctx.shabal );
            sph_shabal512( &ctx.shabal, data, len );
            sph_shabal512_close( &ctx.shabal, hash );
            break;
         case WHIRLPOOL:
            sph_whirlpool_init( &ctx.whirlpool );
            sph_whirlpool( &ctx.whirlpool, data, len );
            sph_whirlpool_close( &ctx.whirlpool, hash );
            break;
         case SHA_512:
            SHA512_Init( &ctx.sha512 );
            SHA512_Update( &ctx.sha512, data, len );
            SHA512_Final( (unsigned char*)hash, &ctx.sha512 );
            break;
      }
      // Every round after the first consumes the previous 64-byte digest.
      data = hash;
      len = 64;
   }
   memcpy( output, hash, 32 );
 }
 // Scan a range of nonces for an x16rt hash that meets the work target.
 //   work        - job to scan; work->data is the 20-word block header and
 //                 work->target the target to test against
 //   max_nonce   - scanning stops once the nonce reaches this value
 //   hashes_done - out: number of nonces hashed by this call
 //   mythr       - calling thread; its id selects the work_restart flag
 // Always returns 0. Solutions are reported through submit_solution() and
 // scanning continues afterwards. On return pdata[19] holds the next
 // untested nonce.
 int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(128) hash32[8];
   uint32_t _ALIGN(128) endiandata[20];
   uint32_t _ALIGN(64) timeHash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   int thr_id = mythr->id;  // thr_id arg is deprecated
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   // Byte-swap all 20 header words, four at a time.
   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   // The hash order depends only on the header time word, so recompute it
   // only when that word differs from the one cached for this thread.
   uint32_t ntime = swab32( pdata[17] );
   if ( s_ntime != ntime )
   {
      x16rt_getTimeHash( ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], hashOrder );
      s_ntime = ntime;
      s_implemented = true;
      // Log the first word of the time hash: %08x takes an unsigned int,
      // not the array itself.
      if ( opt_debug && !thr_id )
          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
                               hashOrder, ntime, timeHash[0] );
   }
   if ( !s_implemented )
   {
      applog( LOG_WARNING, "s not implemented");
      sleep(1);
      return 0;
   }
   if ( opt_benchmark )
      ptarget[7] = 0x0cff;
   do
   {
      be32enc( &endiandata[19], nonce );
      x16rt_hash( hash32, endiandata );
      // Cheap check on the top word first, full target test only if it passes.
      if ( hash32[7] <= Htarg )
      if ( fulltest( hash32, ptarget ) && !opt_benchmark )
      {
         pdata[19] = nonce;
         submit_solution( work, hash32, mythr );
      }
      nonce++;
   } while ( nonce < max_nonce && !(*restart) );
   pdata[19] = nonce;
   // nonce was incremented after every hash, so the difference is already
   // the exact number of hashes computed.
   *hashes_done = nonce - first_nonce;
   return 0;
 }
--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
@@ -69,7 +69,7 @@ void sonoa_4way_hash( void *state, const void *input )
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -80,7 +80,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -134,13 +134,13 @@ void sonoa_4way_hash( void *state, const void *input )
 // 2
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -151,7 +151,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -203,7 +203,7 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
@@ -215,7 +215,7 @@ void sonoa_4way_hash( void *state, const void *input )
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -226,7 +226,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -278,13 +278,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -300,13 +300,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );
 // 4
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -317,7 +317,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -369,13 +369,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -390,7 +390,7 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
-     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
@@ -402,7 +402,7 @@ void sonoa_4way_hash( void *state, const void *input )
     hamsi512_4way( &ctx.hamsi, vhashB, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -438,7 +438,7 @@ void sonoa_4way_hash( void *state, const void *input )
     shabal512_4way( &ctx.shabal, vhashB, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -449,7 +449,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -501,13 +501,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -522,13 +522,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
-     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -545,13 +545,13 @@ void sonoa_4way_hash( void *state, const void *input )
 // 6
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -562,7 +562,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -614,13 +614,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -635,13 +635,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
-     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -656,13 +656,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     sha512_4way_init( &ctx.sha512 );
     sha512_4way( &ctx.sha512, vhash, 64 );
     sha512_4way_close( &ctx.sha512, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -679,13 +679,13 @@ void sonoa_4way_hash( void *state, const void *input )
 // 7
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -696,7 +696,7 @@ void sonoa_4way_hash( void *state, const void *input )
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
@@ -748,13 +748,13 @@ void sonoa_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -769,13 +769,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
-     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -790,7 +790,7 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     sha512_4way_init( &ctx.sha512 );
     sha512_4way( &ctx.sha512, vhash, 64 );
@@ -806,7 +806,7 @@ void sonoa_4way_hash( void *state, const void *input )
 int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
 	            uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t lane_hash[8] __attribute__ ((aligned (32)));
     uint32_t *hash7 = &(hash[7<<2]);
@@ -816,7 +816,7 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
     const uint32_t first_nonce = pdata[19];
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
+     int thr_id = mythr->id;
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -68,7 +68,7 @@ void x17_4way_hash( void *state, const void *input )
     bmw512_4way_close( &ctx.bmw, vhash );
     // Serialize
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     // 3 Groestl
     init_groestl( &ctx.groestl, 64 );
@@ -81,7 +81,7 @@ void x17_4way_hash( void *state, const void *input )
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     // Parallellize
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     // 4 Skein parallel 4 way 64 bit 
     skein512_4way_init( &ctx.skein );
@@ -142,13 +142,13 @@ void x17_4way_hash( void *state, const void *input )
                       (const BitSequence *) hash3, 512 );
     // 12 Hamsi parallel 4 way 64 bit
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
-     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     // 13 Fugue serial
     sph_fugue512_init( &ctx.fugue );
@@ -165,13 +165,13 @@ void x17_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );
     // 14 Shabal, parallel 4 way 32 bit
-     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
     // 15 Whirlpool serial
     sph_whirlpool_init( &ctx.whirlpool );
@@ -188,7 +188,7 @@ void x17_4way_hash( void *state, const void *input )
     sph_whirlpool_close( &ctx.whirlpool, hash3 );
     // 16 SHA512 parallel 64 bit 
-     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
     sha512_4way_init( &ctx.sha512 );
     sha512_4way( &ctx.sha512, vhash, 64 );
@@ -205,7 +205,7 @@ void x17_4way_hash( void *state, const void *input )
 int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t lane_hash[8] __attribute__ ((aligned (32)));
     uint32_t *hash7 = &(hash[7<<2]);
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -332,7 +332,7 @@ void xevan_4way_hash( void *output, const void *input )
 int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t hash[4*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[7<<2]);
--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -399,15 +399,15 @@ int scanhash_yescrypt( struct work *work, uint32_t max_nonce,
                be32enc(&endiandata[k], pdata[k]);
        do {
-                be32enc(&endiandata[19], n);
+           be32enc(&endiandata[19], n);
-                yescrypt_hash((char*) endiandata, (char*) vhash, 80);
+           yescrypt_hash((char*) endiandata, (char*) vhash, 80);
-                if (vhash[7] < Htarg && fulltest(vhash, ptarget)) {
+           if (vhash[7] < Htarg && fulltest(vhash, ptarget ) 
-                        work_set_target_ratio( work, vhash );
+               && !opt_benchmark )
-                        *hashes_done = n - first_nonce + 1;
+           {
-                        pdata[19] = n;
+               pdata[19] = n;
-                        return true;
+               submit_solution( work, vhash, mythr );
-                }
+           }
-                n++;
+           n++;
        } while (n < max_nonce && !work_restart[thr_id].restart);
        *hashes_done = n - first_nonce + 1;
--- a/algo/yespower/yespower.c
+++ b/algo/yespower/yespower.c
@@ -53,15 +53,15 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
        for (int k = 0; k < 19; k++)
                be32enc(&endiandata[k], pdata[k]);
        do {
-                be32enc(&endiandata[19], n);
+           be32enc(&endiandata[19], n);
-                yespower_hash((char*) endiandata, (char*) vhash, 80);
+           yespower_hash((char*) endiandata, (char*) vhash, 80);
-                if (vhash[7] < Htarg && fulltest(vhash, ptarget)) {
+           if ( vhash[7] < Htarg && fulltest( vhash, ptarget )
-                        work_set_target_ratio( work, vhash );
+              && !opt_benchmark )
-                        *hashes_done = n - first_nonce + 1;
+           {
-                        pdata[19] = n;
+               pdata[19] = n;
-                        return true;
+               submit_solution( work, vhash, mythr );
-                }
+            }
-                n++;
+            n++;
        } while (n < max_nonce && !work_restart[thr_id].restart);
        *hashes_done = n - first_nonce + 1;
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.5.4.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.6.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.9.5.4'
+PACKAGE_VERSION='3.9.6'
-PACKAGE_STRING='cpuminer-opt 3.9.5.4'
+PACKAGE_STRING='cpuminer-opt 3.9.6'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.9.5.4 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.9.6 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.9.5.4:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.9.6:";;
   esac
  cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.9.5.4
+cpuminer-opt configure 3.9.6
 generated by GNU Autoconf 2.69
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 3.9.5.4, which was
+It was created by cpuminer-opt $as_me 3.9.6, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  $ $0 $@
@@ -2993,7 +2993,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.9.5.4'
+ VERSION='3.9.6'
 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.9.5.4, which was
+This file was extended by cpuminer-opt $as_me 3.9.6, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.9.5.4
+cpuminer-opt config.status 3.9.6
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.9.5.4])
+AC_INIT([cpuminer-opt], [3.9.6])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -1009,8 +1009,7 @@ static int share_result( int result, struct work *null_work,
                       sres, diffstr, share_time, accepted_share_count,
                       rejected_share_count, solved_block_count );
-   if ( have_stratum && result && my_stats.share_diff && my_stats.net_diff
+   if ( have_stratum && result && !opt_quiet )
        && !opt_quiet )
   {
      applog( LOG_NOTICE, "Miner %s %sH/s, Share %s, Latency %d ms.",
                          hr, hr_units, shr, latency );
--- a/miner.h
+++ b/miner.h
@@ -313,6 +313,7 @@ void   applog(int prio, const char *fmt, ...);
 void   restart_threads(void);
 extern json_t *json_rpc_call( CURL *curl, const char *url, const char *userpass,
                	const char *rpc_req, int *curl_err, int flags );
 extern void cbin2hex(char *out, const char *in, size_t len);
 void   bin2hex( char *s, const unsigned char *p, size_t len );
 char  *abin2hex( const unsigned char *p, size_t len );
 bool   hex2bin( unsigned char *p, const char *hexstr, size_t len );
@@ -330,6 +331,7 @@ extern void diff_to_target(uint32_t *target, double diff);
 double hash_target_ratio( uint32_t* hash, uint32_t* target );
 void   work_set_target_ratio( struct work* work, uint32_t* hash );
 void   get_currentalgo( char* buf, int sz );
 bool   has_sha();
 bool   has_aes_ni();
@@ -363,6 +365,14 @@ struct work {
 	char *job_id;
 	size_t xnonce2_len;
 	unsigned char *xnonce2;
   // x16rt
   uint32_t merkleroothash[8];
   uint32_t witmerkleroothash[8];
   uint32_t denom10[8];
   uint32_t denom100[8];
   uint32_t denom1000[8];
   uint32_t denom10000[8];
 } __attribute__ ((aligned (64)));
 struct stratum_job {
@@ -376,9 +386,15 @@ struct stratum_job {
 	unsigned char version[4];
 	unsigned char nbits[4];
 	unsigned char ntime[4];
 	bool clean;
 	double diff;
-        unsigned char extra[64];
+   bool clean;
   // for x16rt
   unsigned char extra[64];
   unsigned char denom10[32];
   unsigned char denom100[32];
   unsigned char denom1000[32];
   unsigned char denom10000[32];
   unsigned char proofoffullnode[32];
 } __attribute__ ((aligned (64)));
@@ -498,6 +514,7 @@ enum algos {
 //        ALGO_BLAKE2B,
        ALGO_BLAKE2S,     
        ALGO_BMW,        
        ALGO_BMW512,
        ALGO_C11,         
        ALGO_CRYPTOLIGHT, 
        ALGO_CRYPTONIGHT,
@@ -555,10 +572,13 @@ enum algos {
        ALGO_X11GOST,
        ALGO_X12,
        ALGO_X13,         
        ALGO_X13BCD,
        ALGO_X13SM3,
        ALGO_X14,        
        ALGO_X15,       
        ALGO_X16R,
        ALGO_X16RT,
        ALGO_X16RT_VEIL,
        ALGO_X16S,
        ALGO_X17,
        ALGO_XEVAN,
@@ -586,6 +606,7 @@ static const char* const algo_names[] = {
 //        "blake2b",
        "blake2s",
        "bmw",
        "bmw512",
        "c11",
        "cryptolight",
        "cryptonight",
@@ -643,10 +664,13 @@ static const char* const algo_names[] = {
        "x11gost",
        "x12",
        "x13",
        "x13bcd",
        "x13sm3",
        "x14",
        "x15",
        "x16r",
        "x16rt",
        "x16rt-veil",
        "x16s",
        "x17",
        "xevan",
@@ -736,6 +760,7 @@ Options:\n\
                          blakecoin     blake256r8\n\
                          blake2s       Blake-2 S\n\
                          bmw           BMW 256\n\
                          bmw512        BMW 512\n\
                          c11           Chaincoin\n\
                          cryptolight   Cryptonight-light\n\
                          cryptonight   Cryptonote legacy\n\
@@ -782,7 +807,7 @@ Options:\n\
                          skein2        Double Skein (Woodcoin)\n\
                          skunk         Signatum (SIGT)\n\
                          sonoa         Sono\n\
-			                 timetravel    timeravel8, Machinecoin (MAC)\n\
+                          timetravel    timeravel8, Machinecoin (MAC)\n\
                          timetravel10  Bitcore (BTX)\n\
                          tribus        Denarius (DNR)\n\
                          vanilla       blake256r8vnl (VCash)\n\
@@ -794,20 +819,23 @@ Options:\n\
                          x11gost       sib (SibCoin)\n\
                          x12           Galaxie Cash (GCH)\n\
                          x13           X13\n\
                          x13bcd        bcd \n\
                          x13sm3        hsr (Hshare)\n\
                          x14           X14\n\
                          x15           X15\n\
                          x16r          Ravencoin (RVN)\n\
                          x16rt         Gincoin (GIN)\n\
                          x16rt-veil    Veil (VEIL)\n\
                          x16s          Pigeoncoin (PGN)\n\
                          x17\n\
                          xevan         Bitsend (BSD)\n\
-                          yescrypt      Globlboost-Y (BSTY)\n\
+                          yescrypt      Globalboost-Y (BSTY)\n\
                          yescryptr8    BitZeny (ZNY)\n\
                          yescryptr16   Eli\n\
                          yescryptr32   WAVI\n\
                          yespower      Cryply\n\
                          yespowerr16   Yenten (YTN)\n\
-			  zr5           Ziftr\n\
+                          zr5           Ziftr\n\
  -o, --url=URL         URL of mining server\n\
  -O, --userpass=U:P    username:password pair for mining server\n\
  -u, --user=USERNAME   username for mining server\n\
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -477,42 +477,42 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
   __m256i s0 = mm256_bswap_32( casti_m256i( src,0 ) );
   __m256i s1 = mm256_bswap_32( casti_m256i( src,1 ) );
   __m128i s2 = mm128_bswap_32( casti_m128i( src,4 ) );
-  const __m256i zero = m256_zero;
+  const __m256i zero  = m256_zero;
-  const __m256i one  = m256_one_32;
+  const __m256i one   = m256_one_32;
-  const __m256i two  = _mm256_add_epi32( one, one );
+  const __m256i two   = _mm256_add_epi32( one, one );
-  const __m256i tre  = _mm256_add_epi32( two, one );
+  const __m256i three = _mm256_add_epi32( two, one );
-  const __m256i four = _mm256_add_epi32( two, two );
+  const __m256i four  = _mm256_add_epi32( two, two );
-  casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, zero );
+  casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, zero  );
-  casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( s0, one  );
+  casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( s0, one   );
-  casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32( s0, two  );
+  casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32( s0, two   );
-  casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32( s0, tre  );
+  casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32( s0, three );
-  casti_m256i( d, 4 ) = _mm256_permutevar8x32_epi32( s0, four );
+  casti_m256i( d, 4 ) = _mm256_permutevar8x32_epi32( s0, four  );
  casti_m256i( d, 5 ) = _mm256_permutevar8x32_epi32( s0,
-                                       _mm256_add_epi32( four, one ) );
+                                       _mm256_add_epi32( four, one   ) );
  casti_m256i( d, 6 ) = _mm256_permutevar8x32_epi32( s0,
-                                       _mm256_add_epi32( four, two ) );
+                                       _mm256_add_epi32( four, two   ) );
  casti_m256i( d, 7 ) = _mm256_permutevar8x32_epi32( s0,
-                                       _mm256_add_epi32( four, tre ) );
+                                       _mm256_add_epi32( four, three ) );
-  casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, zero );
+  casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, zero  );
-  casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32( s1, one  );
+  casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32( s1, one   );
-  casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32( s1, two  );
+  casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32( s1, two   );
-  casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32( s1, tre  );
+  casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32( s1, three );
-  casti_m256i( d,12 ) = _mm256_permutevar8x32_epi32( s1, four );
+  casti_m256i( d,12 ) = _mm256_permutevar8x32_epi32( s1, four  );
  casti_m256i( d,13 ) = _mm256_permutevar8x32_epi32( s1,
-                                       _mm256_add_epi32( four, one ) );
+                                       _mm256_add_epi32( four, one   ) );
  casti_m256i( d,14 ) = _mm256_permutevar8x32_epi32( s1,
-                                       _mm256_add_epi32( four, two  ) );
+                                       _mm256_add_epi32( four, two   ) );
  casti_m256i( d,15 ) = _mm256_permutevar8x32_epi32( s1,
-                                       _mm256_add_epi32( four, tre ) );
+                                       _mm256_add_epi32( four, three ) );
  casti_m256i( d,16 ) = _mm256_permutevar8x32_epi32(
-                             _mm256_castsi128_si256( s2 ), zero );
+                             _mm256_castsi128_si256( s2 ), zero  );
  casti_m256i( d,17 ) = _mm256_permutevar8x32_epi32(
-                             _mm256_castsi128_si256( s2 ), one  );
+                             _mm256_castsi128_si256( s2 ), one   );
  casti_m256i( d,18 ) = _mm256_permutevar8x32_epi32(
-                             _mm256_castsi128_si256( s2 ), two  );
+                             _mm256_castsi128_si256( s2 ), two   );
  casti_m256i( d,19 ) = _mm256_permutevar8x32_epi32( 
-                             _mm256_castsi128_si256( s2 ), tre  );
+                             _mm256_castsi128_si256( s2 ), three );
 }
 #endif   // AVX2
@@ -677,39 +677,39 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, void *src )
 {
  __m512i s0 = mm512_bswap_32( casti_m512i( src, 0 ) );
  __m128i s1 = mm128_bswap_32( casti_m128i( src, 4 ) );
-  const __m512i zero   = m512_zero;
+  const __m512i zero     = m512_zero;
-  const __m512i one    = m512_one_32;
+  const __m512i one      = m512_one_32;
-  const __m512i two    = _mm512_add_epi32( one,   one  );
+  const __m512i two      = _mm512_add_epi32( one,   one   );
-  const __m512i tre    = _mm512_add_epi32( two,   one  );
+  const __m512i three    = _mm512_add_epi32( two,   one   );
-  const __m512i four   = _mm512_add_epi32( two,   two  );
+  const __m512i four     = _mm512_add_epi32( two,   two   );
-  const __m512i eight  = _mm512_add_epi32( four,  four );
+  const __m512i eight    = _mm512_add_epi32( four,  four  );
-  const __m512i eleven = _mm512_add_epi32( eight, tre  );
+  const __m512i eleven   = _mm512_add_epi32( eight, three );
-  casti_m512i( d, 0 ) = _mm512_permutexvar_epi32( s0, zero );
+  casti_m512i( d, 0 ) = _mm512_permutexvar_epi32( s0, zero   );
-  casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( s0, one  );
+  casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( s0, one    );
-  casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( s0, two  );
+  casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( s0, two    );
-  casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( s0, tre  );
+  casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( s0, three  );
-  casti_m512i( d, 4 ) = _mm512_permutexvar_epi32( s0, four );
+  casti_m512i( d, 4 ) = _mm512_permutexvar_epi32( s0, four   );
  casti_m512i( d, 5 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( four, one ) );
+                                    _mm512_add_epi32( four,   one   ) );
  casti_m512i( d, 6 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( four, two ) );
+                                    _mm512_add_epi32( four,   two   ) );
  casti_m512i( d, 7 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( four, tre ) );
+                                    _mm512_add_epi32( four,   three ) );
  casti_m512i( d, 8 ) = _mm512_permutexvar_epi32( s0, eight );
  casti_m512i( d, 9 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eight, one ) );
+                                    _mm512_add_epi32( eight,  one   ) );
  casti_m512i( d,10 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( eight, two ) );
+                                    _mm512_add_epi32( eight,  two   ) );
  casti_m512i( d,11 ) = _mm512_permutexvar_epi32( s0, eleven ); 
  casti_m512i( d,12 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eleven, one ) );
+                                    _mm512_add_epi32( eleven, one   ) );
  casti_m512i( d,13 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eleven, two ) );
+                                    _mm512_add_epi32( eleven, two   ) );
  casti_m512i( d,14 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eleven, tre ) );
+                                    _mm512_add_epi32( eleven, three ) );
  casti_m512i( d,15 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( eleven, four ) );
+                                    _mm512_add_epi32( eleven, four  ) );
  casti_m512i( d,16 ) = _mm512_permutexvar_epi32(
                          _mm512_castsi128_si512( s1 ), zero );
  casti_m512i( d,17 ) = _mm512_permutexvar_epi32(
@@ -717,7 +717,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, void *src )
  casti_m512i( d,18 ) = _mm512_permutexvar_epi32(
                          _mm512_castsi128_si512( s1 ), two  );
  casti_m512i( d,19 ) = _mm512_permutexvar_epi32(
-                          _mm512_castsi128_si512( s1 ), tre  );
+                          _mm512_castsi128_si512( s1 ), three  );
 }
 #endif    // AVX512
@@ -1006,20 +1006,20 @@ static inline void mm512_bswap32_intrlv80_8x64( void *dst, void *src )
   __m512i *d = (__m512i*)dst;
   __m512i s0 = mm512_bswap_32( casti_m512i(src, 0 ) );
   __m128i s1 = mm128_bswap_32( casti_m128i(src, 4 ) );
-  const __m512i zero = m512_zero;
+  const __m512i zero   = m512_zero;
-  const __m512i one  = m512_one_64;
+  const __m512i one    = m512_one_64;
-  const __m512i two  = _mm512_add_epi64( one, one );
+  const __m512i two    = _mm512_add_epi64( one, one );
-  const __m512i tre  = _mm512_add_epi64( two, one );
+  const __m512i three  = _mm512_add_epi64( two, one );
-  const __m512i four = _mm512_add_epi64( two, two );
+  const __m512i four   = _mm512_add_epi64( two, two );
  d[0] = _mm512_permutexvar_epi64( s0, zero );
  d[1] = _mm512_permutexvar_epi64( s0, one  );
  d[2] = _mm512_permutexvar_epi64( s0, two  );
-  d[3] = _mm512_permutexvar_epi64( s0, tre  );
+  d[3] = _mm512_permutexvar_epi64( s0, three  );
  d[4] = _mm512_permutexvar_epi64( s0, four );
-  d[5] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, one ) );
+  d[5] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, one   ) );
-  d[6] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, two ) );
+  d[6] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, two   ) );
-  d[7] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, tre ) );
+  d[7] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, three ) );
  d[8] = _mm512_permutexvar_epi64(
           _mm512_castsi128_si512( s1 ), zero );
  d[9] = _mm512_permutexvar_epi64(
@@ -1296,25 +1296,18 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
 #if defined(__SSE4_1__)
 // No SSE2 implementation.
-#define mm128_intrlv_blend_64( hi, lo ) \
+#define mm128_intrlv_blend_64( hi, lo )   _mm_blend_epi16( hi, lo, 0x0f )
-                _mm_blend_epi16( hi, lo, 0x0f )
+#define mm128_intrlv_blend_32( hi, lo )   _mm_blend_epi16( hi, lo, 0x33 )
 #define mm128_intrlv_blend_32( hi, lo ) \
                _mm_blend_epi16( hi, lo, 0x33 )
 #endif   // SSE4_1
 #if defined(__AVX2__)
-#define mm256_intrlv_blend_128( hi, lo ) \
+#define mm256_intrlv_blend_128( hi, lo )  _mm256_blend_epi32( hi, lo, 0x0f )
-                _mm256_blend_epi32( hi, lo, 0x0f )
+#define mm256_intrlv_blend_64( hi, lo )   _mm256_blend_epi32( hi, lo, 0x33 )
 #define mm256_intrlv_blend_32( hi, lo )   _mm256_blend_epi32( hi, lo, 0x55 )
-#define mm256_intrlv_blend_64( hi, lo ) \
+// Select lanes of 32 byte hash from 2 sources according to control mask.
                _mm256_blend_epi32( hi, lo, 0x33 )
 #define mm256_intrlv_blend_32( hi, lo ) \
           _mm256_blend_epi32( hi, lo, 0x55 )
 // Blend 32 byte lanes of hash from 2 sources according to control mask.
 // macro due to 256 bit value arg.
 #define mm256_blend_hash_4x64( dst, a, b, mask ) \
 do { \
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -358,17 +358,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
 // no SSE2 implementation, no current users
 #define mm128_ror_1x16( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi8(  1, 0,15,14,13,12,11,10 \
+   _mm_shuffle_epi8( v, m128_const_64( 0x01000f0e0d0c0b0a, \
-                                       9, 8, 7, 6, 5, 4, 3, 2 ) )
+                                       0x0908070605040302 ) )
 #define mm128_rol_1x16( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi8( 13,12,11,10, 9, 8, 7, 6, \
+   _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080706, \
-                                       5, 4, 3, 2, 1, 0,15,14 ) )
+                                       0x0504030201000f0e ) )
 #define mm128_ror_1x8( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi8(  0,15,14,13,12,11,10, 9, \
+   _mm_shuffle_epi8( v, m128_const_64( 0x000f0e0d0c0b0a09, \
-                                       8, 7, 6, 5, 4, 3, 2, 1 ) )
+                                       0x0807060504030201 ) )
 #define mm128_rol_1x8( v ) \
-   _mm_shuffle_epi8( v, _mm_set_epi8( 14,13,12,11,10, 9, 8, 7, \
+   _mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
-                                       6, 5, 4, 3, 2, 1, 0,15 ) )
+                                       0x060504030201000f ) )
 #endif  // SSE3
 // Rotate 16 byte (128 bit) vector by c bytes.
@@ -386,12 +386,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
 #define mm128_swap32_64( v )  _mm_shuffle_epi32( v, 0xb1 )
 #define mm128_ror16_64( v )   _mm_shuffle_epi8( v, \
-         _mm_set_epi8(  9, 8,15,14,13,12,11,10,  1, 0, 7, 6, 5, 4, 3, 2 )
+                   m128_const_64( 0x09080f0e0d0c0b0a, 0x0100070605040302 )
 #define mm128_rol16_64( v )   _mm_shuffle_epi8( v, \
-              _mm_set_epi8( 13,12,11,10, 9, 8,15,14,  5, 4, 3, 2, 1, 0, 7, 6 )
+                   m128_const_64( 0x0dc0b0a09080f0e, 0x0504030201000706 )
 #define mm128_swap16_32( v )  _mm_shuffle_epi8( v, \
-                      _mm_set_epi8( 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2 )
+                   m128_const_64( 0x0d0c0f0e09080b0a, 0x0504070601000302 )
 //
 // Endian byte swap.
@@ -399,16 +399,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
 #if defined(__SSSE3__)
 #define mm128_bswap_64( v ) \
-   _mm_shuffle_epi8( v, m128_const64(  0x08090a0b0c0d0e0f, \
+   _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
                                       0x0001020304050607 ) )
 #define mm128_bswap_32( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x0c0d0e0f08090a0b, \
                                       0x0405060700010203 ) )
-#define mm128_bswap_16( v ) \
+#define mm128_bswap_16( v ) _mm_shuffle_epi8( \
-   _mm_shuffle_epi8( v, _mm_set_epi8( 14,15,  12,13,  10,11,   8, 9, \
+                   m128_const_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 )
                                       6, 7,   4, 5,   2, 3,   0, 1 ) )
 // 8 byte qword * 8 qwords * 2 lanes = 128 bytes
 #define mm128_block_bswap_64( d, s ) do \
@@ -462,14 +461,14 @@ static inline __m128i mm128_bswap_16( __m128i v )
 static inline void mm128_block_bswap_64( __m128i *d, __m128i *s )
 {
-   d[0] = mm128_bswap_32( s[0] );
+   d[0] = mm128_bswap_64( s[0] );
-   d[1] = mm128_bswap_32( s[1] );
+   d[1] = mm128_bswap_64( s[1] );
-   d[2] = mm128_bswap_32( s[2] );
+   d[2] = mm128_bswap_64( s[2] );
-   d[3] = mm128_bswap_32( s[3] );
+   d[3] = mm128_bswap_64( s[3] );
-   d[4] = mm128_bswap_32( s[4] );
+   d[4] = mm128_bswap_64( s[4] );
-   d[5] = mm128_bswap_32( s[5] );
+   d[5] = mm128_bswap_64( s[5] );
-   d[6] = mm128_bswap_32( s[6] );
+   d[6] = mm128_bswap_64( s[6] );
-   d[7] = mm128_bswap_32( s[7] );
+   d[7] = mm128_bswap_64( s[7] );
 }
 static inline void mm128_block_bswap_32( __m128i *d, __m128i *s )
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -32,6 +32,7 @@
 // set instructions load memory resident constants, this avoids mem.
 // cost 4 pinsert + 1 vinsert, estimate 7 clocks.
 // Avoid using; calling m128_const_64 twice is still faster.
 #define m256_const_64( i3, i2, i1, i0 ) \
   _mm256_insertf128_si256( _mm256_castsi128_si256( m128_const_64( i1, i0 ) ), \
                            m128_const_64( i3, i2 ), 1 )
@@ -50,7 +51,7 @@ static inline __m256i m256_one_64_fn()
  asm( "vpxor %0, %0, %0\n\t"
       "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
       "vpsubq %%ymm1, %0, %0\n\t"
-       :"=x"(a)
+       : "=x"(a)
       :
       : "ymm1" );
  return a;
@@ -63,7 +64,7 @@ static inline __m256i m256_one_32_fn()
  asm( "vpxor %0, %0, %0\n\t"
       "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
       "vpsubd %%ymm1, %0, %0\n\t"
-       :"=x"(a)
+       : "=x"(a)
       :
       : "ymm1" );
  return a;
@@ -76,7 +77,7 @@ static inline __m256i m256_one_16_fn()
  asm( "vpxor %0, %0, %0\n\t"
       "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
       "vpsubw %%ymm1, %0, %0\n\t"
-       :"=x"(a)
+       : "=x"(a)
       :
       : "ymm1" );
  return a;
@@ -89,7 +90,7 @@ static inline __m256i m256_one_8_fn()
  asm( "vpxor %0, %0, %0\n\t"
       "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
       "vpsubb %%ymm1, %0, %0\n\t"
-       :"=x"(a)
+       : "=x"(a)
       :
       : "ymm1" );
  return a;
@@ -100,7 +101,7 @@ static inline __m256i m256_neg1_fn()
 {
   __m256i a;
   asm( "vpcmpeqq %0, %0, %0\n\t"
-        :"=x"(a) );
+        : "=x"(a) );
   return a;
 }
 #define m256_neg1    m256_neg1_fn()
@@ -423,23 +424,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
 // Rotate 256 bit vector by one 16 bit element.     
 #define mm256_ror_1x16( v ) \
-   _mm256_permutexvar_epi16( _mm256_set_epi16( \
+   _mm256_permutexvar_epi16( m256_const_64( \
-         0,15,14,13,12,11,10, 9,   8, 7, 6, 5, 4, 3, 2, 1 ), v )
+                                 0x0000000f000e000d, 0x000c000b000a0009, \
                                 0x0008000700060005, 0x0004000300020001 ), v )
 #define mm256_rol_1x16( v ) \
-   _mm256_permutexvar_epi16( _mm256_set_epi16( \
+   _mm256_permutexvar_epi16( m256_const_64( \
-        14,13,12,11,10, 9, 8, 7,   6, 5, 4, 3, 2, 1, 0,15 ), v )
+                                 0x000e000d000c000b, 0x000a000900080007, \
                                 0x0006000500040003, 0x000200010000000f ), v )
 // Rotate 256 bit vector by one byte.
-#define mm256_ror_1x8( v ) \
+#define mm256_ror_1x8( v ) m256_const_64( \
-   _mm256_permutexvar_epi8( _mm256_set_epi8( \
+                                 0x001f1e1d1c1b1a19, 0x1817161514131211, \
-         0,31,30,29,28,27,26,25,  24,23,22,21,20,19,18,17, \
+                                 0x100f0e0d0c0b0a09, 0x0807060504030201 )
        16,15,14,13,12,11,10, 9,   8, 7, 6, 5, 4, 3, 2, 1 ), v )
-#define mm256_rol_1x8( v ) \
+#define mm256_rol_1x8( v ) m256_const_64( \
-   _mm256_permutexvar_epi8( _mm256_set_epi8( \
+                                 0x1e1d1c1b1a191817, 0x161514131211100f, \
-        30,29,28,27,26,25,24,23,  22,21,20,19,18,17,16,15, \
+                                 0x0e0d0c0b0a090807, 0x060504030201001f )
        14,13,12,11,10, 9, 8, 7,   6, 5, 4, 3, 2, 1, 0,31 ), v )
 #endif  // AVX512
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -503,7 +503,7 @@ static inline __m512i m512_neg1_fn()
                       0x08090A0B, 0x0C0D0E0F,   0x00010203, 0x04050607 ) )
 #define mm512_bswap_32( v ) \
-   _mm512_permutexvar_epi8( v, _mm512_set_epi832( \
+   _mm512_permutexvar_epi8( v, _mm512_set_epi32( \
                       0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \
                       0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \
                       0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \
--- a/util.c
+++ b/util.c
@@ -668,6 +668,15 @@ err_out:
 	return cfg;
 }
 /*
  * Write the lowercase hex encoding of `len` bytes from `in` into `out`.
  *
  * out: destination buffer; needs room for 2*len + 1 chars because every
  *      sprintf call appends a terminating NUL. Nothing is done if NULL.
  * in:  source bytes; each one is converted through uint8_t so bytes with
  *      the high bit set print as exactly two digits.
  * len: number of input bytes. When len is 0 nothing is written to `out`.
  */
 void cbin2hex(char *out, const char *in, size_t len)
 {
    if (!out)
       return;
    // Index with size_t (same type as len) so the counter cannot wrap
    // around before reaching len.
    for (size_t i = 0; i < len; i++)
       sprintf(out + (i * 2), "%02x", (uint8_t)in[i]);
 }
 void bin2hex(char *s, const unsigned char *p, size_t len)
 {
 	for (size_t i = 0; i < len; i++)
@@ -1693,35 +1702,47 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
 static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 {
 	const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime;
-        const char *extradata = NULL;
+   const char *denom10 = NULL, *denom100 = NULL, *denom1000 = NULL,
              *denom10000 = NULL, *prooffullnode = NULL;
   const char *extradata = NULL;
 	size_t coinb1_size, coinb2_size;
 	bool clean, ret = false;
 	int merkle_count, i, p = 0;
 	json_t *merkle_arr;
 	uchar **merkle = NULL;
 	int jsize = json_array_size(params);
-        bool has_claim = ( opt_algo == ALGO_LBRY ) && ( jsize == 10 );
+   bool has_claim = ( opt_algo == ALGO_LBRY ) && ( jsize == 10 );
-        bool has_roots = ( opt_algo == ALGO_PHI2 ) && ( jsize == 10 );
+   bool has_roots = ( opt_algo == ALGO_PHI2 ) && ( jsize == 10 );
-	job_id = json_string_value(json_array_get(params, p++));
+   bool is_veil  = ( opt_algo == ALGO_X16RT_VEIL );
   job_id = json_string_value(json_array_get(params, p++));
 	prevhash = json_string_value(json_array_get(params, p++));
-        if ( has_claim )
+   if ( has_claim )
-        {
+   {
-            extradata = json_string_value(json_array_get(params, p++));
+       extradata = json_string_value(json_array_get(params, p++));
-            if ( !extradata || strlen( extradata ) != 64 ) 
+       if ( !extradata || strlen( extradata ) != 64 ) 
-            {
+       {
-                applog(LOG_ERR, "Stratum notify: invalid claim parameter");
+           applog(LOG_ERR, "Stratum notify: invalid claim parameter");
-                goto out;
+           goto out;
-            }
+       }
-        }
+   }
-        else if ( has_roots )
+   else if ( has_roots )
-       	{
+   {
-            extradata = json_string_value(json_array_get(params, p++));
+       extradata = json_string_value(json_array_get(params, p++));
-            if ( !extradata || strlen( extradata ) != 128 )
+       if ( !extradata || strlen( extradata ) != 128 )
-	    {
+       {
-                applog(LOG_ERR, "Stratum notify: invalid UTXO root parameter");
+           applog(LOG_ERR, "Stratum notify: invalid UTXO root parameter");
-                goto out;
+           goto out;
-            }
+       }
-        }
+   }
   if ( is_veil )
   {
      denom10 = json_string_value(json_array_get(params, p++));
      denom100 = json_string_value(json_array_get(params, p++));
      denom1000 = json_string_value(json_array_get(params, p++));
      denom10000 = json_string_value(json_array_get(params, p++));
      prooffullnode = json_string_value(json_array_get(params, p++));
   }
 	coinb1 = json_string_value(json_array_get(params, p++));
 	coinb2 = json_string_value(json_array_get(params, p++));
@@ -1733,7 +1754,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	nbits = json_string_value(json_array_get(params, p++));
 	stime = json_string_value(json_array_get(params, p++));
 	clean = json_is_true(json_array_get(params, p)); p++;
-
+   
 	if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime ||
 	    strlen(prevhash) != 64 || strlen(version) != 8 ||
 	    strlen(nbits) != 8 || strlen(stime) != 8) {
@@ -1741,8 +1762,22 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 		goto out;
 	}
-        merkle = (uchar**) malloc(merkle_count * sizeof(char *));
+   if ( is_veil )
-	for (i = 0; i < merkle_count; i++) {
+   {
      if ( !denom10 || !denom100 || !denom1000 || !denom10000
        || !prooffullnode || strlen(denom10) != 64 || strlen(denom100) != 64
        || strlen(denom1000) != 64 || strlen(denom10000) != 64
        || strlen(prooffullnode) != 64 )
      {
         applog(LOG_ERR, "Stratum notify: invalid veil parameters");
         goto out;
      }
   }
   if ( merkle_count )
      merkle = (uchar**) malloc(merkle_count * sizeof(char *));
 	for ( i = 0; i < merkle_count; i++ )
   {
 		const char *s = json_string_value(json_array_get(merkle_arr, i));
 		if (!s || strlen(s) != 64) {
 			while (i--)
@@ -1774,6 +1809,15 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
        if (has_claim) hex2bin(sctx->job.extra, extradata, 32);
        if (has_roots) hex2bin(sctx->job.extra, extradata, 64);
   if ( is_veil )
   {
      hex2bin(sctx->job.denom10, denom10, 32);
      hex2bin(sctx->job.denom100, denom100, 32);
      hex2bin(sctx->job.denom1000, denom1000, 32);
      hex2bin(sctx->job.denom10000, denom10000, 32);
      hex2bin(sctx->job.proofoffullnode, prooffullnode, 32);
   }
 	sctx->bloc_height = getblocheight(sctx);
 	for (i = 0; i < sctx->job.merkle_count; i++)