Compare commits

...

9 Commits

Author      SHA1         Message              Date
Jay D Dee   de564ccbde   v3.22.2              2023-04-06 13:38:37 -04:00
Jay D Dee   fcd7727b0d   v3.22.1              2023-03-24 18:29:42 -04:00
Jay D Dee   3dd6787531   v3.22.0              2023-03-21 17:12:51 -04:00
Jay D Dee   cae1ce2ab7   v3.21.5              2023-03-15 12:27:04 -04:00
Jay D Dee   7a91c41d74   v3.21.4              2023-03-13 14:54:38 -04:00
Jay D Dee   c6bc9d67fb   v3.21.3 Unreleased   2023-03-13 03:20:13 -04:00
Jay D Dee   b339450898   v3.21.3              2023-03-11 14:54:49 -05:00
Jay D Dee   fb93160641   v3.21.2              2023-03-03 12:38:31 -05:00
Jay D Dee   520d4d5384   v3.21.1              2023-02-08 22:11:05 -05:00
56 changed files with 2843 additions and 4423 deletions

View File

@@ -37,7 +37,7 @@ SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
openssl 1.1.0e or higher.
znver1 and znver2 should be recognized on most recent version of GCC and
znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
znver3 is available with GCC 11. GCC 11 also includes rocketlake support.
In the meantime here are some suggestions to compile with new CPUs:
"-march=native" is usually the best choice, used by build.sh.

View File

@@ -55,9 +55,6 @@ cpuminer_SOURCES = \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \
algo/blake/blakecoin-4way.c \
algo/blake/decred-gate.c \
algo/blake/decred.c \
algo/blake/decred-4way.c \
algo/blake/pentablake-gate.c \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
@@ -178,6 +175,8 @@ cpuminer_SOURCES = \
algo/sha/sha256t.c \
algo/sha/sha256q-4way.c \
algo/sha/sha256q.c \
algo/sha/sha512256d-4way.c \
algo/sha/sha256dt.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \

View File

@@ -65,10 +65,83 @@ If not what makes it happen or not happen?
Change Log
----------
v3.22.2
Added sha512256d & sha256dt algos.
Fixed intermittent invalid shares lyra2v2 AVX512.
Removed application limits on the number of CPUs and threads, HW and OS limits still apply.
Added a log warning if more threads are defined than active CPUs in affinity mask.
Improved merkle tree memory management for stratum.
Added transaction count to New Work log.
Other small improvements.
v3.22.1
#393 fixed segfault in GBT, regression from v3.22.0.
More efficient 32 bit data interleaving.
v3.22.0
Stratum: faster netdiff calculation.
Merged a few updates from Pooler/cpuminer:
Use CURLOPT_POSTFIELDS in json_rpc_call,
Use CURLINFO_ACTIVESOCKET when supported,
JSONRPC speedup,
Speed up hex2bin function.
Small log improvements, notably more frequent hash rate reports.
Removed decred algo.
v3.21.5
All issues with v3.21.3 & v3.21.4 should be resolved.
Changes since v3.21.2:
#392 #379 #389 Fixed misaligned address segfault solo mining.
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
#392 Fixed conditional mining.
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
Windows binaries no longer support CPU groups,
Windows binaries support CPUs with up to 64 threads.
Small optimizations to serialized vectoring.
v3.21.4 CANCELLED
Reapply selected changes from v3.21.3.
#392 #379 #389 Fixed misaligned address segfault solo mining.
#392 Fixed conditional mining.
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
Windows binaries no longer support CPU groups,
Windows binaries support CPUs with up to 64 threads.
v3.21.3.1 UNRELEASED
Revert to 3.21.2
v3.21.3 CANCELLED
#392 #379 #389 Fixed misaligned address segfault solo mining.
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
#392 Fixed conditional mining.
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
Windows binaries no longer support CPU groups,
Windows binaries support CPUs with up to 64 threads.
Midstate prehash is now centralized, done only once instead of by every thread
for selected algos.
Small optimizations to serialized vectoring.
v3.21.2
Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
Fixed a couple of compiler warnings with gcc-12.
v3.21.1
Fixed a segfault in some obsolete algos.
Small optimizations to Hamsi & Shabal AVX2 & AVX512.
v3.21.0
Added minotaurx algo for stratum only.
Blake256 & sha256 prehash optimised to ignore zero-padded data for AVX2 & AVX512.
Blake256 & sha256 prehash optimized to ignore zero-padded data for AVX2 & AVX512.
Other small improvements.
v3.20.3
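
The v3.22.2 entry above adds the sha512256d and sha256dt algos. As a hedged scalar reference only — not the vectorized code added in algo/sha/sha512256d-4way.c, and assuming from the name that sha512256d is a straight double SHA-512/256 of the input — the hash could be sketched with OpenSSL's EVP API (EVP_sha512_256 requires openssl 1.1.1 or later; the helper name sha512256d_ref is made up for this sketch):

#include <openssl/evp.h>
#include <stddef.h>

// Scalar reference sketch: hash the input with SHA-512/256, then hash the
// 32-byte result again with SHA-512/256.
static void sha512256d_ref( unsigned char out[32], const void *data, size_t len )
{
   unsigned char first[32];
   unsigned int outlen;
   EVP_Digest( data, len, first, &outlen, EVP_sha512_256(), NULL );  // pass 1
   EVP_Digest( first, 32, out, &outlen, EVP_sha512_256(), NULL );    // pass 2
}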

View File

@@ -263,8 +263,6 @@ void init_algo_gate( algo_gate_t* gate )
gate->build_block_header = (void*)&std_build_block_header;
gate->build_extraheader = (void*)&std_build_extraheader;
gate->set_work_data_endian = (void*)&do_nothing;
gate->calc_network_diff = (void*)&std_calc_network_diff;
gate->ready_to_mine = (void*)&std_ready_to_mine;
gate->resync_threads = (void*)&do_nothing;
gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
@@ -308,7 +306,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_BLAKECOIN: rc = register_blakecoin_algo ( gate ); break;
case ALGO_BMW512: rc = register_bmw512_algo ( gate ); break;
case ALGO_C11: rc = register_c11_algo ( gate ); break;
case ALGO_DECRED: rc = register_decred_algo ( gate ); break;
case ALGO_DEEP: rc = register_deep_algo ( gate ); break;
case ALGO_DMD_GR: rc = register_dmd_gr_algo ( gate ); break;
case ALGO_GROESTL: rc = register_groestl_algo ( gate ); break;
@@ -340,9 +337,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_QUBIT: rc = register_qubit_algo ( gate ); break;
case ALGO_SCRYPT: rc = register_scrypt_algo ( gate ); break;
case ALGO_SHA256D: rc = register_sha256d_algo ( gate ); break;
case ALGO_SHA256DT: rc = register_sha256dt_algo ( gate ); break;
case ALGO_SHA256Q: rc = register_sha256q_algo ( gate ); break;
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
@@ -427,7 +426,6 @@ const char* const algo_alias_map[][2] =
{ "blake256r8", "blakecoin" },
{ "blake256r8vnl", "vanilla" },
{ "blake256r14", "blake" },
{ "blake256r14dcr", "decred" },
{ "diamond", "dmd-gr" },
{ "espers", "hmq1725" },
{ "flax", "c11" },

View File

@@ -144,7 +144,7 @@ void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t,
uint32_t*, uint32_t, uint32_t,
unsigned char* );
// Build mining.submit message
@@ -155,19 +155,13 @@ char* ( *malloc_txs_request ) ( struct work* );
// Big endian or little endian
void ( *set_work_data_endian ) ( struct work* );
double ( *calc_network_diff ) ( struct work* );
// Wait for first work
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( int, struct work* );
// No longer needed
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
set_t optimizations;
int ( *get_work_data_size ) ();
@@ -286,8 +280,6 @@ char* std_malloc_txs_request( struct work *work );
// Default is do_nothing, little endian is assumed
void set_work_data_big_endian( struct work *work );
double std_calc_network_diff( struct work *work );
void std_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits,
@@ -297,9 +289,6 @@ void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );
bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id );
int std_get_work_data_size();
// Gate admin functions

View File

@@ -1,74 +0,0 @@
#include "decred-gate.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#include <unistd.h>
#if defined (DECRED_4WAY)
static __thread blake256_4way_context blake_mid;
void decred_hash_4way( void *state, const void *input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
// uint32_t hash0[8] __attribute__ ((aligned (32)));
// uint32_t hash1[8] __attribute__ ((aligned (32)));
// uint32_t hash2[8] __attribute__ ((aligned (32)));
// uint32_t hash3[8] __attribute__ ((aligned (32)));
const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
int tail_len = 180 - DECRED_MIDSTATE_LEN;
blake256_4way_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way_update( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
// copy to buffer guaranteed to be aligned.
memcpy( edata, pdata, 180 );
// use the old way until new way updated for size.
mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {
* noncep = n;
*(noncep+1) = n+1;
*(noncep+2) = n+2;
*(noncep+3) = n+3;
decred_hash_4way( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= HTarget )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[DECRED_NONCE_INDEX] = n+i;
submit_solution( work, hash+(i<<3), mythr );
}
n += 4;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -1,171 +0,0 @@
#include "decred-gate.h"
#include <unistd.h>
#include <memory.h>
#include <string.h>
uint32_t *decred_get_nonceptr( uint32_t *work_data )
{
return &work_data[ DECRED_NONCE_INDEX ];
}
long double decred_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
int m;
long double d = (long double)0x0000ffff / (long double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
for ( m = 29; m < shift; m++ )
d /= 256.0;
if ( shift == 28 )
d *= 256.0; // testnet
if ( opt_debug_diff )
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", (double)d,
shift, bits );
return net_diff;
}
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
{
// some random extradata to make the work unique
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
work->height = work->data[32];
if (!have_longpoll && work->height > *net_blocks + 1)
{
char netinfo[64] = { 0 };
if ( net_diff > 0. )
{
if (net_diff != work->targetdiff)
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
work->targetdiff);
else
sprintf(netinfo, ", diff %.3f", net_diff);
}
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
netinfo);
*net_blocks = work->height - 1;
}
}
void decred_be_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
sctx->xnonce1_size );
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
#if !defined(min)
#define min(a,b) (a>b ? (b) :(a))
#endif
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
uchar merkle_root[64] = { 0 };
uint32_t extraheader[32] = { 0 };
int headersize = 0;
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
int i;
// getwork over stratum, getwork merkle + header passed in coinb1
memcpy(merkle_root, sctx->job.coinbase, 32);
headersize = min((int)sctx->job.coinbase_size - 32,
sizeof(extraheader) );
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = swab32(
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
// for ( i = 0; i < 8; i++ ) // prevhash
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
// for ( i = 0; i < 8; i++ ) // merkle
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
for ( i = 0; i < headersize/4; i++ ) // header
g_work->data[17 + i] = extraheader[i];
// extradata
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
g_work->data[i] = 0;
g_work->data[37] = (rand()*4) << 8;
// block header suffix from coinb2 (stake version)
memcpy( &g_work->data[44],
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
sctx->block_height = g_work->data[32];
//applog_hex(work->data, 180);
//applog_hex(&work->data[36], 36);
}
#undef min
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
// need to regen g_work..
return false;
if ( have_stratum && !work->data[0] && !opt_benchmark )
{
sleep(1);
return false;
}
// extradata: prevent duplicates
work->data[ DECRED_XNONCE_INDEX ] += 1;
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
return true;
}
int decred_get_work_data_size() { return DECRED_DATA_SIZE; }
bool register_decred_algo( algo_gate_t* gate )
{
#if defined(DECRED_4WAY)
four_way_not_tested();
gate->scanhash = (void*)&scanhash_decred_4way;
gate->hash = (void*)&decred_hash_4way;
#else
gate->scanhash = (void*)&scanhash_decred;
gate->hash = (void*)&decred_hash;
#endif
gate->optimizations = AVX2_OPT;
// gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->decode_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->build_extraheader = (void*)&decred_build_extraheader;
gate->ready_to_mine = (void*)&decred_ready_to_mine;
gate->nbits_index = DECRED_NBITS_INDEX;
gate->ntime_index = DECRED_NTIME_INDEX;
gate->nonce_index = DECRED_NONCE_INDEX;
gate->get_work_data_size = (void*)&decred_get_work_data_size;
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
allow_mininginfo = false;
have_gbt = false;
return true;
}
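
The decred_calc_network_diff() removed above implements the usual compact nBits to network difficulty conversion; its comment gives diff 43.281 for nbits 1c05ea29. A standalone worked check of that arithmetic follows (a hedged illustration, not code from this repository; it skips the function's shift == 28 testnet special case):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
   const uint32_t nbits = 0x1c05ea29;
   const uint32_t mantissa = nbits & 0xffffff;  // 0x05ea29
   const int exponent = nbits >> 24;            // 0x1c = 28, the same byte the
                                                // original gets via swab32(nbits) & 0xff
   double d = 65535.0 / (double)mantissa;
   for ( int m = exponent; m < 29; m++ ) d *= 256.0;
   for ( int m = 29; m < exponent; m++ ) d /= 256.0;
   printf( "nbits %08x -> net diff %.3f\n", nbits, d );  // prints ~43.281,
   return 0;                                             // matching the sample comment
}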

View File

@@ -1,36 +0,0 @@
#ifndef __DECRED_GATE_H__
#define __DECRED_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#define DECRED_NBITS_INDEX 29
#define DECRED_NTIME_INDEX 34
#define DECRED_NONCE_INDEX 35
#define DECRED_XNONCE_INDEX 36
#define DECRED_DATA_SIZE 192
#define DECRED_WORK_COMPARE_SIZE 140
#define DECRED_MIDSTATE_LEN 128
#if defined (__AVX2__)
//void blakehash_84way(void *state, const void *input);
//int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
#endif
#if defined(__SSE4_2__)
#define DECRED_4WAY
#endif
#if defined (DECRED_4WAY)
void decred_hash_4way(void *state, const void *input);
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void decred_hash( void *state, const void *input );
int scanhash_decred( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -1,282 +0,0 @@
#include "decred-gate.h"
#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
#include "sph_blake.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#include <unistd.h>
/*
#ifndef min
#define min(a,b) (a>b ? b : a)
#endif
#ifndef max
#define max(a,b) (a<b ? b : a)
#endif
*/
/*
#define DECRED_NBITS_INDEX 29
#define DECRED_NTIME_INDEX 34
#define DECRED_NONCE_INDEX 35
#define DECRED_XNONCE_INDEX 36
#define DECRED_DATA_SIZE 192
#define DECRED_WORK_COMPARE_SIZE 140
*/
static __thread sph_blake256_context blake_mid;
static __thread bool ctx_midstate_done = false;
void decred_hash(void *state, const void *input)
{
// #define MIDSTATE_LEN 128
sph_blake256_context ctx __attribute__ ((aligned (64)));
uint8_t *ending = (uint8_t*) input;
ending += DECRED_MIDSTATE_LEN;
if (!ctx_midstate_done) {
sph_blake256_init(&blake_mid);
sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
ctx_midstate_done = true;
}
memcpy(&ctx, &blake_mid, sizeof(blake_mid));
sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
sph_blake256_close(&ctx, state);
}
void decred_hash_simple(void *state, const void *input)
{
sph_blake256_context ctx;
sph_blake256_init(&ctx);
sph_blake256(&ctx, input, 180);
sph_blake256_close(&ctx, state);
}
int scanhash_decred( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) endiandata[48];
uint32_t _ALIGN(64) hash32[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
// #define DCR_NONCE_OFT32 35
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t n = first_nonce;
ctx_midstate_done = false;
#if 1
memcpy(endiandata, pdata, 180);
#else
for (int k=0; k < (180/4); k++)
be32enc(&endiandata[k], pdata[k]);
#endif
do {
//be32enc(&endiandata[DCR_NONCE_OFT32], n);
endiandata[DECRED_NONCE_INDEX] = n;
decred_hash(hash32, endiandata);
if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
{
pdata[DECRED_NONCE_INDEX] = n;
submit_solution( work, hash32, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[DECRED_NONCE_INDEX] = n;
return 0;
}
/*
uint32_t *decred_get_nonceptr( uint32_t *work_data )
{
return &work_data[ DECRED_NONCE_INDEX ];
}
double decred_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
int m;
double d = (double)0x0000ffff / (double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
for ( m = 29; m < shift; m++ )
d /= 256.0;
if ( shift == 28 )
d *= 256.0; // testnet
if ( opt_debug_diff )
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
shift, bits );
return net_diff;
}
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
{
// some random extradata to make the work unique
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
work->height = work->data[32];
if (!have_longpoll && work->height > *net_blocks + 1)
{
char netinfo[64] = { 0 };
if (net_diff > 0.)
{
if (net_diff != work->targetdiff)
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
work->targetdiff);
else
sprintf(netinfo, ", diff %.3f", net_diff);
}
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
netinfo);
*net_blocks = work->height - 1;
}
}
void decred_be_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
sctx->xnonce1_size );
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
*/
/*
// data shared between gen_merkle_root and build_extraheader.
__thread uint32_t decred_extraheader[32] = { 0 };
__thread int decred_headersize = 0;
void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
{
// getwork over stratum, getwork merkle + header passed in coinb1
memcpy(merkle_root, sctx->job.coinbase, 32);
decred_headersize = min((int)sctx->job.coinbase_size - 32,
sizeof(decred_extraheader) );
memcpy( decred_extraheader, &sctx->job.coinbase[32], decred_headersize);
}
*/
/*
#define min(a,b) (a>b ? (b) :(a))
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
uchar merkle_root[64] = { 0 };
uint32_t extraheader[32] = { 0 };
int headersize = 0;
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
size_t t;
int i;
// getwork over stratum, getwork merkle + header passed in coinb1
memcpy(merkle_root, sctx->job.coinbase, 32);
headersize = min((int)sctx->job.coinbase_size - 32,
sizeof(extraheader) );
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = swab32(
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
// for ( i = 0; i < 8; i++ ) // prevhash
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
// for ( i = 0; i < 8; i++ ) // merkle
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
for ( i = 0; i < headersize/4; i++ ) // header
g_work->data[17 + i] = extraheader[i];
// extradata
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
g_work->data[i] = 0;
g_work->data[37] = (rand()*4) << 8;
// block header suffix from coinb2 (stake version)
memcpy( &g_work->data[44],
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
sctx->bloc_height = g_work->data[32];
//applog_hex(work->data, 180);
//applog_hex(&work->data[36], 36);
}
#undef min
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
// need to regen g_work..
return false;
if ( have_stratum && !work->data[0] && !opt_benchmark )
{
sleep(1);
return false;
}
// extradata: prevent duplicates
work->data[ DECRED_XNONCE_INDEX ] += 1;
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
return true;
}
bool register_decred_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_decred;
gate->hash = (void*)&decred_hash;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->decode_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->build_extraheader = (void*)&decred_build_extraheader;
gate->ready_to_mine = (void*)&decred_ready_to_mine;
gate->nbits_index = DECRED_NBITS_INDEX;
gate->ntime_index = DECRED_NTIME_INDEX;
gate->nonce_index = DECRED_NONCE_INDEX;
gate->work_data_size = DECRED_DATA_SIZE;
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
allow_mininginfo = false;
have_gbt = false;
return true;
}
*/
#endif

View File

@@ -1,6 +1,6 @@
#include "pentablake-gate.h"
#if defined (__AVX2__)
#if defined(PENTABLAKE_4WAY)
#include <stdlib.h>
#include <stdint.h>

View File

@@ -4,9 +4,10 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#define PENTABLAKE_4WAY
#endif
// 4way is broken
//#if defined(__AVX2__)
// #define PENTABLAKE_4WAY
//#endif
#if defined(PENTABLAKE_4WAY)
void pentablakehash_4way( void *state, const void *input );

View File

@@ -103,16 +103,16 @@
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2] ); \
V3 = mm128_alignr_64( V[2], V[3] ); \
V6 = mm128_alignr_64( V[6], V[7] ); \
V7 = mm128_alignr_64( V[7], V[6] ); \
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3 ); \
V[3] = mm128_alignr_64( V3, V2 ); \
V[6] = mm128_alignr_64( V7, V6 ); \
V[7] = mm128_alignr_64( V6, V7 ); \
V[2] = mm128_alignr_64( V2, V3, 1 ); \
V[3] = mm128_alignr_64( V3, V2, 1 ); \
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
#else

View File

@@ -24,9 +24,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -46,9 +43,6 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
{
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();

View File

@@ -22,9 +22,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -43,9 +40,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
{
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -54,8 +48,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
// INIT256(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;

View File

@@ -26,9 +26,6 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = m512_zero;
@@ -54,8 +51,8 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
__m512i* in = (__m512i*)input;
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
// if (ctx->chaining == NULL || ctx->buffer == NULL)
// return 1;
for ( i = 0; i < SIZE256; i++ )
{
@@ -179,8 +176,8 @@ int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
// if (ctx->chaining == NULL || ctx->buffer == NULL)
// return 1;
for ( i = 0; i < SIZE256; i++ )
{
@@ -207,9 +204,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
__m256i* in = (__m256i*)input;
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = m256_zero;

View File

@@ -21,9 +21,6 @@
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
memset_zero_512( ctx->chaining, SIZE512 );
memset_zero_512( ctx->buffer, SIZE512 );
@@ -142,9 +139,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
{
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
memset_zero_256( ctx->chaining, SIZE512 );
memset_zero_256( ctx->buffer, SIZE512 );

View File

@@ -73,11 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
be32enc(&endiandata[19], nonce);
myriad_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget))
if (hash[7] <= Htarg )
if ( fulltest(hash, ptarget) && !opt_benchmark )
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
submit_solution( work, hash, mythr );
}
nonce++;

View File

@@ -585,9 +585,8 @@ do { \
t = _mm512_xor_si512( t, c ); \
d = mm512_xoror( a, b, t ); \
t = mm512_xorand( t, a, b ); \
b = mm512_xor3( b, d, t ); \
a = c; \
c = b; \
c = mm512_xor3( b, d, t ); \
b = d; \
d = mm512_not( t ); \
} while (0)
@@ -635,7 +634,7 @@ do { \
#define ROUND_BIG8( alpha ) \
do { \
__m512i t0, t1, t2, t3; \
__m512i t0, t1, t2, t3, t4, t5; \
s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
@@ -662,43 +661,35 @@ do { \
s5 = mm512_swap64_32( s5 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t1, s9, t3 ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t0, s9, t1 ); \
\
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
L8( s1, t1, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
L8( s1, t2, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t1, sB, t3 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t4, sB, t5 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
L8( s3, t1, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \
L8( s3, t2, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
sC = _mm512_mask_blend_epi32( 0x5555, t5, t3 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 ); \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\
@@ -924,10 +915,9 @@ do { \
d = _mm256_xor_si256( d, a ); \
a = _mm256_and_si256( a, b ); \
t = _mm256_xor_si256( t, a ); \
b = _mm256_xor_si256( b, d ); \
b = _mm256_xor_si256( b, t ); \
a = c; \
c = b; \
c = _mm256_xor_si256( b, d ); \
c = _mm256_xor_si256( c, t ); \
b = d; \
d = mm256_not( t ); \
} while (0)
@@ -977,7 +967,7 @@ do { \
#define ROUND_BIG( alpha ) \
do { \
__m256i t0, t1, t2, t3; \
__m256i t0, t1, t2, t3, t4, t5; \
s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
@@ -1004,43 +994,35 @@ do { \
s5 = mm256_swap64_32( s5 ); \
sD = mm256_swap64_32( sD ); \
sE = mm256_swap64_32( sE ); \
t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \
t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \
L( s0, t1, s9, t3 ); \
s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \
s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \
sD = _mm256_blend_epi32( sD, t3, 0x55 ); \
sE = _mm256_blend_epi32( sE, t3, 0xaa ); \
t0 = _mm256_blend_epi32( s4, s5, 0xaa ); \
t1 = _mm256_blend_epi32( sD, sE, 0xaa ); \
L( s0, t0, s9, t1 ); \
\
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \
t2 = _mm256_blend_epi32( s5, s6, 0xaa ); \
t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
L( s1, t1, sA, t3 ); \
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \
sE = _mm256_blend_epi32( sE, t3, 0x55 ); \
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
L( s1, t2, sA, t3 ); \
s5 = _mm256_blend_epi32( t0, t2, 0x55 ); \
sE = _mm256_blend_epi32( t1, t3, 0x55 ); \
\
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \
t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \
L( s2, t1, sB, t3 ); \
s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \
s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \
sF = _mm256_blend_epi32( sF, t3, 0x55 ); \
sC = _mm256_blend_epi32( sC, t3, 0xaa ); \
t4 = _mm256_blend_epi32( s6, s7, 0xaa ); \
t5 = _mm256_blend_epi32( sF, sC, 0xaa ); \
L( s2, t4, sB, t5 ); \
s6 = _mm256_blend_epi32( t2, t4, 0x55 ); \
sF = _mm256_blend_epi32( t3, t5, 0x55 ); \
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
\
t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \
t2 = _mm256_blend_epi32( s7, s4, 0xaa ); \
t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
L( s3, t1, s8, t3 ); \
s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \
s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \
sC = _mm256_blend_epi32( sC, t3, 0x55 ); \
sD = _mm256_blend_epi32( sD, t3, 0xaa ); \
L( s3, t2, s8, t3 ); \
s7 = _mm256_blend_epi32( t4, t2, 0x55 ); \
s4 = _mm256_blend_epi32( t0, t2, 0xaa ); \
sC = _mm256_blend_epi32( t5, t3, 0x55 ); \
sD = _mm256_blend_epi32( t1, t3, 0xaa ); \
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
\

View File

@@ -141,6 +141,13 @@ do { \
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
} while (0)
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), w ); \
} while (0)
/*
* PASSy(n, in) computes pass number "y", for a total of "n", using the
* one-argument macro "in" to access input words. Current state is assumed
@@ -152,22 +159,22 @@ do { \
#define PASS1(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
STEP1(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0) ); \
STEP1(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1) ); \
STEP1(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2) ); \
STEP1(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3) ); \
STEP1(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4) ); \
STEP1(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5) ); \
STEP1(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6) ); \
STEP1(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7) ); \
} \
} while (0)
@@ -605,25 +612,32 @@ do { \
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
} while (0)
#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
mm256_ror_32( x7, 11 ) ), w ); \
} while (0)
#define PASS1_8W(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
STEP1_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0) ); \
STEP1_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1) ); \
STEP1_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2) ); \
STEP1_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3) ); \
STEP1_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4) ); \
STEP1_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5) ); \
STEP1_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6) ); \
STEP1_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7) ); \
} \
} while (0)

View File

@@ -72,11 +72,11 @@ static const uint64_t RC[] = {
// Targetted macros, keccak-macros.h is included for each target.
#define DECL64(x) __m512i x
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR64 XOR
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define NOT64(d, s) (d = mm512_not( s ) )
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
@@ -257,14 +257,14 @@ keccak512_8way_close(void *cc, void *dst)
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
} while (0)
#define DECL64(x) __m256i x
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define DECL64(x) __m256i x
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = mm256_not( s ) )
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))

View File

@@ -554,20 +554,10 @@ int luffa_4way_update_close( luffa_4way_context *state,
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
/*
#define MULT2( a0, a1, mask ) \
do { \
__m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
*/
#define MULT2( a0, a1, mask ) \
#define MULT2( a0, a1 ) \
{ \
__m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256( a1, mask ), 16 ) ); \
__m256i b = _mm256_xor_si256( a0, _mm256_shuffle_epi32( \
_mm256_blend_epi32( a1, m256_zero, 0xee ), 16 ) ); \
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
}
@@ -682,7 +672,6 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
__m256i *chainv = state->chainv;
__m256i msg0, msg1;
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
const __m256i MASK = m256_const1_i128( 0xffffffff );
t0 = chainv[0];
t1 = chainv[1];
@@ -696,7 +685,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] );
MULT2( t0, t1, MASK );
MULT2( t0, t1 );
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
@@ -715,66 +704,66 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1], MASK );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3], MASK );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5], MASK );
MULT2( chainv[4], chainv[5] );
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7], MASK );
MULT2( chainv[6], chainv[7] );
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9], MASK );
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
MULT2( chainv[8], chainv[9], MASK );
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7], MASK );
MULT2( chainv[6], chainv[7] );
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5], MASK );
MULT2( chainv[4], chainv[5] );
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3], MASK );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1], MASK );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[3] = mm256_rol_32( chainv[3], 1 );
chainv[5] = mm256_rol_32( chainv[5], 2 );

View File

@@ -19,26 +19,34 @@
*/
#include <string.h>
#include <emmintrin.h>
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#if defined(__SSE4_1__)
#if defined(__AVX512VL__)
#define MULT2( a0, a1 ) \
{ \
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi32( a1, b, 1 ); \
a1 = _mm_alignr_epi32( b, a1, 1 ); \
}
#elif defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0)
#else
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif

View File

@@ -75,7 +75,7 @@ void lyra2rev2_16way_hash( void *state, const void *input )
keccak256_8way_close( &ctx.keccak, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash5, vhash, 256 );
hash12, hash13, hash14, hash15, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );

View File

@@ -3,7 +3,7 @@
#include "lyra2.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z330_wholeMatrix;
static __thread uint64_t* lyra2z330_wholeMatrix;
void lyra2z330_hash(void *state, const void *input, uint32_t height)
{

View File

@@ -146,14 +146,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
__m128i t; \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_vrol256_64( s6, s7 ); \
mm128_vror256_64( s2, s3 ); \
t = mm128_alignr_64( s7, s6, 1 ); \
s6 = mm128_alignr_64( s6, s7, 1 ); \
s7 = t; \
t = mm128_alignr_64( s2, s3, 1 ); \
s2 = mm128_alignr_64( s3, s2, 1 ); \
s3 = t; \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
mm128_vror256_64( s6, s7 ); \
mm128_vrol256_64( s2, s3 );
t = mm128_alignr_64( s6, s7, 1 ); \
s6 = mm128_alignr_64( s7, s6, 1 ); \
s7 = t; \
t = mm128_alignr_64( s3, s2, 1 ); \
s2 = mm128_alignr_64( s2, s3, 1 ); \
s3 = t; \
}
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -4,24 +4,6 @@
#include <string.h>
#include <stdio.h>
long double lbry_calc_network_diff( struct work *work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
long double d = (long double)0x0000ffff / (long double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;
if (opt_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
return d;
}
// std_le should work but it doesn't
void lbry_le_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
@@ -41,31 +23,6 @@ void lbry_le_build_stratum_request( char *req, struct work *work,
free(xnonce2str);
}
/*
void lbry_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits )
{
int i;
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = version;
if ( have_stratum )
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( prevhash + i );
else
for (i = 0; i < 8; i++)
g_work->data[ 8-i ] = le32dec( prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( merkle_root + i );
g_work->data[ LBRY_NTIME_INDEX ] = ntime;
g_work->data[ LBRY_NBITS_INDEX ] = nbits;
g_work->data[28] = 0x80000000;
}
*/
void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
unsigned char merkle_root[64] = { 0 };
@@ -112,9 +69,7 @@ bool register_lbry_algo( algo_gate_t* gate )
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#endif
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
// gate->build_block_header = (void*)&build_block_header;
gate->build_extraheader = (void*)&lbry_build_extraheader;
gate->ntime_index = LBRY_NTIME_INDEX;
gate->nbits_index = LBRY_NBITS_INDEX;

View File

@@ -830,7 +830,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N )
}
}
// Working, not up to date, needs stream optimization.
// Working, not up to date, needs stream, shuffle optimizations.
// 4x32 interleaving
static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
{
@@ -937,46 +937,28 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
// 4x memory usage
// Working
// 4x128 interleaving
static void salsa_shuffle_4way_simd128( __m512i *X )
static inline void salsa_shuffle_4way_simd128( __m512i *X )
{
__m512i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
Y0 = _mm512_mask_blend_epi32( 0x1111, X[1], X[0] );
Z0 = _mm512_mask_blend_epi32( 0x4444, X[3], X[2] );
Y1 = _mm512_mask_blend_epi32( 0x1111, X[2], X[1] );
Z1 = _mm512_mask_blend_epi32( 0x4444, X[0], X[3] );
Y2 = _mm512_mask_blend_epi32( 0x1111, X[3], X[2] );
Z2 = _mm512_mask_blend_epi32( 0x4444, X[1], X[0] );
Y3 = _mm512_mask_blend_epi32( 0x1111, X[0], X[3] );
Z3 = _mm512_mask_blend_epi32( 0x4444, X[2], X[1] );
X[0] = _mm512_mask_blend_epi32( 0x3333, Z0, Y0 );
X[1] = _mm512_mask_blend_epi32( 0x3333, Z1, Y1 );
X[2] = _mm512_mask_blend_epi32( 0x3333, Z2, Y2 );
X[3] = _mm512_mask_blend_epi32( 0x3333, Z3, Y3 );
__m512i t0 = _mm512_mask_blend_epi32( 0xaaaa, X[0], X[1] );
__m512i t1 = _mm512_mask_blend_epi32( 0x5555, X[0], X[1] );
__m512i t2 = _mm512_mask_blend_epi32( 0xaaaa, X[2], X[3] );
__m512i t3 = _mm512_mask_blend_epi32( 0x5555, X[2], X[3] );
X[0] = _mm512_mask_blend_epi32( 0xcccc, t0, t2 );
X[1] = _mm512_mask_blend_epi32( 0x6666, t1, t3 );
X[2] = _mm512_mask_blend_epi32( 0x3333, t0, t2 );
X[3] = _mm512_mask_blend_epi32( 0x9999, t1, t3 );
}
static void salsa_unshuffle_4way_simd128( __m512i *X )
static inline void salsa_unshuffle_4way_simd128( __m512i *X )
{
__m512i Y0, Y1, Y2, Y3;
Y0 = _mm512_mask_blend_epi32( 0x8888, X[0], X[1] );
Y1 = _mm512_mask_blend_epi32( 0x1111, X[0], X[1] );
Y2 = _mm512_mask_blend_epi32( 0x2222, X[0], X[1] );
Y3 = _mm512_mask_blend_epi32( 0x4444, X[0], X[1] );
Y0 = _mm512_mask_blend_epi32( 0x4444, Y0, X[2] );
Y1 = _mm512_mask_blend_epi32( 0x8888, Y1, X[2] );
Y2 = _mm512_mask_blend_epi32( 0x1111, Y2, X[2] );
Y3 = _mm512_mask_blend_epi32( 0x2222, Y3, X[2] );
X[0] = _mm512_mask_blend_epi32( 0x2222, Y0, X[3] );
X[1] = _mm512_mask_blend_epi32( 0x4444, Y1, X[3] );
X[2] = _mm512_mask_blend_epi32( 0x8888, Y2, X[3] );
X[3] = _mm512_mask_blend_epi32( 0x1111, Y3, X[3] );
__m512i t0 = _mm512_mask_blend_epi32( 0xcccc, X[0], X[2] );
__m512i t1 = _mm512_mask_blend_epi32( 0x3333, X[0], X[2] );
__m512i t2 = _mm512_mask_blend_epi32( 0x6666, X[1], X[3] );
__m512i t3 = _mm512_mask_blend_epi32( 0x9999, X[1], X[3] );
X[0] = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 );
X[1] = _mm512_mask_blend_epi32( 0x5555, t0, t2 );
X[2] = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 );
X[3] = _mm512_mask_blend_epi32( 0x5555, t1, t3 );
}
static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C)
@@ -1147,46 +1129,28 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N )
// { l1xb, l1xa, l1c9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16]
// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24]
static void salsa_shuffle_2way_simd128( __m256i *X )
static inline void salsa_shuffle_2way_simd128( __m256i *X )
{
__m256i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
Y0 = _mm256_blend_epi32( X[1], X[0], 0x11 );
Z0 = _mm256_blend_epi32( X[3], X[2], 0x44 );
Y1 = _mm256_blend_epi32( X[2], X[1], 0x11 );
Z1 = _mm256_blend_epi32( X[0], X[3], 0x44 );
Y2 = _mm256_blend_epi32( X[3], X[2], 0x11 );
Z2 = _mm256_blend_epi32( X[1], X[0], 0x44 );
Y3 = _mm256_blend_epi32( X[0], X[3], 0x11 );
Z3 = _mm256_blend_epi32( X[2], X[1], 0x44 );
X[0] = _mm256_blend_epi32( Z0, Y0, 0x33 );
X[1] = _mm256_blend_epi32( Z1, Y1, 0x33 );
X[2] = _mm256_blend_epi32( Z2, Y2, 0x33 );
X[3] = _mm256_blend_epi32( Z3, Y3, 0x33 );
__m256i t0 = _mm256_blend_epi32( X[0], X[1], 0xaa );
__m256i t1 = _mm256_blend_epi32( X[0], X[1], 0x55 );
__m256i t2 = _mm256_blend_epi32( X[2], X[3], 0xaa );
__m256i t3 = _mm256_blend_epi32( X[2], X[3], 0x55 );
X[0] = _mm256_blend_epi32( t0, t2, 0xcc );
X[1] = _mm256_blend_epi32( t1, t3, 0x66 );
X[2] = _mm256_blend_epi32( t0, t2, 0x33 );
X[3] = _mm256_blend_epi32( t1, t3, 0x99 );
}
static void salsa_unshuffle_2way_simd128( __m256i *X )
static inline void salsa_unshuffle_2way_simd128( __m256i *X )
{
__m256i Y0, Y1, Y2, Y3;
Y0 = _mm256_blend_epi32( X[0], X[1], 0x88 );
Y1 = _mm256_blend_epi32( X[0], X[1], 0x11 );
Y2 = _mm256_blend_epi32( X[0], X[1], 0x22 );
Y3 = _mm256_blend_epi32( X[0], X[1], 0x44 );
Y0 = _mm256_blend_epi32( Y0, X[2], 0x44 );
Y1 = _mm256_blend_epi32( Y1, X[2], 0x88 );
Y2 = _mm256_blend_epi32( Y2, X[2], 0x11 );
Y3 = _mm256_blend_epi32( Y3, X[2], 0x22 );
X[0] = _mm256_blend_epi32( Y0, X[3], 0x22 );
X[1] = _mm256_blend_epi32( Y1, X[3], 0x44 );
X[2] = _mm256_blend_epi32( Y2, X[3], 0x88 );
X[3] = _mm256_blend_epi32( Y3, X[3], 0x11 );
__m256i t0 = _mm256_blend_epi32( X[0], X[2], 0xcc );
__m256i t1 = _mm256_blend_epi32( X[0], X[2], 0x33 );
__m256i t2 = _mm256_blend_epi32( X[1], X[3], 0x66 );
__m256i t3 = _mm256_blend_epi32( X[1], X[3], 0x99 );
X[0] = _mm256_blend_epi32( t0, t2, 0xaa );
X[1] = _mm256_blend_epi32( t0, t2, 0x55 );
X[2] = _mm256_blend_epi32( t1, t3, 0xaa );
X[3] = _mm256_blend_epi32( t1, t3, 0x55 );
}
static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C)
@@ -2163,7 +2127,7 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
X2 = _mm_blend_epi32( B[1], B[0], 0x4 );
Y3 = _mm_blend_epi32( B[0], B[3], 0x1 );
X3 = _mm_blend_epi32( B[2], B[1], 0x4 );
X0 = _mm_blend_epi32( X0, Y0, 0x3);
X0 = _mm_blend_epi32( X0, Y0, 0x3 );
X1 = _mm_blend_epi32( X1, Y1, 0x3 );
X2 = _mm_blend_epi32( X2, Y2, 0x3 );
X3 = _mm_blend_epi32( X3, Y3, 0x3 );
@@ -2311,91 +2275,34 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
// Double buffered, 2x memory usage
// No interleaving
static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
#if defined(__SSE4_1__)
// __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
#else
// SSE4.1
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
#else // SSE2
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
@@ -2417,7 +2324,7 @@ static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
#endif
}
static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
{
__m128i *XA = (__m128i*)xa;
@@ -2425,67 +2332,22 @@ static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
#if defined(__SSE4_1__)
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
#else // SSE4_1
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
#else // SSE2
@@ -2690,116 +2552,44 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
}
static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
uint32_t *xc )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
__m128i *XC = (__m128i*)xc;
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
#if defined(__SSE4_1__)
__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3, ZC0, ZC1, ZC2, ZC3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
YC0 = _mm_blend_epi32( XC[1], XC[0], 0x1 );
ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
ZC0 = _mm_blend_epi32( XC[3], XC[2], 0x4 );
YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
YC1 = _mm_blend_epi32( XC[2], XC[1], 0x1 );
ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
ZC1 = _mm_blend_epi32( XC[0], XC[3], 0x4 );
YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
YC2 = _mm_blend_epi32( XC[3], XC[2], 0x1 );
ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
ZC2 = _mm_blend_epi32( XC[1], XC[0], 0x4 );
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
YC3 = _mm_blend_epi32( XC[0], XC[3], 0x1 );
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
ZC3 = _mm_blend_epi32( XC[2], XC[1], 0x4 );
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
XC[0] = _mm_blend_epi32( ZC0, YC0, 0x3 );
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
XC[1] = _mm_blend_epi32( ZC1, YC1, 0x3 );
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
XC[2] = _mm_blend_epi32( ZC2, YC2, 0x3 );
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
XC[3] = _mm_blend_epi32( ZC3, YC3, 0x3 );
#else
// SSE4.1
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
YC0 = _mm_blend_epi16( XC[1], XC[0], 0x03 );
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
ZC0 = _mm_blend_epi16( XC[3], XC[2], 0x30 );
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
YC1 = _mm_blend_epi16( XC[2], XC[1], 0x03 );
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
ZC1 = _mm_blend_epi16( XC[0], XC[3], 0x30 );
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
YC2 = _mm_blend_epi16( XC[3], XC[2], 0x03 );
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
ZC2 = _mm_blend_epi16( XC[1], XC[0], 0x30 );
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
YC3 = _mm_blend_epi16( XC[0], XC[3], 0x03 );
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
ZC3 = _mm_blend_epi16( XC[2], XC[1], 0x30 );
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
XC[0] = _mm_blend_epi16( ZC0, YC0, 0x0f );
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
XC[1] = _mm_blend_epi16( ZC1, YC1, 0x0f );
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
XC[2] = _mm_blend_epi16( ZC2, YC2, 0x0f );
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
XC[3] = _mm_blend_epi16( ZC3, YC3, 0x0f );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
t0 = _mm_blend_epi16( XC[0], XC[1], 0xcc );
t1 = _mm_blend_epi16( XC[0], XC[1], 0x33 );
t2 = _mm_blend_epi16( XC[2], XC[3], 0xcc );
t3 = _mm_blend_epi16( XC[2], XC[3], 0x33 );
XC[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XC[1] = _mm_blend_epi16( t1, t3, 0x3c );
XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );
#else // SSE2
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] );
@@ -2829,7 +2619,7 @@ static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
#endif
}
static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
uint32_t* xc )
{
__m128i *XA = (__m128i*)xa;
@@ -2838,91 +2628,30 @@ static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
#if defined(__SSE4_1__)
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
YC0 = _mm_blend_epi32( XC[0], XC[1], 0x8 );
YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
YC1 = _mm_blend_epi32( XC[0], XC[1], 0x1 );
YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
YC2 = _mm_blend_epi32( XC[0], XC[1], 0x2 );
YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
YC3 = _mm_blend_epi32( XC[0], XC[1], 0x4 );
YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
YC0 = _mm_blend_epi32( YC0, XC[2], 0x4 );
YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
YC1 = _mm_blend_epi32( YC1, XC[2], 0x8 );
YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
YC2 = _mm_blend_epi32( YC2, XC[2], 0x1 );
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
YC3 = _mm_blend_epi32( YC3, XC[2], 0x2 );
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
XC[0] = _mm_blend_epi32( YC0, XC[3], 0x2 );
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
XC[1] = _mm_blend_epi32( YC1, XC[3], 0x4 );
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
XC[2] = _mm_blend_epi32( YC2, XC[3], 0x8 );
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
XC[3] = _mm_blend_epi32( YC3, XC[3], 0x1 );
#else // SSE4_1
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
YC0 = _mm_blend_epi16( XC[0], XC[1], 0xc0 );
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
YC1 = _mm_blend_epi16( XC[0], XC[1], 0x03 );
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
YC2 = _mm_blend_epi16( XC[0], XC[1], 0x0c );
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
YC3 = _mm_blend_epi16( XC[0], XC[1], 0x30 );
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
YC0 = _mm_blend_epi16( YC0, XC[2], 0x30 );
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
YC1 = _mm_blend_epi16( YC1, XC[2], 0xc0 );
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
YC2 = _mm_blend_epi16( YC2, XC[2], 0x03 );
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
YC3 = _mm_blend_epi16( YC3, XC[2], 0x0c );
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
XC[0] = _mm_blend_epi16( YC0, XC[3], 0x0c );
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
XC[1] = _mm_blend_epi16( YC1, XC[3], 0x30 );
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
XC[2] = _mm_blend_epi16( YC2, XC[3], 0xc0 );
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
XC[3] = _mm_blend_epi16( YC3, XC[3], 0x03 );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
t0 = _mm_blend_epi16( XC[0], XC[2], 0xf0 );
t1 = _mm_blend_epi16( XC[0], XC[2], 0x0f );
t2 = _mm_blend_epi16( XC[1], XC[3], 0x3c );
t3 = _mm_blend_epi16( XC[1], XC[3], 0xc3 );
XC[0] = _mm_blend_epi16( t0, t2, 0xcc );
XC[1] = _mm_blend_epi16( t0, t2, 0x33 );
XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
XC[3] = _mm_blend_epi16( t1, t3, 0x33 );
#else // SSE2


@@ -1,270 +0,0 @@
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
/*
* This file contains some functions which implement the external data
* handling and padding for Merkle-Damgard hash functions which follow
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
*
* API: this file is meant to be included, not compiled as a stand-alone
* file. Some macros must be defined:
* RFUN name for the round function
* HASH "short name" for the hash function
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
* LE64 defined for little-endian, 64-bit based (no example yet)
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
* BLEN if defined, length of a message block (in bytes)
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
* SVAL if defined, reference to the context state information
*
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
* this is used for instance for Tiger, which works on 64-bit words but
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
* set, then only one word (64 bits) will be used to encode the input
* message length (in bits), otherwise two words will be used (as in
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
* not PLW1), four 64-bit words will be used to encode the message length
* (in bits). Note that regardless of those settings, only 64-bit message
* lengths are supported (in bits): messages longer than 2 Exabytes will be
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
 * 2 million Terabytes, which is huge).
*
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
* function. This is used for Tiger2, which is identical to Tiger except
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
* of the 0x01 from original Tiger).
*
* The RFUN function is invoked with two arguments, the first pointing to
* aligned data (as a "const void *"), the second being state information
* from the context structure. By default, this state information is the
* "val" field from the context, and this field is assumed to be an array
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
 * The "val" field can have any type, except
* for the output encoding which assumes that it is an array of "sph_u32"
* values. By defining NO_OUTPUT, this last step is deactivated; the
* includer code is then responsible for writing out the hash result. When
* NO_OUTPUT is defined, the third parameter to the "close()" function is
* ignored.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
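In practice this helper is included textually by each hashing module after the macros above are defined. A minimal hypothetical includer might look like the sketch below; the names blake64_4way_round and blake64_4way are invented for illustration, and only the macro contract follows the description above.
// Hypothetical includer sketch: a big-endian, 64-bit, 4-way hash defines its
// round function and public name, then includes this file to generate HASH()
// plus the _addbits_and_close() / _mdclose() helpers.
#define RFUN   blake64_4way_round      // round function: RFUN( buf, state )
#define HASH   blake64_4way            // generates blake64_4way( cc, data, len )
#define BE64   1                       // big-endian, 64-bit words (SHA-512 style)
#include "md_helper.c"                 // path is illustrative
#undef RFUN
#undef HASH
#undef BE64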
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
#undef SPH_BLEN
#undef SPH_WLEN
#if defined BE64 || defined LE64
#define SPH_BLEN 128U
#define SPH_WLEN 8U
#else
#define SPH_BLEN 64U
#define SPH_WLEN 4U
#endif
#ifdef BLEN
#undef SPH_BLEN
#define SPH_BLEN BLEN
#endif
#undef SPH_MAXPAD
#if defined PLW1
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
#elif defined PLW4
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2))
#else
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1))
#endif
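// Example: for a 64-bit big-endian hash in the SHA-512 style (BE64, neither
// PLW1 nor PLW4 defined), SPH_BLEN = 128 and SPH_WLEN = 8, so SPH_MAXPAD is
// 128 - 2*8 = 112 bytes, leaving 16 bytes of the final block for the
// bit-length field.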
#undef SPH_VAL
#undef SPH_NO_OUTPUT
#ifdef SVAL
#define SPH_VAL SVAL
#define SPH_NO_OUTPUT 1
#else
#define SPH_VAL sc->val
#endif
#ifndef CLOSE_ONLY
#ifdef SPH_UPTR
static void
SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
#else
void
HASH ( void *cc, const void *data, size_t len )
#endif
{
SPH_XCAT( HASH, _context ) *sc;
__m256i *vdata = (__m256i*)data;
size_t ptr;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
while ( len > 0 )
{
size_t clen;
clen = SPH_BLEN - ptr;
if ( clen > len )
clen = len;
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
if ( ptr == SPH_BLEN )
{
RFUN( sc->buf, SPH_VAL );
ptr = 0;
}
sc->count += clen;
}
}
#ifdef SPH_UPTR
void
HASH (void *cc, const void *data, size_t len)
{
SPH_XCAT(HASH, _context) *sc;
__m256i *vdata = (__m256i*)data;
unsigned ptr;
if ( len < (2 * SPH_BLEN) )
{
SPH_XCAT(HASH, _short)(cc, data, len);
return;
}
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
if ( ptr > 0 )
{
unsigned t;
t = SPH_BLEN - ptr;
SPH_XCAT( HASH, _short )( cc, data, t );
vdata = vdata + (t>>3);
len -= t;
}
SPH_XCAT( HASH, _short )( cc, vdata, len );
}
#endif
#endif
/*
* Perform padding and produce result. The context is NOT reinitialized
* by this function.
*/
static void
SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
void *dst, unsigned rnum )
{
SPH_XCAT(HASH, _context) *sc;
unsigned ptr, u;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
#ifdef PW01
sc->buf[ptr>>3] = m256_const1_64( 0x100 >> 8 );
#else
sc->buf[ptr>>3] = m256_const1_64( 0x80 );
#endif
ptr += 8;
if ( ptr > SPH_MAXPAD )
{
memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
RFUN( sc->buf, SPH_VAL );
memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
}
else
{
memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
}
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
#elif defined PLW4
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
_mm256_set1_epi64x( sc->count >> 61 );
memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
2 * SPH_WLEN );
#else
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
_mm256_set1_epi64x( sc->count >> 61 );
#endif // PLW
#endif // LE64
RFUN( sc->buf, SPH_VAL );
#ifdef SPH_NO_OUTPUT
(void)dst;
(void)rnum;
(void)u;
#else
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif
}
#endif
}
static void
SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
{
SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
}

algo/sha/sha256dt.c Normal file

@@ -0,0 +1,268 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha-hash-4way.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256DT_16WAY 1
#elif defined(__AVX2__)
#define SHA256DT_8WAY 1
#else
#define SHA256DT_4WAY 1
#endif
#if defined(SHA256DT_16WAY)
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i vdata[32] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i initstate[8] __attribute__ ((aligned (64)));
__m512i midstate1[8] __attribute__ ((aligned (64)));
__m512i midstate2[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm512_bcast_i32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = mm512_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = mm512_bcast_i32( 0x300 );
initstate[0] = mm512_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm512_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm512_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm512_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm512_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm512_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm512_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm512_bcast_i64( 0x9f75c9ad9f75c9ad );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
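// (words 16..18 of the header are constant across the scan; word 19 holds the
// nonce, so it cannot be folded into the prehash)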
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
do
{
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
sha256_16way_transform_le( hash32, block, initstate );
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_8WAY)
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m256i vdata[32] __attribute__ ((aligned (64)));
__m256i block[16] __attribute__ ((aligned (32)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm256_bcast_i32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = mm256_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = mm256_bcast_i32( 0x300 );
// initialize state
initstate[0] = mm256_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm256_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm256_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm256_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm256_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm256_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm256_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm256_bcast_i64( 0x9f75c9ad9f75c9ad );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
do
{
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
sha256_8way_transform_le( hash32, block, initstate );
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_4WAY)
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm128_bcast_i32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = mm128_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = mm128_bcast_i32( 0x300 );
// initialize state
initstate[0] = mm128_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm128_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm128_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm128_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm128_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm128_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm128_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm128_bcast_i64( 0x9f75c9ad9f75c9ad );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate, vdata, initstate );
do
{
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( hash32, block, initstate );
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
bool register_sha256dt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256DT_16WAY)
gate->scanhash = (void*)&scanhash_sha256dt_16way;
#elif defined(SHA256DT_8WAY)
gate->scanhash = (void*)&scanhash_sha256dt_8way;
#else
gate->scanhash = (void*)&scanhash_sha256dt_4way;
#endif
return true;
}
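For reference, all three vector widths above compute the same double hash. The scalar sketch below is illustrative only: sha256_transform_le() is assumed here to be a single-block SHA-256 compression over little-endian 32-bit words with the signature ( state_out, block, state_in ), and the names sha256dt_iv and sha256dt_hash_ref are invented for this sketch.
#include <stdint.h>
#include <string.h>
// Assumed helper: one SHA-256 compression of a 64 byte block of LE words.
void sha256_transform_le( uint32_t *state_out, const uint32_t *block,
                          const uint32_t *state_in );
// Custom IV used by sha256dt (matches the initstate broadcasts above).
static const uint32_t sha256dt_iv[8] =
{
   0xdfa9bf2c, 0xb72074d4, 0x6bb01122, 0xd338e869,
   0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
};
static void sha256dt_hash_ref( uint32_t *hash, const uint32_t *data )
{
   uint32_t mid[8], block[16];
   // Block 1: first 64 bytes of the 80 byte header.
   sha256_transform_le( mid, data, sha256dt_iv );
   // Block 2: last 16 header bytes plus fixed padding. The length field is
   // the non-standard 0x480; standard SHA-256 would use 0x280 for 80 bytes.
   memcpy( block, data + 16, 16 );
   block[ 4] = 0x80000000;
   memset( block + 5, 0, 40 );
   block[15] = 0x480;
   sha256_transform_le( hash, block, mid );
   // Second pass over the 32 byte digest, again from the custom IV, with a
   // non-standard length field of 0x300 instead of 0x100.
   memcpy( block, hash, 32 );
   block[ 8] = 0x80000000;
   memset( block + 9, 0, 24 );
   block[15] = 0x300;
   sha256_transform_le( hash, block, sha256dt_iv );
   // The scanhash loops above byte-swap this result before comparing word 7
   // against the target.
}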

algo/sha/sha512256d-4way.c Normal file

@@ -0,0 +1,221 @@
#include "algo-gate-api.h"
#include "sha-hash-4way.h"
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1
#endif
#if defined(SHA512256D_8WAY)
static void sha512256d_8way_init( sha512_8way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = mm512_bcast_i64( 0x22312194FC2BF72C );
ctx->val[1] = mm512_bcast_i64( 0x9F555FA3C84C64C2 );
ctx->val[2] = mm512_bcast_i64( 0x2393B86B6F53B151 );
ctx->val[3] = mm512_bcast_i64( 0x963877195940EABD );
ctx->val[4] = mm512_bcast_i64( 0x96283EE2A88EFFE3 );
ctx->val[5] = mm512_bcast_i64( 0xBE5E1E2553863992 );
ctx->val[6] = mm512_bcast_i64( 0x2B0199FC2C85B8AA );
ctx->val[7] = mm512_bcast_i64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
sha512_8way_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*8]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i eight = mm512_bcast_i64( 0x0000000800000000 );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do
{
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, vdata, 80 );
sha512_8way_close( &ctx, hash );
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, hash, 32 );
sha512_8way_close( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(SHA512256D_4WAY)
static void sha512256d_4way_init( sha512_4way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = mm256_bcast_i64( 0x22312194FC2BF72C );
ctx->val[1] = mm256_bcast_i64( 0x9F555FA3C84C64C2 );
ctx->val[2] = mm256_bcast_i64( 0x2393B86B6F53B151 );
ctx->val[3] = mm256_bcast_i64( 0x963877195940EABD );
ctx->val[4] = mm256_bcast_i64( 0x96283EE2A88EFFE3 );
ctx->val[5] = mm256_bcast_i64( 0xBE5E1E2553863992 );
ctx->val[6] = mm256_bcast_i64( 0x2B0199FC2C85B8AA );
ctx->val[7] = mm256_bcast_i64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
sha512_4way_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = mm256_bcast_i64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, vdata, 80 );
sha512_4way_close( &ctx, hash );
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, hash, 32 );
sha512_4way_close( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#else
#include "sph_sha2.h"
static const uint64_t H512_256[8] =
{
0x22312194FC2BF72C, 0x9F555FA3C84C64C2,
0x2393B86B6F53B151, 0x963877195940EABD,
0x96283EE2A88EFFE3, 0xBE5E1E2553863992,
0x2B0199FC2C85B8AA, 0x0EB72DDC81C52CA2,
};
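// These are the standard SHA-512/256 initial values (FIPS 180-4); with the
// output truncated to its first 32 bytes, each pass below is a genuine
// SHA-512/256.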
static void sha512256d_init( sph_sha512_context *ctx )
{
memcpy( ctx->val, H512_256, sizeof H512_256 );
ctx->count = 0;
}
int scanhash_sha512256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t hash64[16] __attribute__ ((aligned (64)));   // sph_sha512_close writes a 64 byte digest
uint32_t endiandata[20] __attribute__ ((aligned (64)));
sph_sha512_context ctx;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
do {
be32enc( &endiandata[19], n );
sha512256d_init( &ctx );
sph_sha512( &ctx, endiandata, 80 );
sph_sha512_close( &ctx, hash64 );
sha512256d_init( &ctx );
sph_sha512( &ctx, hash64, 32 );
sph_sha512_close( &ctx, hash64 );
if ( hash64[7] <= Htarg )
if ( fulltest( hash64, ptarget ) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif
bool register_sha512256d_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(SHA512256D_8WAY)
gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)
gate->scanhash = (void*)&scanhash_sha512256d_4way;
#else
gate->scanhash = (void*)&scanhash_sha512256d;
#endif
return true;
};


@@ -33,6 +33,7 @@
#include <stddef.h>
#include <string.h>
// 4way is only used with AVX2, 8way only with AVX512, 16way is not needed.
#ifdef __SSE4_1__
#include "shabal-hash-4way.h"
@@ -44,21 +45,6 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
/*
* Part of this code was automatically generated (the part between
* the "BEGIN" and "END" markers).
*/
#define sM 16
#define C32 SPH_C32
#define T32 SPH_T32
#define O1 13
#define O2 9
#define O3 6
#if defined(__AVX2__)
#define DECL_STATE8 \
@@ -310,72 +296,71 @@ do { \
mm256_swap512_256( BF, CF ); \
} while (0)
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \
_mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
FIVE ) ), THREE ) ) ); \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0)
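// mm256_xor3( a, b, c ) is a ^ b ^ c, and mm256_xorandnot( a, b, c ) is
// assumed to expand to a ^ ( ~b & c ), so the folded form above computes the
// same value as the expanded expression it replaces.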
#define PERM_STEP_0_8 do { \
PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A0, AB, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A1, A0, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( A2, A1, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( A3, A2, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A4, A3, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A5, A4, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( A6, A5, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( A7, A6, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A8, A7, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A9, A8, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( AA, A9, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( AB, AA, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A0, AB, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A1, A0, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( A2, A1, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( A3, A2, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define PERM_STEP_1_8 do { \
PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A4, A3, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A5, A4, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( A6, A5, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( A7, A6, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A8, A7, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A9, A8, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( AA, A9, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( AB, AA, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A0, AB, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A1, A0, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( A2, A1, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( A3, A2, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A4, A3, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A5, A4, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( A6, A5, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( A7, A6, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define PERM_STEP_2_8 do { \
PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A8, A7, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A9, A8, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( AA, A9, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( AB, AA, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A0, AB, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A1, A0, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( A2, A1, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( A3, A2, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A4, A3, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A5, A4, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( A6, A5, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( A7, A6, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A8, A7, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A9, A8, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( AA, A9, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( AB, AA, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define APPLY_P8 \
do { \
@@ -437,8 +422,8 @@ do { \
} while (0)
#define INCR_W8 do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
if ( ( Wlow = Wlow + 1 ) == 0 ) \
Whigh = Whigh + 1; \
} while (0)
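// The T32() masks were dropped because Wlow and Whigh are 32-bit counters
// that wrap naturally on overflow, so the explicit truncation was redundant.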
static void
@@ -650,15 +635,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
shabal_8way_close(cc, ub, n, dst, 16);
}
#endif // AVX2
/*
* We copy the state into local variables, so that the compiler knows
* that it can optimize them at will.
*/
#define DECL_STATE \
__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
A8, A9, AA, AB; \
@@ -888,15 +866,6 @@ do { \
A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
} while (0)
/*
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
(v1) = (v2); \
(v2) = tmp; \
} while (0)
*/
#define SWAP_BC \
do { \
mm128_swap256_128( B0, C0 ); \
@@ -917,18 +886,6 @@ do { \
mm128_swap256_128( BF, CF ); \
} while (0)
/*
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
__m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\
_mm_set1_epi32(5UL) ) \
__m128i t2 = _mm_xor_si128( xa0, xc ); \
xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
_mm_xor_si128( t2, \
_mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
*/
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
@@ -1056,8 +1013,8 @@ do { \
} while (0)
#define INCR_W do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
if ( ( Wlow = Wlow + 1 ) == 0 ) \
Whigh = Whigh + 1; \
} while (0)
/*


@@ -75,7 +75,6 @@ void shabal512_8way_close( void *cc, void *dst );
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#endif
typedef struct {
@@ -97,7 +96,6 @@ void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void shabal512_4way_init( void *cc );
void shabal512_4way_update( void *cc, const void *data, size_t len );
//#define shabal512_4way shabal512_4way_update
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );


@@ -1106,8 +1106,7 @@ skein256_4way_close(void *cc, void *dst)
}
// Do not use with 128 bit data
// Broken for 80 & 128 bytes, use prehash or full
void
skein512_4way_update(void *cc, const void *data, size_t len)
{


@@ -31,18 +31,19 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);
skeinhash(hash64, endiandata);
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);


@@ -34,31 +34,31 @@ void skein2hash(void *output, const void *input)
sph_skein512_close(&ctx_skein, hash);
memcpy(output, hash, 32);
}
int scanhash_skein2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t hash64[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);
skein2hash(hash64, endiandata);
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);


@@ -112,8 +112,9 @@ void timetravel_4way_hash(void *output, const void *input)
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way_update( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen );
// skein512_4way_update( &ctx.skein, vhashA, dataLen );
// skein512_4way_close( &ctx.skein, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;


@@ -118,8 +118,9 @@ void timetravel10_4way_hash(void *output, const void *input)
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way_update( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen );
// skein512_4way_update( &ctx.skein, vhashA, dataLen );
// skein512_4way_close( &ctx.skein, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;


@@ -33,9 +33,10 @@ void polytimos_4way_hash( void *output, const void *input )
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
poly_4way_context_overlay ctx;
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
skein512_4way_full( &ctx.skein, vhash, input, 80 );
// skein512_4way_init( &ctx.skein );
// skein512_4way_update( &ctx.skein, input, 80 );
// skein512_4way_close( &ctx.skein, vhash );
// Need to convert from 64 bit interleaved to 32 bit interleaved.
uint32_t vhash32[16*4];


@@ -38,8 +38,10 @@ void veltor_4way_hash( void *output, const void *input )
veltor_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) );
skein512_4way_update( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// skein512_4way_update( &ctx.skein, input, 80 );
// skein512_4way_close( &ctx.skein, vhash );
skein512_4way_full( &ctx.skein, vhash, input, 80 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_shavite512( &ctx.shavite, hash0, 64 );
@@ -105,7 +107,7 @@ int scanhash_veltor_4way( struct work *work, uint32_t max_nonce,
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) && ! opt_benchmark )
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );


@@ -198,7 +198,7 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
char* data;
data = (char*)malloc( 2 + strlen( denom10_str ) * 4 + 16 * 4
+ strlen( merkleroot_str ) * 3 );
+ strlen( merkleroot_str ) * 3 + 1 );
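// The extra byte holds the terminating NUL that sprintf appends.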
// Build the block header veildatahash in hex
sprintf( data, "%s%s%s%s%s%s%s%s%s%s%s%s",
merkleroot_str, witmerkleroot_str, "04",


@@ -257,6 +257,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
const __m512i eight = m512_const1_64( 8 );
const bool bench = opt_benchmark;
// convert LE32 to LE64
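// mm128_swap64_32 swaps the two 32-bit halves of each 64-bit lane, giving the
// word order expected by the 64-bit interleave used below.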
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
@@ -470,6 +471,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
const __m256i four = m256_const1_64( 4 );
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );


@@ -71,6 +71,11 @@
*/
#undef USE_SSE4_FOR_32BIT
// AVX512 is slow. There isn't enough AVX512 code to make up
// for the reduced clock. AVX512VL, used for rotate & ternary logic on smaller
// vectors, is exempt.
//#define YESPOWER_USE_AVX512 1
#ifdef __SSE2__
/*
* GCC before 4.9 would by default unnecessarily use store/load (without
@@ -124,18 +129,96 @@
#endif
typedef union {
uint32_t w[16];
uint64_t d[8];
uint32_t d[16];
uint64_t q[8];
#ifdef __SSE2__
__m128i q[4];
__m128i m128[4];
#endif
#if defined(__AVX2__)
__m256i m256[2];
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
__m512i m512;
#endif
} salsa20_blk_t;
#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Slow
static const __m512i simd_shuffle_index =
{ 0x0000000500000000, 0x0000000f0000000a,
0x0000000900000004, 0x000000030000000e,
0x0000000d00000008, 0x0000000700000002,
0x000000010000000c, 0x0000000b00000006 };
static const __m512i simd_unshuffle_index =
{ 0x0000000d00000000, 0x000000070000000a,
0x0000000100000004, 0x0000000b0000000e,
0x0000000500000008, 0x0000000f00000002,
0x000000090000000c, 0x0000000300000006 };
#elif defined(__AVX2__)
#if defined(__AVX512VL__)
// alternative when not using 512 bit vectors
static const __m256i simd_shuffle_index =
{ 0x0000000500000000, 0x0000000f0000000a,
0x0000000900000004, 0x000000030000000e };
static const __m256i simd_unshuffle_index =
{ 0x0000000d00000000, 0x000000070000000a,
0x0000000100000004, 0x0000000b0000000e };
#else
static const __m256i simd_shuffle_index =
{ 0x0000000500000000, 0x0000000700000002,
0x0000000100000004, 0x0000000300000006 };
// same index for unshuffle
#endif
#endif
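// The shuffle permutes the 16 Salsa20 state words into the lane order used by
// the SIMD ARX kernel below; salsa20_simd_unshuffle applies the inverse
// permutation.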
static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin,
salsa20_blk_t *Bout)
{
#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
Bout->m512 = _mm512_permutexvar_epi32( simd_shuffle_index, Bin->m512 );
#elif defined(__AVX2__)
#if defined(__AVX512VL__)
Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_shuffle_index,
Bin->m256[1] );
Bout->m256[1] = _mm256_permutex2var_epi32( Bin->m256[1], simd_shuffle_index,
Bin->m256[0] );
#else
__m256i t0 = _mm256_permutevar8x32_epi32( Bin->m256[0], simd_shuffle_index );
__m256i t1 = _mm256_permutevar8x32_epi32( Bin->m256[1], simd_shuffle_index );
Bout->m256[0] = _mm256_blend_epi32( t1, t0, 0x93 );
Bout->m256[1] = _mm256_blend_epi32( t1, t0, 0x6c );
#endif
#elif defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( Bin->m128[0], Bin->m128[1], 0xcc );
__m128i t1 = _mm_blend_epi16( Bin->m128[0], Bin->m128[1], 0x33 );
__m128i t2 = _mm_blend_epi16( Bin->m128[2], Bin->m128[3], 0xcc );
__m128i t3 = _mm_blend_epi16( Bin->m128[2], Bin->m128[3], 0x33 );
Bout->m128[0] = _mm_blend_epi16( t0, t2, 0xf0 );
Bout->m128[1] = _mm_blend_epi16( t1, t3, 0x3c );
Bout->m128[2] = _mm_blend_epi16( t0, t2, 0x0f );
Bout->m128[3] = _mm_blend_epi16( t1, t3, 0xc3 );
#else
#define COMBINE(out, in1, in2) \
Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32);
Bout->q[out] = Bin->d[in1 * 2] | ((uint64_t)Bin->d[in2 * 2 + 1] << 32);
COMBINE(0, 0, 2)
COMBINE(1, 5, 7)
COMBINE(2, 2, 4)
@@ -145,14 +228,51 @@ static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin,
COMBINE(6, 6, 0)
COMBINE(7, 3, 5)
#undef COMBINE
#endif
}
static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
salsa20_blk_t *Bout)
{
#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
Bout->m512 = _mm512_permutexvar_epi32( simd_unshuffle_index, Bin->m512 );
#elif defined(__AVX2__)
#if defined(__AVX512VL__)
Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_unshuffle_index,
Bin->m256[1] );
Bout->m256[1] = _mm256_permutex2var_epi32( Bin->m256[1], simd_unshuffle_index,
Bin->m256[0] );
#else
__m256i t0 = _mm256_permutevar8x32_epi32( Bin->m256[0], simd_shuffle_index );
__m256i t1 = _mm256_permutevar8x32_epi32( Bin->m256[1], simd_shuffle_index );
Bout->m256[0] = _mm256_blend_epi32( t1, t0, 0x39 );
Bout->m256[1] = _mm256_blend_epi32( t1, t0, 0xc6 );
#endif
#elif defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( Bin->m128[0], Bin->m128[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( Bin->m128[0], Bin->m128[2], 0x0f );
__m128i t2 = _mm_blend_epi16( Bin->m128[1], Bin->m128[3], 0x3c );
__m128i t3 = _mm_blend_epi16( Bin->m128[1], Bin->m128[3], 0xc3 );
Bout->m128[0] = _mm_blend_epi16( t0, t2, 0xcc );
Bout->m128[1] = _mm_blend_epi16( t0, t2, 0x33 );
Bout->m128[2] = _mm_blend_epi16( t1, t3, 0xcc );
Bout->m128[3] = _mm_blend_epi16( t1, t3, 0x33 );
#else
#define UNCOMBINE(out, in1, in2) \
Bout->w[out * 2] = Bin->d[in1]; \
Bout->w[out * 2 + 1] = Bin->d[in2] >> 32;
Bout->d[out * 2] = Bin->q[in1]; \
Bout->d[out * 2 + 1] = Bin->q[in2] >> 32;
UNCOMBINE(0, 0, 6)
UNCOMBINE(1, 5, 3)
UNCOMBINE(2, 2, 0)
@@ -162,19 +282,14 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
UNCOMBINE(6, 6, 4)
UNCOMBINE(7, 3, 1)
#undef UNCOMBINE
#endif
}
#ifdef __SSE2__
#define DECL_X \
__m128i X0, X1, X2, X3;
#define DECL_Y \
__m128i Y0, Y1, Y2, Y3;
#define READ_X(in) \
X0 = (in).q[0]; X1 = (in).q[1]; X2 = (in).q[2]; X3 = (in).q[3];
#define WRITE_X(out) \
(out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3;
(out).m128[0] = X0; (out).m128[1] = X1; (out).m128[2] = X2; (out).m128[3] = X3;
// Bit rotation optimization
#if defined(__AVX512VL__)
#define ARX(out, in1, in2, s) \
@@ -221,203 +336,229 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
#define SALSA20_wrapper(out, rounds) { \
__m128i Z0 = X0, Z1 = X1, Z2 = X2, Z3 = X3; \
rounds \
(out).q[0] = X0 = _mm_add_epi32(X0, Z0); \
(out).q[1] = X1 = _mm_add_epi32(X1, Z1); \
(out).q[2] = X2 = _mm_add_epi32(X2, Z2); \
(out).q[3] = X3 = _mm_add_epi32(X3, Z3); \
(out).m128[0] = X0 = _mm_add_epi32( X0, Z0 ); \
(out).m128[1] = X1 = _mm_add_epi32( X1, Z1 ); \
(out).m128[2] = X2 = _mm_add_epi32( X2, Z2 ); \
(out).m128[3] = X3 = _mm_add_epi32( X3, Z3 ); \
}
/**
* Apply the Salsa20/2 core to the block provided in X.
*/
// Not called explicitly, aliased to SALSA20
#define SALSA20_2(out) \
SALSA20_wrapper(out, SALSA20_2ROUNDS)
#define SALSA20_8ROUNDS \
SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS
/**
* Apply the Salsa20/8 core to the block provided in X.
*/
#define SALSA20_8ROUNDS \
SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS
#define SALSA20_8(out) \
SALSA20_wrapper(out, SALSA20_8ROUNDS)
#define XOR_X(in) \
X0 = _mm_xor_si128(X0, (in).q[0]); \
X1 = _mm_xor_si128(X1, (in).q[1]); \
X2 = _mm_xor_si128(X2, (in).q[2]); \
X3 = _mm_xor_si128(X3, (in).q[3]);
#define XOR_X_2(in1, in2) \
X0 = _mm_xor_si128((in1).q[0], (in2).q[0]); \
X1 = _mm_xor_si128((in1).q[1], (in2).q[1]); \
X2 = _mm_xor_si128((in1).q[2], (in2).q[2]); \
X3 = _mm_xor_si128((in1).q[3], (in2).q[3]);
X0 = _mm_xor_si128( X0, (in).m128[0] ); \
X1 = _mm_xor_si128( X1, (in).m128[1] ); \
X2 = _mm_xor_si128( X2, (in).m128[2] ); \
X3 = _mm_xor_si128( X3, (in).m128[3] );
#define XOR_X_WRITE_XOR_Y_2(out, in) \
(out).q[0] = Y0 = _mm_xor_si128((out).q[0], (in).q[0]); \
(out).q[1] = Y1 = _mm_xor_si128((out).q[1], (in).q[1]); \
(out).q[2] = Y2 = _mm_xor_si128((out).q[2], (in).q[2]); \
(out).q[3] = Y3 = _mm_xor_si128((out).q[3], (in).q[3]); \
X0 = _mm_xor_si128(X0, Y0); \
X1 = _mm_xor_si128(X1, Y1); \
X2 = _mm_xor_si128(X2, Y2); \
X3 = _mm_xor_si128(X3, Y3);
(out).m128[0] = Y0 = _mm_xor_si128( (out).m128[0], (in).m128[0] ); \
(out).m128[1] = Y1 = _mm_xor_si128( (out).m128[1], (in).m128[1] ); \
(out).m128[2] = Y2 = _mm_xor_si128( (out).m128[2], (in).m128[2] ); \
(out).m128[3] = Y3 = _mm_xor_si128( (out).m128[3], (in).m128[3] ); \
X0 = _mm_xor_si128( X0, Y0 ); \
X1 = _mm_xor_si128( X1, Y1 ); \
X2 = _mm_xor_si128( X2, Y2 ); \
X3 = _mm_xor_si128( X3, Y3 );
#define INTEGERIFY _mm_cvtsi128_si32(X0)
#else /* !defined(__SSE2__) */
#define DECL_X \
salsa20_blk_t X;
#define DECL_Y \
salsa20_blk_t Y;
#define COPY(out, in) \
(out).d[0] = (in).d[0]; \
(out).d[1] = (in).d[1]; \
(out).d[2] = (in).d[2]; \
(out).d[3] = (in).d[3]; \
(out).d[4] = (in).d[4]; \
(out).d[5] = (in).d[5]; \
(out).d[6] = (in).d[6]; \
(out).d[7] = (in).d[7];
#define READ_X(in) COPY(X, in)
#define WRITE_X(out) COPY(out, X)
/**
* salsa20(B):
* Apply the Salsa20 core to the provided block.
*/
static inline void salsa20(salsa20_blk_t *restrict B,
salsa20_blk_t *restrict Bout, uint32_t doublerounds)
{
salsa20_blk_t X;
#define x X.w
salsa20_simd_unshuffle(B, &X);
do {
#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
/* Operate on columns */
x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9);
x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18);
x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9);
x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18);
x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9);
x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18);
x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9);
x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18);
/* Operate on rows */
x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9);
x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18);
x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9);
x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18);
x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9);
x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18);
x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9);
x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18);
#undef R
} while (--doublerounds);
#undef x
{
uint32_t i;
salsa20_simd_shuffle(&X, Bout);
for (i = 0; i < 16; i += 4) {
B->w[i] = Bout->w[i] += B->w[i];
B->w[i + 1] = Bout->w[i + 1] += B->w[i + 1];
B->w[i + 2] = Bout->w[i + 2] += B->w[i + 2];
B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3];
}
}
}
/**
* Apply the Salsa20/2 core to the block provided in X.
*/
#define SALSA20_2(out) \
salsa20(&X, &out, 1);
/**
* Apply the Salsa20/8 core to the block provided in X.
*/
#define SALSA20_8(out) \
salsa20(&X, &out, 4);
#define XOR(out, in1, in2) \
(out).d[0] = (in1).d[0] ^ (in2).d[0]; \
(out).d[1] = (in1).d[1] ^ (in2).d[1]; \
(out).d[2] = (in1).d[2] ^ (in2).d[2]; \
(out).d[3] = (in1).d[3] ^ (in2).d[3]; \
(out).d[4] = (in1).d[4] ^ (in2).d[4]; \
(out).d[5] = (in1).d[5] ^ (in2).d[5]; \
(out).d[6] = (in1).d[6] ^ (in2).d[6]; \
(out).d[7] = (in1).d[7] ^ (in2).d[7];
#define XOR_X(in) XOR(X, X, in)
#define XOR_X_2(in1, in2) XOR(X, in1, in2)
#define XOR_X_WRITE_XOR_Y_2(out, in) \
XOR(Y, out, in) \
COPY(out, Y) \
XOR(X, X, Y)
#define INTEGERIFY (uint32_t)X.d[0]
#endif
#define INTEGERIFY( X ) _mm_cvtsi128_si32( X )
// AVX512 ternary logic optimization
#if defined(__AVX512VL__)
#define XOR_X_XOR_X( in1, in2 ) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 );
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \
SALSA20(out)
X0 = _mm_ternarylogic_epi32( X0, (in1).m128[0], (in2).m128[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).m128[1], (in2).m128[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).m128[2], (in2).m128[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).m128[3], (in2).m128[3], 0x96 );
#else
#define XOR_X_XOR_X( in1, in2 ) \
XOR_X( in1 ) \
XOR_X( in2 )
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
XOR_X_2( in1, in2 ) \
XOR_X( in3 )
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
XOR_X(in1) \
XOR_X(in2) \
SALSA20( out )
XOR_X( in2 )
#endif
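The 0x96 immediate above selects the three-input XOR truth table, so a single _mm_ternarylogic_epi32 replaces the two-XOR fallback in the #else branch. Below is a minimal sketch, not part of the patch, that checks the equivalence; the function name xor3_matches is hypothetical, and AVX512F/AVX512VL (plus SSE4.1 for the final test) are assumed.
#include <immintrin.h>
#if defined(__AVX512VL__)
// imm8 0x96 = 10010110b: the table is indexed by (a<<2)|(b<<1)|c and is set
// only for odd parity, i.e. a ^ b ^ c.
static inline int xor3_matches( void )
{
   const __m128i a = _mm_set1_epi32( 0x01234567 );
   const __m128i b = _mm_set1_epi32( 0x76543210 );
   const __m128i c = _mm_set1_epi32( 0x0f0f0f0f );
   const __m128i t = _mm_ternarylogic_epi32( a, b, c, 0x96 );    // one instruction
   const __m128i x = _mm_xor_si128( a, _mm_xor_si128( b, c ) );  // two-XOR fallback
   const __m128i d = _mm_xor_si128( t, x );
   return _mm_testz_si128( d, d );   // 1 when the results are identical
}
#endif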
/**
* Apply the Salsa20 core to the block provided in X ^ in.
*/
// General vectored optimizations
#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define READ_X( in ) \
X.m512 = (in).m512;
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X.m512 = _mm512_ternarylogic_epi32( (in1).m512, (in2).m512, (in3).m512, 0x96 );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
{ \
__m128i X0, X1, X2, X3; \
X.m512 = _mm512_ternarylogic_epi32( X.m512, (in1).m512, (in2).m512, 0x96 ); \
X0 = X.m128[0]; \
X1 = X.m128[1]; \
X2 = X.m128[2]; \
X3 = X.m128[3]; \
SALSA20( out ); \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#define SALSA20_XOR_MEM(in, out) \
XOR_X(in) \
SALSA20(out)
{ \
__m128i X0, X1, X2, X3; \
X.m512 = _mm512_xor_si512( X.m512, (in).m512 ); \
X0 = X.m128[0]; \
X1 = X.m128[1]; \
X2 = X.m128[2]; \
X3 = X.m128[3]; \
SALSA20( out ); \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#elif defined(__AVX2__)
#define READ_X( in ) \
X.m256[0] = (in).m256[0]; \
X.m256[1] = (in).m256[1];
#if defined(__AVX512VL__)
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X.m256[0] = _mm256_ternarylogic_epi32( (in1).m256[0], (in2).m256[0], \
(in3).m256[0], 0x96 ); \
X.m256[1] = _mm256_ternarylogic_epi32( (in1).m256[1], (in2).m256[1], \
(in3).m256[1], 0x96 );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
{ \
__m128i X0, X1, X2, X3; \
X.m256[0] = _mm256_ternarylogic_epi32( X.m256[0], (in1).m256[0], \
(in2).m256[0], 0x96 ); \
X.m256[1] = _mm256_ternarylogic_epi32( X.m256[1], (in1).m256[1], \
(in2).m256[1], 0x96 ); \
X0 = X.m128[0]; \
X1 = X.m128[1]; \
X2 = X.m128[2]; \
X3 = X.m128[3]; \
SALSA20( out ); \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#else // AVX2
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X.m256[0] = _mm256_xor_si256( (in1).m256[0], \
_mm256_xor_si256( (in2).m256[0], (in3).m256[0] ) ); \
X.m256[1] = _mm256_xor_si256( (in1).m256[1], \
_mm256_xor_si256( (in2).m256[1], (in3).m256[1] ) );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
{ \
__m128i X0, X1, X2, X3; \
X.m256[0] = _mm256_xor_si256( X.m256[0], \
_mm256_xor_si256( (in1).m256[0], (in2).m256[0] ) ); \
X.m256[1] = _mm256_xor_si256( X.m256[1], \
_mm256_xor_si256( (in1).m256[1], (in2).m256[1] ) ); \
X0 = X.m128[0]; \
X1 = X.m128[1]; \
X2 = X.m128[2]; \
X3 = X.m128[3]; \
SALSA20( out ); \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#endif // AVX512VL else
#define SALSA20_XOR_MEM( in, out ) \
{ \
__m128i X0, X1, X2, X3; \
X.m256[0] = _mm256_xor_si256( X.m256[0], (in).m256[0] ); \
X.m256[1] = _mm256_xor_si256( X.m256[1], (in).m256[1] ); \
X0 = X.m128[0]; \
X1 = X.m128[1]; \
X2 = X.m128[2]; \
X3 = X.m128[3]; \
SALSA20( out ) \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#else // SSE2
#define READ_X(in) \
X.m128[0] = (in).m128[0]; \
X.m128[1] = (in).m128[1]; \
X.m128[2] = (in).m128[2]; \
X.m128[3] = (in).m128[3];
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X.m128[0] = _mm_xor_si128( (in1).m128[0], \
_mm_xor_si128( (in2).m128[0], (in3).m128[0] ) ); \
X.m128[1] = _mm_xor_si128( (in1).m128[1], \
_mm_xor_si128( (in2).m128[1], (in3).m128[1] ) ); \
X.m128[2] = _mm_xor_si128( (in1).m128[2], \
_mm_xor_si128( (in2).m128[2], (in3).m128[2] ) ); \
X.m128[3] = _mm_xor_si128( (in1).m128[3], \
_mm_xor_si128( (in2).m128[3], (in3).m128[3] ) );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
{ \
__m128i X0 = _mm_xor_si128( X.m128[0], \
_mm_xor_si128( (in1).m128[0], (in2).m128[0] ) ); \
__m128i X1 = _mm_xor_si128( X.m128[1], \
_mm_xor_si128( (in1).m128[1], (in2).m128[1] ) ); \
__m128i X2 = _mm_xor_si128( X.m128[2], \
_mm_xor_si128( (in1).m128[2], (in2).m128[2] ) ); \
__m128i X3 = _mm_xor_si128( X.m128[3], \
_mm_xor_si128( (in1).m128[3], (in2).m128[3] ) ); \
SALSA20( out ); \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
// Apply the Salsa20 core to the block provided in X ^ in.
#define SALSA20_XOR_MEM(in, out) \
{ \
__m128i X0 = _mm_xor_si128( X.m128[0], (in).m128[0] ); \
__m128i X1 = _mm_xor_si128( X.m128[1], (in).m128[1] ); \
__m128i X2 = _mm_xor_si128( X.m128[2], (in).m128[2] ); \
__m128i X3 = _mm_xor_si128( X.m128[3], (in).m128[3] ); \
SALSA20( out ) \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#endif // AVX512 elif AVX2 else
#define SALSA20 SALSA20_8
#else /* pass 2 */
@@ -425,7 +566,7 @@ static inline void salsa20(salsa20_blk_t *restrict B,
#define SALSA20 SALSA20_2
#endif
/**
/*
* blockmix_salsa(Bin, Bout):
* Compute Bout = BlockMix_{salsa20, 1}(Bin). The input Bin must be 128
* bytes in length; the output Bout must also be the same size.
@@ -433,29 +574,23 @@ static inline void salsa20(salsa20_blk_t *restrict B,
static inline void blockmix_salsa(const salsa20_blk_t *restrict Bin,
salsa20_blk_t *restrict Bout)
{
DECL_X
salsa20_blk_t X;
READ_X(Bin[1])
SALSA20_XOR_MEM(Bin[0], Bout[0])
SALSA20_XOR_MEM(Bin[1], Bout[1])
READ_X( Bin[1] );
SALSA20_XOR_MEM(Bin[0], Bout[0]);
SALSA20_XOR_MEM(Bin[1], Bout[1]);
}
static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1,
const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout)
{
DECL_X
salsa20_blk_t X;
XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] )
// XOR_X_2(Bin1[1], Bin2[1])
// XOR_X(Bin1[0])
SALSA20_XOR_MEM(Bin2[0], Bout[0])
XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] );
SALSA20_XOR_MEM( Bin2[0], Bout[0] );
XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] );
// Factor out the XOR from salsa20 to do a xor3
XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] )
// XOR_X(Bin1[1])
// SALSA20_XOR_MEM(Bin2[1], Bout[1])
return INTEGERIFY;
return X.d[0];
}
#if _YESPOWER_OPT_C_PASS_ == 1
@@ -490,7 +625,6 @@ typedef struct {
#define DECL_SMASK2REG /* empty */
#define MAYBE_MEMORY_BARRIER /* empty */
#ifdef __SSE2__
/*
* (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs
* starting with Sandy Bridge. Additionally, PSHUFD uses separate source and
@@ -513,28 +647,40 @@ typedef struct {
#if defined(__x86_64__) && \
__GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC)
#ifdef __AVX__
#define MOVQ "vmovq"
#else
/* "movq" would be more correct, but "movd" is supported by older binutils
* due to an error in AMD's spec for x86-64. */
#define MOVQ "movd"
#endif
#define EXTRACT64(X) ({ \
uint64_t result; \
__asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \
result; \
})
#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__)
/* MSVC and Open64 had bugs */
#define EXTRACT64(X) _mm_cvtsi128_si64(X)
#elif defined(__x86_64__) && defined(__SSE4_1__)
/* No known bugs for this intrinsic */
#include <smmintrin.h>
#define EXTRACT64(X) _mm_extract_epi64((X), 0)
#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
/* 32-bit */
#include <smmintrin.h>
#if 0
/* This is currently unused by the code below, which instead uses these two
* intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */
@@ -542,18 +688,24 @@ typedef struct {
((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32))
#endif
#else
/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */
#define EXTRACT64(X) \
((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32))
#endif
#if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__))
/* 64-bit with AVX */
/* Force use of 64-bit AND instead of two 32-bit ANDs */
#undef DECL_SMASK2REG
#if defined(__GNUC__) && !defined(__ICC)
#define DECL_SMASK2REG uint64_t Smask2reg = Smask2;
/* Force use of lower-numbered registers to reduce number of prefixes, relying
* on out-of-order execution and register renaming. */
@@ -561,12 +713,16 @@ typedef struct {
__asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1));
#define FORCE_REGALLOC_2 \
__asm__("" : : "c" (lo));
#else
#else // not GNUC
static volatile uint64_t Smask2var = Smask2;
#define DECL_SMASK2REG uint64_t Smask2reg = Smask2var;
#define FORCE_REGALLOC_1 /* empty */
#define FORCE_REGALLOC_2 /* empty */
#endif
#define PWXFORM_SIMD(X) { \
uint64_t x; \
FORCE_REGALLOC_1 \
@@ -577,14 +733,18 @@ static volatile uint64_t Smask2var = Smask2;
X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \
X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \
}
#elif defined(__x86_64__)
/* 64-bit without AVX. This relies on out-of-order execution and register
* renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g.,
* it runs great on Haswell. */
//#warning "Note: using x86-64 inline assembly for pwxform. That's great."
#undef MAYBE_MEMORY_BARRIER
#define MAYBE_MEMORY_BARRIER \
__asm__("" : : : "memory");
#define PWXFORM_SIMD(X) { \
__m128i H; \
__asm__( \
@@ -600,8 +760,10 @@ static volatile uint64_t Smask2var = Smask2;
: "d" (Smask2), "S" (S0), "D" (S1) \
: "cc", "ax", "cx"); \
}
#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
/* 32-bit with SSE4.1 */
#define PWXFORM_SIMD(X) { \
__m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \
__m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \
@@ -610,8 +772,10 @@ static volatile uint64_t Smask2var = Smask2;
X = _mm_add_epi64(X, s0); \
X = _mm_xor_si128(X, s1); \
}
#else
/* 32-bit without SSE4.1 */
#define PWXFORM_SIMD(X) { \
uint64_t x = EXTRACT64(X) & Smask2; \
__m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \
@@ -620,6 +784,7 @@ static volatile uint64_t Smask2var = Smask2;
X = _mm_add_epi64(X, s0); \
X = _mm_xor_si128(X, s1); \
}
#endif
#define PWXFORM_SIMD_WRITE(X, Sw) \
@@ -649,50 +814,13 @@ static volatile uint64_t Smask2var = Smask2;
PWXFORM_SIMD(X2) \
PWXFORM_SIMD(X3)
#else /* !defined(__SSE2__) */
#define PWXFORM_SIMD(x0, x1) { \
uint64_t x = x0 & Smask2; \
uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \
uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \
x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \
x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \
}
#define PWXFORM_SIMD_WRITE(x0, x1, Sw) \
PWXFORM_SIMD(x0, x1) \
((uint64_t *)(Sw + w))[0] = x0; \
((uint64_t *)(Sw + w))[1] = x1;
#define PWXFORM_ROUND \
PWXFORM_SIMD(X.d[0], X.d[1]) \
PWXFORM_SIMD(X.d[2], X.d[3]) \
PWXFORM_SIMD(X.d[4], X.d[5]) \
PWXFORM_SIMD(X.d[6], X.d[7])
#define PWXFORM_ROUND_WRITE4 \
PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \
PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \
w += 16; \
PWXFORM_SIMD_WRITE(X.d[4], X.d[5], S0) \
PWXFORM_SIMD_WRITE(X.d[6], X.d[7], S1) \
w += 16;
#define PWXFORM_ROUND_WRITE2 \
PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \
PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \
w += 16; \
PWXFORM_SIMD(X.d[4], X.d[5]) \
PWXFORM_SIMD(X.d[6], X.d[7])
#endif
#define PWXFORM \
PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND \
PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND
#define Smask2 Smask2_0_5
#else /* pass 2 */
#else // pass 2
#undef PWXFORM
#define PWXFORM \
@@ -718,23 +846,27 @@ static volatile uint64_t Smask2var = Smask2;
static void blockmix(const salsa20_blk_t *restrict Bin,
salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx)
{
if (unlikely(!ctx)) {
if ( unlikely(!ctx) )
{
blockmix_salsa(Bin, Bout);
return;
}
__m128i X0, X1, X2, X3;
uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
#if _YESPOWER_OPT_C_PASS_ > 1
uint8_t *S2 = ctx->S2;
size_t w = ctx->w;
#endif
size_t i;
DECL_X
/* Convert count of 128-byte blocks to max index of 64-byte block */
r = r * 2 - 1;
READ_X(Bin[r])
X0 = Bin[r].m128[0];
X1 = Bin[r].m128[1];
X2 = Bin[r].m128[2];
X3 = Bin[r].m128[3];
DECL_SMASK2REG
@@ -763,13 +895,13 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
if (unlikely(!ctx))
return blockmix_salsa_xor(Bin1, Bin2, Bout);
__m128i X0, X1, X2, X3;
uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
#if _YESPOWER_OPT_C_PASS_ > 1
uint8_t *S2 = ctx->S2;
size_t w = ctx->w;
#endif
size_t i;
DECL_X
/* Convert count of 128-byte blocks to max index of 64-byte block */
r = r * 2 - 1;
@@ -781,7 +913,10 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
}
#endif
XOR_X_2(Bin1[r], Bin2[r])
X0 = _mm_xor_si128( Bin1[r].m128[0], Bin2[r].m128[0] );
X1 = _mm_xor_si128( Bin1[r].m128[1], Bin2[r].m128[1] );
X2 = _mm_xor_si128( Bin1[r].m128[2], Bin2[r].m128[2] );
X3 = _mm_xor_si128( Bin1[r].m128[3], Bin2[r].m128[3] );
DECL_SMASK2REG
@@ -789,21 +924,13 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
r--;
do {
XOR_X_XOR_X( Bin1[i], Bin2[i] )
// XOR_X(Bin1[i])
// XOR_X(Bin2[i])
PWXFORM
WRITE_X(Bout[i])
XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] )
// XOR_X(Bin1[i + 1])
// XOR_X(Bin2[i + 1])
PWXFORM
if (unlikely(i >= r))
break;
WRITE_X(Bout[i + 1])
i += 2;
} while (1);
i++;
@@ -815,21 +942,20 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
SALSA20(Bout[i])
return INTEGERIFY;
return INTEGERIFY( X0 );
}
static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
salsa20_blk_t *restrict Bin2,
size_t r, pwxform_ctx_t *restrict ctx)
static uint32_t blockmix_xor_save( salsa20_blk_t *restrict Bin1out,
salsa20_blk_t *restrict Bin2, size_t r, pwxform_ctx_t *restrict ctx )
{
__m128i X0, X1, X2, X3;
__m128i Y0, Y1, Y2, Y3;
uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
#if _YESPOWER_OPT_C_PASS_ > 1
uint8_t *S2 = ctx->S2;
size_t w = ctx->w;
#endif
size_t i;
DECL_X
DECL_Y
/* Convert count of 128-byte blocks to max index of 64-byte block */
r = r * 2 - 1;
@@ -841,7 +967,10 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
}
#endif
XOR_X_2(Bin1out[r], Bin2[r])
X0 = _mm_xor_si128( Bin1out[r].m128[0], Bin2[r].m128[0] );
X1 = _mm_xor_si128( Bin1out[r].m128[1], Bin2[r].m128[1] );
X2 = _mm_xor_si128( Bin1out[r].m128[2], Bin2[r].m128[2] );
X3 = _mm_xor_si128( Bin1out[r].m128[3], Bin2[r].m128[3] );
DECL_SMASK2REG
@@ -851,15 +980,11 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i])
PWXFORM
WRITE_X(Bin1out[i])
XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1])
PWXFORM
if (unlikely(i >= r))
break;
if ( unlikely(i >= r) )
break;
WRITE_X(Bin1out[i + 1])
i += 2;
} while (1);
i++;
@@ -871,7 +996,7 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
SALSA20(Bin1out[i])
return INTEGERIFY;
return INTEGERIFY( X0 );
}
#if _YESPOWER_OPT_C_PASS_ == 1
@@ -886,7 +1011,7 @@ static inline uint32_t integerify(const salsa20_blk_t *B, size_t r)
* w[0] here (would be wrong on big-endian). Also, our 32-bit words are
* SIMD-shuffled, but we only care about the least significant 32 bits anyway.
*/
return (uint32_t)B[2 * r - 1].d[0];
return (uint32_t)B[2 * r - 1].q[0];
}
#endif
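The integerify comment above relies on two facts: the cast picks the low 32 bits of the first 64-bit lane, and on little-endian x86 that value equals the first 32-bit word of the same union (d[] and q[] alias the same storage under the renamed members). A small self-contained sketch, not part of the patch, using a stand-in union and a hypothetical function name:
#include <stdint.h>
// Stand-in for the integer views of salsa20_blk_t: d[] are 32-bit words,
// q[] are 64-bit lanes over the same storage.
typedef union { uint32_t d[16]; uint64_t q[8]; } blk_words_t;
static inline int low_word_equivalence( void )
{
   blk_words_t b;
   b.d[0] = 0x11223344;   // least significant 32 bits of q[0] on little-endian
   b.d[1] = 0x55667788;
   return (uint32_t)b.q[0] == b.d[0];   // true on little-endian x86
}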
@@ -915,7 +1040,7 @@ static void smix1(uint8_t *B, size_t r, uint32_t N,
salsa20_blk_t *dst = &X[i];
size_t k;
for (k = 0; k < 16; k++)
tmp->w[k] = src->w[k];
tmp->d[k] = src->d[k];
salsa20_simd_shuffle(tmp, dst);
}
@@ -962,7 +1087,7 @@ static void smix1(uint8_t *B, size_t r, uint32_t N,
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
size_t k;
for (k = 0; k < 16; k++)
tmp->w[k] = src->w[k];
tmp->d[k] = src->d[k];
salsa20_simd_unshuffle(tmp, dst);
}
}
@@ -988,7 +1113,7 @@ static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop,
salsa20_blk_t *dst = &X[i];
size_t k;
for (k = 0; k < 16; k++)
tmp->w[k] = src->w[k];
tmp->d[k] = src->d[k];
salsa20_simd_shuffle(tmp, dst);
}
@@ -1020,7 +1145,7 @@ static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop,
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
size_t k;
for (k = 0; k < 16; k++)
tmp->w[k] = src->w[k];
tmp->d[k] = src->d[k];
salsa20_simd_unshuffle(tmp, dst);
}
}

10
api.c
View File

@@ -336,7 +336,7 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
char inpkey[128] = { 0 };
char seckey[64];
uchar sha1[20];
SHA_CTX ctx;
// SHA_CTX ctx;
if (opt_protocol)
applog(LOG_DEBUG, "clientkey: %s", clientkey);
@@ -346,9 +346,11 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
// SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo="
//sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11");
SHA1_Init(&ctx);
SHA1_Update(&ctx, inpkey, strlen(inpkey));
SHA1_Final(sha1, &ctx);
SHA1( inpkey, strlen(inpkey), sha1 );
// Deprecated in openssl-3
// SHA1_Init(&ctx);
// SHA1_Update(&ctx, inpkey, strlen(inpkey));
// SHA1_Final(sha1, &ctx);
base64_encode(sha1, 20, seckey, sizeof(seckey));
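The one-shot SHA1() call above sidesteps the SHA1_Init/Update/Final sequence deprecated in OpenSSL 3. An alternative, shown here only as a sketch and not part of the patch, is the EVP one-shot interface, which carries no deprecation warning at all; the wrapper name sha1_inpkey is hypothetical.
#include <string.h>
#include <openssl/evp.h>
// One-shot SHA-1 over the whole key string via the EVP interface.
static void sha1_inpkey( const char *inpkey, unsigned char sha1[20] )
{
   unsigned int len = 0;
   EVP_Digest( inpkey, strlen( inpkey ), sha1, &len, EVP_sha1(), NULL );
}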

View File

@@ -4,7 +4,7 @@
# during development. However the information contained may provide compilation
# tips to users.
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 > /dev/null
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake > /dev/null
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
make distclean || echo clean
@@ -17,13 +17,22 @@ make -j 8
strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes
# AVX256 SHA VAES: Intel Core Alderlake, needs gcc-12
#make clean || echo clean
#rm -f config.status
#./autogen.sh || echo done
#CFLAGS="-O3 -march=alderlake -Wall -fno-common" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-alderlake
# Zen4 AVX512 SHA VAES
make clean || echo clean
rm -f config.status
# znver3 needs gcc-11, znver4 ?
#CFLAGS="-O3 -march=znver4 -Wall -fno-common " ./configure --with-curl
#CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
#CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-zen4
@@ -31,8 +40,8 @@ mv cpuminer cpuminer-zen4
# Zen3 AVX2 SHA VAES
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
#CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
#CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-zen3
@@ -80,7 +89,7 @@ make -j 8
strip -s cpuminer
mv cpuminer cpuminer-avx
# SSE4.2 AES: Intel Westmere
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.21.0.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.22.2.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.21.0'
PACKAGE_STRING='cpuminer-opt 3.21.0'
PACKAGE_VERSION='3.22.2'
PACKAGE_STRING='cpuminer-opt 3.22.2'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.21.0 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.22.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.21.0:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.22.2:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.21.0
cpuminer-opt configure 3.22.2
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.21.0, which was
It was created by cpuminer-opt $as_me 3.22.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.21.0'
VERSION='3.22.2'
cat >>confdefs.h <<_ACEOF
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.21.0, which was
This file was extended by cpuminer-opt $as_me 3.22.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6784,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.21.0
cpuminer-opt config.status 3.22.2
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.21.0])
AC_INIT([cpuminer-opt], [3.22.2])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -3,7 +3,7 @@
* Copyright 2012-2014 pooler
* Copyright 2014 Lucas Jones
* Copyright 2014-2016 Tanguy Pruvot
* Copyright 2016-2021 Jay D Dee
* Copyright 2016-2023 Jay D Dee
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
@@ -37,6 +37,7 @@
#include <curl/curl.h>
#include <jansson.h>
#include <openssl/sha.h>
//#include <mm_malloc.h>
#include "sysinfos.c"
#include "algo/sha/sha256d.h"
@@ -120,7 +121,6 @@ static uint64_t opt_affinity = 0xFFFFFFFFFFFFFFFFULL; // default, use all cores
int opt_priority = 0; // deprecated
int num_cpus = 1;
int num_cpugroups = 1; // For Windows
#define max_cpus 256 // max for affinity
char *rpc_url = NULL;
char *rpc_userpass = NULL;
char *rpc_user, *rpc_pass;
@@ -223,8 +223,7 @@ char* lp_id;
static void workio_cmd_free(struct workio_cmd *wc);
// array mapping thread to cpu
static uint8_t thread_affinity_map[ max_cpus ];
static int *thread_affinity_map;
// display affinity mask graphically
static void format_affinity_mask( char *mask_str, uint64_t mask )
@@ -317,8 +316,9 @@ static void affine_to_cpu( struct thr_info *thr )
if ( !ok )
{
last_error = GetLastError();
applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x",
thread, last_error );
if ( !thread )
applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
last_error, thread );
}
}
@@ -430,20 +430,18 @@ static bool work_decode( const json_t *val, struct work *work )
if ( unlikely( !algo_gate.work_decode( work ) ) )
return false;
if ( !allow_mininginfo )
net_diff = algo_gate.calc_network_diff( work );
else
net_diff = hash_to_diff( work->target );
work->targetdiff = net_diff;
stratum_diff = last_targetdiff = work->targetdiff;
// many of these aren't used solo.
net_diff =
work->targetdiff =
stratum_diff =
last_targetdiff = hash_to_diff( work->target );
work->sharediff = 0;
algo_gate.decode_extra_data( work, &net_blocks );
return true;
}
// good alternative for wallet mining, difficulty and net hashrate
// Only used for net_hashrate with GBT/getwork, data is from previous block.
static const char *info_req =
"{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n";
@@ -469,17 +467,14 @@ static bool get_mininginfo( CURL *curl, struct work *work )
// "networkhashps": 56475980
if ( res )
{
// net_diff is a global that is set from the work hash target by
// both getwork and GBT. Don't overwrite it, define a local to override
// the global.
double net_diff = 0.;
double difficulty = 0.;
json_t *key = json_object_get( res, "difficulty" );
if ( key )
{
if ( json_is_object( key ) )
key = json_object_get( key, "proof-of-work" );
if ( json_is_real( key ) )
net_diff = json_real_value( key );
difficulty = json_real_value( key );
}
key = json_object_get( res, "networkhashps" );
@@ -496,12 +491,13 @@ static bool get_mininginfo( CURL *curl, struct work *work )
net_blocks = json_integer_value( key );
if ( opt_debug )
applog(LOG_INFO,"Mining info: diff %.5g, net_hashrate %f, height %d",
net_diff, net_hashrate, net_blocks );
applog( LOG_INFO,"getmininginfo: difficulty %.5g, networkhashps %.5g, blocks %d", difficulty, net_hashrate, net_blocks );
if ( !work->height )
{
// complete missing data from getwork
if ( opt_debug )
applog( LOG_DEBUG, "work height set by getmininginfo" );
work->height = (uint32_t) net_blocks + 1;
if ( work->height > g_work.height )
restart_threads();
@@ -533,9 +529,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
json_t *tmp, *txa;
bool rc = false;
int i, n;
// Segwit BEGIN
bool segwit = false;
tmp = json_object_get( val, "rules" );
if ( tmp && json_is_array( tmp ) )
{
@@ -553,8 +548,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
}
}
}
// Segwit END
tmp = json_object_get( val, "mutable" );
if ( tmp && json_is_array( tmp ) )
{
@@ -636,7 +630,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
goto out;
}
}
/* find count and size of transactions */
txa = json_object_get(val, "transactions" );
if ( !txa || !json_is_array( txa ) )
@@ -711,12 +705,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
cbtx[41] = cbtx_size - 42; /* scriptsig length */
le32enc( (uint32_t *)( cbtx+cbtx_size ), 0xffffffff ); /* sequence */
cbtx_size += 4;
// Segwit BEGIN
//cbtx[cbtx_size++] = 1; /* out-counter */
cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
// Segwit END
cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
le32enc( (uint32_t *)( cbtx+cbtx_size) , (uint32_t)cbvalue ); /* value */
le32enc( (uint32_t *)( cbtx+cbtx_size+4 ), cbvalue >> 32 );
cbtx_size += 8;
@@ -724,7 +713,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
memcpy( cbtx+cbtx_size, pk_script, pk_script_size );
cbtx_size += (int) pk_script_size;
// Segwit BEGIN
if ( segwit )
{
unsigned char (*wtree)[32] = calloc(tx_count + 2, 32);
@@ -759,12 +747,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
for ( i = 0; i < n; i++ )
sha256d( wtree[i], wtree[2*i], 64 );
}
memset( wtree[1], 0, 32 ); /* witness reserved value = 0 */
memset( wtree[1], 0, 32 ); // witness reserved value = 0
sha256d( cbtx+cbtx_size, wtree[0], 64 );
cbtx_size += 32;
free( wtree );
}
// Segwit END
le32enc( (uint32_t *)( cbtx+cbtx_size ), 0 ); /* lock time */
cbtx_size += 4;
@@ -783,10 +770,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
xsig_len += n;
}
else
{
applog( LOG_WARNING,
"Signature does not fit in coinbase, skipping" );
}
}
tmp = json_object_get( val, "coinbaseaux" );
if ( tmp && json_is_object( tmp ) )
@@ -813,8 +798,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
if ( xsig_len )
{
unsigned char *ssig_end = cbtx + 42 + cbtx[41];
int push_len = cbtx[41] + xsig_len < 76 ? 1 :
cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
int push_len = cbtx[41] + xsig_len < 76
? 1 : cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
n = xsig_len + push_len;
memmove( ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41] );
cbtx[41] += n;
@@ -841,7 +826,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
const char *tx_hex = json_string_value( json_object_get( tmp, "data" ) );
const int tx_size = tx_hex ? (int) ( strlen( tx_hex ) / 2 ) : 0;
// Segwit BEGIN
if ( segwit )
{
const char *txid = json_string_value( json_object_get( tmp, "txid" ) );
@@ -854,8 +838,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
}
else
{
// Segwit END
unsigned char *tx = (uchar*) malloc( tx_size );
if ( !tx_hex || !hex2bin( tx, tx_hex, tx_size ) )
{
@@ -865,10 +847,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
}
sha256d( merkle_tree[1 + i], tx, tx_size );
free( tx );
// Segwit BEGIN
}
// Segwit END
if ( !submit_coinbase )
strcat( work->txs, tx_hex );
@@ -886,6 +865,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
sha256d( merkle_tree[i], merkle_tree[2*i], 64 );
}
work->tx_count = tx_count;
/* assemble block header */
algo_gate.build_block_header( work, swab32( version ),
(uint32_t*) prevhash, (uint32_t*) merkle_tree,
@@ -898,10 +879,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
goto out;
}
for ( i = 0; i < 8; i++ )
work->target[7 - i] = be32dec( target + i );
// reverse the bytes in target
casti_m128i( work->target, 0 ) = mm128_bswap_128( casti_m128i( target, 1 ) );
casti_m128i( work->target, 1 ) = mm128_bswap_128( casti_m128i( target, 0 ) );
net_diff = work->targetdiff = hash_to_diff( work->target );
tmp = json_object_get( val, "workid" );
if ( tmp )
{
@@ -1077,12 +1059,11 @@ void report_summary_log( bool force )
timeval_subtract( &et, &now, &start_time );
timeval_subtract( &uptime, &total_hashes_time, &session_start );
double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6;
double share_time = (double)et.tv_sec + (double)et.tv_usec * 1e-6;
double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. );
double target_diff = exp32 * last_targetdiff;
double shrate = safe_div( target_diff * (double)(accepts),
share_time, 0. );
// global_hashrate = ghrate;
double sess_hrate = safe_div( exp32 * norm_diff_sum,
(double)uptime.tv_sec, 0. );
double submit_rate = safe_div( (double)submits * 60., share_time, 0. );
@@ -1103,7 +1084,7 @@ void report_summary_log( bool force )
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
submit_rate, safe_div( (double)submitted_share_count*60.,
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
( (double)uptime.tv_sec + (double)uptime.tv_usec * 1e-6 ), 0. ) );
applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)",
shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );
@@ -1550,7 +1531,6 @@ const char *getwork_req =
#define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]"
// Segwit BEGIN
#define GBT_RULES "[\"segwit\"]"
static const char *gbt_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
@@ -1559,16 +1539,6 @@ const char *gbt_lp_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES ", \"rules\": " GBT_RULES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
/*
static const char *gbt_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES "}], \"id\":0}\r\n";
const char *gbt_lp_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
*/
// Segwit END
static bool get_upstream_work( CURL *curl, struct work *work )
{
json_t *val;
@@ -1643,49 +1613,49 @@ start:
last_block_height = work->height;
last_targetdiff = net_diff;
applog( LOG_BLUE, "New Block %d, Net Diff %.5g, Ntime %08x",
work->height, net_diff,
applog( LOG_BLUE, "New Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
if ( !opt_quiet )
{
double miner_hr = 0.;
double net_hr = net_hashrate;
double nd = net_diff * exp32;
char net_hr_units[4] = {0};
char miner_hr_units[4] = {0};
char net_ttf[32];
char miner_ttf[32];
pthread_mutex_lock( &stats_lock );
for ( int i = 0; i < opt_n_threads; i++ )
miner_hr += thr_hashrates[i];
global_hashrate = miner_hr;
pthread_mutex_unlock( &stats_lock );
if ( net_hr > 0. )
sprintf_et( net_ttf, nd / net_hr );
else
sprintf( net_ttf, "NA" );
if ( miner_hr > 0. )
sprintf_et( miner_ttf, nd / miner_hr );
else
sprintf( miner_ttf, "NA" );
scale_hash_for_display ( &miner_hr, miner_hr_units );
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO,
"Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
miner_hr, miner_hr_units, miner_ttf, net_hr,
net_hr_units, net_ttf );
}
} // work->height > last_block_height
}
else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
applog( LOG_BLUE, "New Work: Block %d, Net Diff %.5g, Ntime %08x",
work->height, net_diff,
work->data[ algo_gate.ntime_index ] );
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
if ( !opt_quiet )
{
double miner_hr = 0.;
double net_hr = net_hashrate;
double nd = net_diff * exp32;
char net_hr_units[4] = {0};
char miner_hr_units[4] = {0};
char net_ttf[32];
char miner_ttf[32];
pthread_mutex_lock( &stats_lock );
for ( int i = 0; i < opt_n_threads; i++ )
miner_hr += thr_hashrates[i];
global_hashrate = miner_hr;
pthread_mutex_unlock( &stats_lock );
if ( net_hr > 0. )
sprintf_et( net_ttf, nd / net_hr );
else
sprintf( net_ttf, "NA" );
if ( miner_hr > 0. )
sprintf_et( miner_ttf, nd / miner_hr );
else
sprintf( miner_ttf, "NA" );
scale_hash_for_display ( &miner_hr, miner_hr_units );
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO,
"Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
miner_hr, miner_hr_units, miner_ttf, net_hr,
net_hr_units, net_ttf );
}
} // rc
return rc;
@@ -1711,36 +1681,36 @@ static void workio_cmd_free(struct workio_cmd *wc)
static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
{
struct work *ret_work;
struct work *work_heap;
int failures = 0;
ret_work = (struct work*) calloc( 1, sizeof(*ret_work) );
if ( !ret_work )
return false;
work_heap = calloc( 1, sizeof(struct work) );
if ( !work_heap ) return false;
/* obtain new work from bitcoin via JSON-RPC */
while ( !get_upstream_work( curl, ret_work ) )
while ( !get_upstream_work( curl, work_heap ) )
{
if ( unlikely( ( opt_retries >= 0 ) && ( ++failures > opt_retries ) ) )
{
applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" );
free( ret_work );
return false;
free( work_heap );
return false;
}
/* pause, then restart work-request loop */
applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
opt_fail_pause );
applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
opt_fail_pause );
sleep( opt_fail_pause );
}
/* send work to requesting thread */
if ( !tq_push(wc->thr->q, ret_work ) )
free( ret_work );
if ( !tq_push(wc->thr->q, work_heap ) )
free( work_heap );
return true;
}
static bool workio_submit_work(struct workio_cmd *wc, CURL *curl)
{
int failures = 0;
@@ -1811,7 +1781,7 @@ static void *workio_thread(void *userdata)
static bool get_work(struct thr_info *thr, struct work *work)
{
struct workio_cmd *wc;
struct work *work_heap;
struct work *work_heap;
if unlikely( opt_benchmark )
{
@@ -1836,17 +1806,16 @@ static bool get_work(struct thr_info *thr, struct work *work)
wc->thr = thr;
/* send work request to workio thread */
if (!tq_push(thr_info[work_thr_id].q, wc))
{
{
workio_cmd_free(wc);
return false;
}
/* wait for response, a unit of work */
work_heap = (struct work*) tq_pop(thr->q, NULL);
if (!work_heap)
return false;
/* copy returned work into storage provided by caller */
memcpy(work, work_heap, sizeof(*work));
free(work_heap);
if ( !work_heap ) return false;
/* copy returned work into storage provided by caller */
memcpy( work, work_heap, sizeof(*work) );
free( work_heap );
return true;
}
@@ -1896,9 +1865,9 @@ static void update_submit_stats( struct work *work, const void *hash )
bool submit_solution( struct work *work, const void *hash,
struct thr_info *thr )
{
// Job went stale during hashing of a valid share.
if ( !opt_quiet && work_restart[ thr->id ].restart )
applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
// Job went stale during hashing of a valid share.
// if ( !opt_quiet && work_restart[ thr->id ].restart )
// applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
work->sharediff = hash_to_diff( hash );
if ( likely( submit_work( thr, work ) ) )
@@ -1916,32 +1885,34 @@ bool submit_solution( struct work *work, const void *hash,
if ( !opt_quiet )
{
if ( have_stratum )
{
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s",
submitted_share_count, work->sharediff, work->height,
work->job_id );
if ( opt_debug && opt_extranonce )
{
unsigned char *xnonce2str = abin2hex( work->xnonce2,
work->xnonce2_len );
applog( LOG_INFO, "Xnonce2 %s", xnonce2str );
free( xnonce2str );
}
}
else
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x",
submitted_share_count, work->sharediff, work->height,
work->data[ algo_gate.ntime_index ] );
}
if ( opt_debug )
{
uint32_t* h = (uint32_t*)hash;
uint32_t* t = (uint32_t*)work->target;
uint32_t* d = (uint32_t*)work->data;
if ( opt_debug )
{
uint32_t* h = (uint32_t*)hash;
uint32_t* t = (uint32_t*)work->target;
uint32_t* d = (uint32_t*)work->data;
unsigned char *xnonce2str = abin2hex( work->xnonce2,
work->xnonce2_len );
applog(LOG_INFO,"Thread %d, Nonce %08x, Xnonce2 %s", thr->id,
work->data[ algo_gate.nonce_index ], xnonce2str );
free( xnonce2str );
applog(LOG_INFO,"Data[0:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
applog(LOG_INFO," : %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19]);
applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]);
applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0]);
applog( LOG_INFO, "Data[ 0: 9]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
applog( LOG_INFO, "Data[10:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19] );
applog( LOG_INFO, "Hash[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0] );
applog( LOG_INFO, "Targ[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0] );
}
}
return true;
}
@@ -1959,15 +1930,15 @@ static bool wanna_mine(int thr_id)
float temp = cpu_temp(0);
if (temp > opt_max_temp)
{
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp);
state = false;
if ( !thr_id && !conditional_state[thr_id] && !opt_quiet )
applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...", temp, opt_max_temp );
state = false;
}
}
if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
{
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
applog(LOG_INFO, "network diff too high, waiting...");
applog(LOG_NOTICE, "network diff too high, waiting...");
state = false;
}
if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate)
@@ -1976,12 +1947,14 @@ static bool wanna_mine(int thr_id)
{
char rate[32];
format_hashrate(opt_max_rate, rate);
applog(LOG_INFO, "network hashrate too high, waiting %s...", rate);
applog(LOG_NOTICE, "network hashrate too high (%s), waiting...", rate);
}
state = false;
}
if (thr_id < MAX_CPUS)
conditional_state[thr_id] = (uint8_t) !state;
if ( conditional_state[thr_id] && state && !thr_id && !opt_quiet )
applog(LOG_NOTICE, "...resuming" );
conditional_state[thr_id] = (uint8_t) !state;
return state;
}
@@ -2015,33 +1988,6 @@ void set_work_data_big_endian( struct work *work )
be32enc( work->data + i, work->data[i] );
}
// calculate net diff from nbits.
double std_calc_network_diff( struct work* work )
{
uint32_t nbits = work->data[ algo_gate.nbits_index ];
uint32_t shift = nbits & 0xff;
uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
/*
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
int nbits_index = algo_gate.nbits_index;
uint32_t nbits = have_longpoll ? work->data[ nbits_index]
: swab32( work->data[ nbits_index ] );
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
*/
int m;
long double d = (long double)0x0000ffff / (long double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
for ( m = 29; m < shift; m++ )
d /= 256.0;
if ( opt_debug_diff )
applog(LOG_DEBUG, "net diff: %8f -> shift %u, bits %08x", (double)d, shift, bits);
return (double)d;
}
void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr )
{
@@ -2065,17 +2011,6 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
++(*nonceptr);
}
bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && !work->data[0] && !opt_benchmark )
{
sleep(1);
return false;
}
return true;
}
static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
{
bool new_job;
@@ -2092,7 +2027,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size );
memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size );
algo_gate.build_extraheader( g_work, sctx );
net_diff = algo_gate.calc_network_diff( g_work );
net_diff = nbits_to_diff( g_work->data[ algo_gate.nbits_index ] );
algo_gate.set_work_data_endian( g_work );
g_work->height = sctx->block_height;
g_work->targetdiff = sctx->job.diff
@@ -2121,14 +2056,17 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
pthread_mutex_unlock( &stats_lock );
if ( stratum_diff != sctx->job.diff )
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s",
sctx->job.diff, sctx->block_height, g_work->job_id );
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Tx %d, Job %s",
sctx->job.diff, sctx->block_height,
sctx->job.merkle_count, g_work->job_id );
else if ( last_block_height != sctx->block_height )
applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s",
sctx->block_height, net_diff, g_work->job_id );
applog( LOG_BLUE, "New Block %d, Tx %d, Netdiff %.5g, Job %s",
sctx->block_height, sctx->job.merkle_count,
net_diff, g_work->job_id );
else if ( g_work->job_id && new_job )
applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
sctx->block_height, net_diff, g_work->job_id );
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Netdiff %.5g, Job %s",
sctx->block_height, sctx->job.merkle_count,
net_diff, g_work->job_id );
else if ( !opt_quiet )
{
unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
@@ -2142,8 +2080,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
if ( ( stratum_diff != sctx->job.diff )
|| ( last_block_height != sctx->block_height ) )
{
static bool multipool = false;
if ( stratum.block_height < last_block_height ) multipool = true;
if ( unlikely( !session_first_block ) )
session_first_block = stratum.block_height;
last_block_height = stratum.block_height;
@@ -2151,58 +2087,47 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
last_targetdiff = g_work->targetdiff;
if ( lowest_share < last_targetdiff )
lowest_share = 9e99;
}
if ( !opt_quiet )
{
applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
net_diff, stratum_diff, g_work->targetdiff );
if ( !opt_quiet )
{
applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
net_diff, stratum_diff, g_work->targetdiff );
if ( likely( hr > 0. ) )
{
double nd = net_diff * exp32;
char hr_units[4] = {0};
char block_ttf[32];
char share_ttf[32];
if ( likely( hr > 0. ) )
{
double nd = net_diff * exp32;
char hr_units[4] = {0};
char block_ttf[32];
char share_ttf[32];
static bool multipool = false;
if ( stratum.block_height < last_block_height ) multipool = true;
sprintf_et( block_ttf, nd / hr );
sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
scale_hash_for_display ( &hr, hr_units );
applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
hr, hr_units, block_ttf, share_ttf );
sprintf_et( block_ttf, nd / hr );
sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
scale_hash_for_display ( &hr, hr_units );
applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
hr, hr_units, block_ttf, share_ttf );
if ( !multipool && last_block_height > session_first_block )
{
struct timeval now, et;
gettimeofday( &now, NULL );
timeval_subtract( &et, &now, &session_start );
uint64_t net_ttf =
( last_block_height - session_first_block ) == 0 ? 0
: et.tv_sec / ( last_block_height - session_first_block );
if ( net_diff > 0. && net_ttf )
{
double net_hr = nd / net_ttf;
char net_hr_units[4] = {0};
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
net_hr, net_hr_units );
}
}
} // hr > 0
} // !quiet
} // new diff/block
/*
if ( new_job && !( opt_quiet || stratum_errors ) )
{
int mismatch = submitted_share_count - ( accepted_share_count
+ stale_share_count
+ rejected_share_count );
if ( mismatch )
applog( LOG_INFO,
CL_LBL "%d Submitted share pending, maybe stale" CL_N,
submitted_share_count );
}
*/
if ( !multipool && last_block_height > session_first_block )
{
struct timeval now, et;
gettimeofday( &now, NULL );
timeval_subtract( &et, &now, &session_start );
uint64_t net_ttf = safe_div( et.tv_sec,
last_block_height - session_first_block, 0 );
if ( net_diff > 0. && net_ttf )
{
double net_hr = safe_div( nd, net_ttf, 0. );
char net_hr_units[4] = {0};
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
net_hr, net_hr_units );
}
}
} // hr > 0
} // !quiet
}
static void *miner_thread( void *userdata )
@@ -2340,9 +2265,14 @@ static void *miner_thread( void *userdata )
} // do_this_thread
algo_gate.resync_threads( thr_id, &work );
if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
// conditional mining
if ( unlikely( !wanna_mine( thr_id ) ) )
{
restart_threads();
sleep(5);
continue;
}
// opt_scantime expressed in hashes
max64 = opt_scantime * thr_hashrates[thr_id];
@@ -2489,14 +2419,6 @@ static void *miner_thread( void *userdata )
}
}
} // benchmark
// conditional mining
if ( unlikely( !wanna_mine( thr_id ) ) )
{
sleep(5);
continue;
}
} // miner_thread loop
out:
@@ -3671,7 +3593,7 @@ int main(int argc, char *argv[])
#if defined(WIN32)
// Are Windows CPU Groups supported?
// Get the number of cpus, display after parsing command line
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
num_cpus = 0;
num_cpugroups = GetActiveProcessorGroupCount();
@@ -3680,8 +3602,8 @@ int main(int argc, char *argv[])
int cpus = GetActiveProcessorCount( i );
num_cpus += cpus;
if (opt_debug)
applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
// if (opt_debug)
// applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
}
#else
@@ -3698,7 +3620,7 @@ int main(int argc, char *argv[])
sysctl(req, 2, &num_cpus, &len, NULL, 0);
#else
num_cpus = 1;
#endif
#endif
if ( num_cpus < 1 )
num_cpus = 1;
@@ -3722,7 +3644,6 @@ int main(int argc, char *argv[])
if ( opt_time_limit )
time_limit_stop = (unsigned int)time(NULL) + opt_time_limit;
// need to register to get algo optimizations for cpu capabilities
// but that causes registration logs before cpu capabilities is output.
// Would need to split register function into 2 parts. First part sets algo
@@ -3850,20 +3771,30 @@ int main(int argc, char *argv[])
}
#endif
if ( opt_affinity && num_cpus > max_cpus )
{
applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
max_cpus );
opt_affinity = 0ULL;
}
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#endif
const int map_size = opt_n_threads < num_cpus ? num_cpus : opt_n_threads;
thread_affinity_map = malloc( map_size * (sizeof (int)) );
if ( !thread_affinity_map )
{
applog( LOG_ERR, "CPU Affinity disabled, memory allocation failed" );
opt_affinity = 0ULL;
}
if ( opt_affinity )
{
for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
int active_cpus = 0; // total CPUs available using rolling affinity mask
for ( int thr = 0, cpu = 0; thr < map_size; thr++, cpu++ )
{
while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++;
while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;
thread_affinity_map[ thr ] = cpu % num_cpus;
if ( cpu < num_cpus ) active_cpus++;
}
if ( opt_n_threads > active_cpus )
applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
if ( !opt_quiet )
{
char affinity_mask[64];

43
miner.h
View File

@@ -24,6 +24,11 @@
#endif /* _MSC_VER */
// Prevent questions from ARM users who don't read the requirements.
#if !defined(__x86_64__)
#error "CPU architecture not supported. Consult the requirements for supported CPUs."
#endif
#include <stdbool.h>
#include <inttypes.h>
#include <sys/time.h>
@@ -91,6 +96,19 @@ enum {
LOG_PINK = 0x14 };
#endif
#define WORK_ALIGNMENT 64
// Used when working with dynamically allocated memory to guarantee data
// alignment for large vectors. The physical block must be extended by the
// alignment number of bytes when allocated. free() should use the physical
// pointer returned by malloc(), not the aligned pointer. All other code should
// use the logical, aligned pointer returned by this function.
static inline void *align_ptr( const void *ptr, const uint64_t alignment )
{
const uint64_t mask = alignment - 1;
return (void*)( ( ((const uint64_t)ptr) + mask ) & (~mask) );
}
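A minimal usage sketch of the pattern the comment describes: over-allocate by the alignment, hand out the aligned pointer, and keep the raw pointer for free(). The helper name aligned_work_alloc and its raw out-parameter are hypothetical, not part of the patch.
#include <stdlib.h>
static inline void *aligned_work_alloc( const size_t size, void **raw )
{
   // Physical block is extended by the alignment; the caller frees *raw, not
   // the returned logical, aligned pointer.
   *raw = malloc( size + WORK_ALIGNMENT );
   return *raw ? align_ptr( *raw, WORK_ALIGNMENT ) : NULL;
}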
extern bool is_power_of_2( int n );
static inline bool is_windows(void)
@@ -317,7 +335,7 @@ extern void cbin2hex(char *out, const char *in, size_t len);
void bin2hex( char *s, const unsigned char *p, size_t len );
char *abin2hex( const unsigned char *p, size_t len );
char *bebin2hex( const unsigned char *p, size_t len );
bool hex2bin( unsigned char *p, const char *hexstr, size_t len );
bool hex2bin( unsigned char *p, const char *hexstr, const size_t len );
bool jobj_binary( const json_t *obj, const char *key, void *buf,
size_t buflen );
int varint_encode( unsigned char *p, uint64_t n );
@@ -333,10 +351,7 @@ extern void memrev(unsigned char *p, size_t len);
// number of hashes.
//
// https://en.bitcoin.it/wiki/Difficulty
//
// hash = diff * 2**32
//
// diff_to_hash = 2**32 = 0x100000000 = 4294967296 = exp32;
#define EXP16 65536.
#define EXP32 4294967296.
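A worked sketch of the relationship noted above (hash = diff * 2**32): the expected work behind a share at a given difficulty, which is also how shrate is derived from accepted shares in report_summary_log(). The helper name is hypothetical.
// e.g. diff 1.0 -> ~4.29e9 hashes on average; diff 0.25 -> ~1.07e9.
static inline double expected_hashes_per_share( const double diff )
{
   return diff * EXP32;
}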
@@ -350,8 +365,9 @@ extern const long double exp160; // 2**160
bool fulltest( const uint32_t *hash, const uint32_t *target );
bool valid_hash( const void*, const void* );
double hash_to_diff( const void* );
extern double hash_to_diff( const void* );
extern void diff_to_hash( uint32_t*, const double );
extern double nbits_to_diff( uint32_t );
double hash_target_ratio( uint32_t* hash, uint32_t* target );
void work_set_target_ratio( struct work* work, const void *hash );
@@ -399,13 +415,14 @@ struct work
double stratum_diff;
int height;
char *txs;
char *workid;
int tx_count;
char *workid;
char *job_id;
size_t xnonce2_len;
unsigned char *xnonce2;
bool sapling;
bool stale;
} __attribute__ ((aligned (64)));
} __attribute__ ((aligned (WORK_ALIGNMENT)));
struct stratum_job
{
@@ -416,7 +433,8 @@ struct stratum_job
unsigned char *coinbase;
unsigned char *xnonce2;
int merkle_count;
unsigned char **merkle;
int merkle_buf_size;
unsigned char **merkle;
unsigned char version[4];
unsigned char nbits[4];
unsigned char ntime[4];
@@ -540,7 +558,6 @@ enum algos {
ALGO_BMW,
ALGO_BMW512,
ALGO_C11,
ALGO_DECRED,
ALGO_DEEP,
ALGO_DMD_GR,
ALGO_GROESTL,
@@ -572,9 +589,11 @@ enum algos {
ALGO_QUBIT,
ALGO_SCRYPT,
ALGO_SHA256D,
ALGO_SHA256DT,
ALGO_SHA256Q,
ALGO_SHA256T,
ALGO_SHA3D,
ALGO_SHA512256D,
ALGO_SHAVITE3,
ALGO_SKEIN,
ALGO_SKEIN2,
@@ -634,7 +653,6 @@ static const char* const algo_names[] = {
"bmw",
"bmw512",
"c11",
"decred",
"deep",
"dmd-gr",
"groestl",
@@ -666,9 +684,11 @@ static const char* const algo_names[] = {
"qubit",
"scrypt",
"sha256d",
"sha256dt",
"sha256q",
"sha256t",
"sha3d",
"sha512256d",
"shavite3",
"skein",
"skein2",
@@ -795,7 +815,6 @@ Options:\n\
bmw BMW 256\n\
bmw512 BMW 512\n\
c11 Chaincoin\n\
decred Blake256r14dcr\n\
deep Deepcoin (DCN)\n\
dmd-gr Diamond\n\
groestl Groestl coin\n\
@@ -829,9 +848,11 @@ Options:\n\
scrypt:N scrypt(N, 1, 1)\n\
scryptn2 scrypt(1048576, 1,1)\n\
sha256d Double SHA-256\n\
sha256dt Modified sha256d (Novo)\n\
sha256q Quad SHA-256, Pyrite (PYE)\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
sha3d Double Keccak256 (BSHA3)\n\
sha512256d Double SHA-512 (Radiant)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\

File diff suppressed because it is too large


@@ -54,7 +54,7 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
#else
asm( "movq %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
return a;
return a;
}
static inline __m128i mm128_mov32_128( const uint32_t n )
@@ -65,7 +65,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#else
asm( "movd %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
return a;
return a;
}
// Inconsistent naming, prefix should reflect return value:
@@ -79,7 +79,7 @@ static inline uint64_t u64_mov128_64( const __m128i a )
#else
asm( "movq %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
return n;
return n;
}
static inline uint32_t u32_mov128_32( const __m128i a )
@@ -90,13 +90,18 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#else
asm( "movd %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
return n;
return n;
}
// Equivalent of set1, broadcast integer to all elements.
#define m128_const_i128( i ) mm128_mov64_128( i )
#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
// Emulate broadcast & insert instructions not available in SSE2
#define mm128_bcast_i64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define mm128_bcast_i32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
#define m128_const_i128( i ) mm128_mov64_128( i )
// deprecated
#define m128_const1_64 mm128_bcast_i64
#define m128_const1_32 mm128_bcast_i32
#if defined(__SSE4_1__)
@@ -104,7 +109,7 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#define m128_const_64( hi, lo ) \
_mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 )
#else // No insert in SSE2
#else
#define m128_const_64 _mm_set_epi64x
@@ -114,12 +119,10 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
#define m128_one_64 _mm_shuffle_epi32( mm128_mov64_128( 1 ), 0x44 )
#define m128_one_32 _mm_shuffle_epi32( mm128_mov32_128( 1 ), 0x00 )
#define m128_one_16 _mm_shuffle_epi32( \
mm128_mov32_128( 0x00010001 ), 0x00 )
#define m128_one_8 _mm_shuffle_epi32( \
mm128_mov32_128( 0x01010101 ), 0x00 )
#define m128_one_64 mm128_bcast_i64( 1 )
#define m128_one_32 mm128_bcast_i32( 1 )
#define m128_one_16 mm128_bcast_i32( 0x00010001 )
#define m128_one_8 mm128_bcast_i32( 0x01010101 )
// ASM avoids the need to initialize return variable to avoid compiler warning.
// Macro abstracts function parentheses to look like an identifier.
@@ -149,7 +152,7 @@ static inline __m128i mm128_neg1_fn()
// sizing. It's unique.
//
// It can:
// - zero 32 bit elements of a 128 bit vector.
// - zero any number of 32 bit elements of a 128 bit vector.
// - extract any 32 bit element from one 128 bit vector and insert the
// data to any 32 bit element of another 128 bit vector, or the same vector.
// - do both simultaneously.
@@ -162,14 +165,21 @@ static inline __m128i mm128_neg1_fn()
// c[5:4] destination element selector
// c[7:6] source element selector
// Convert type and abbreviate name: e"x"tract "i"nsert "m"ask
// Convert type and abbreviate name: eXtract Insert Mask = XIM
#define mm128_xim_32( v1, v2, c ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) )
// Some examples of simple operations:
/* Another way to do it with individual arguments.
#define mm128_xim_32( v1, i1, v2, i2, mask ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), \
(mask) | ((i1)<<4) | ((i2)<<6) ) )
*/
// Insert 32 bit integer into v at element c and return modified v.
// Examples of simple operations using xim:
// Insert 32 bit integer into v at element c and return updated v.
static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
const int c )
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
@@ -178,13 +188,12 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
static inline uint32_t mm128_extract_32( const __m128i v, const int c )
{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
// Zero 32 bit elements when bit in mask is set.
static inline __m128i mm128_mask_32( const __m128i v, const int m )
{ return mm128_xim_32( v, v, m ); }
// Move element i2 of v2 to element i1 of v1. For reference and convenience,
// it's faster to precalculate the index.
#define mm128_shuflmov_32( v1, i1, v2, i2 ) \
// Move element i2 of v2 to element i1 of v1 and return updated v1.
#define mm128_mov32_32( v1, i1, v2, i2 ) \
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
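For illustration, a hedged sketch composing the control byte by hand from the field layout above; the wrapper name and inputs are hypothetical.
// Sketch (SSE4.1): copy element 2 of v2 into element 1 of v1 and zero
// element 0 of the result. Control byte: bits 7:6 = source element (2),
// bits 5:4 = destination element (1), bits 3:0 = zero mask (0b0001).
static inline __m128i xim_example( const __m128i v1, const __m128i v2 )
{  return mm128_xim_32( v1, v2, (2<<6) | (1<<4) | 0x1 );  }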
#endif // SSE4_1
@@ -204,11 +213,12 @@ static inline __m128i mm128_not( const __m128i v )
#endif
/*
// Unary negation of elements (-v)
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
#define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v )
*/
// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
@@ -264,26 +274,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#if defined(__AVX512VL__)
// a ^ b ^ c
#define mm128_xor3( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x96 )
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x78 )
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
#else
#define mm128_xor3( a, b, c ) \
_mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_xorand( a, b, c ) \
_mm_xor_si128( a, _mm_and_si128( b, c ) )
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#endif
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm_movmask_64( v ) \
@@ -292,64 +298,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm_movmask_32( v ) \
_mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) )
// Diagonal blend
// Blend 4 32 bit elements from 4 vectors
#if defined (__AVX2__)
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \
_mm_blend_epi32( s1, s0, 0x1 ), 0x3 )
#elif defined(__SSE4_1__)
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \
_mm_blend_epi16( s1, s0, 0x03 ), 0x0f )
#endif
/*
//
// Extended bit shift for concatenated packed elements from 2 vectors.
// Shift right returns low half, shift left return high half.
#if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
#define mm128_shl2_64( v1, v2, c ) _mm_shldi_epi64( v1, v2, c )
#define mm128_shr2_64( v1, v2, c ) _mm_shrdi_epi64( v1, v2, c )
#define mm128_shl2_32( v1, v2, c ) _mm_shldi_epi32( v1, v2, c )
#define mm128_shr2_32( v1, v2, c ) _mm_shrdi_epi32( v1, v2, c )
#define mm128_shl2_16( v1, v2, c ) _mm_shldi_epi16( v1, v2, c )
#define mm128_shr2_16( v1, v2, c ) _mm_shrdi_epi16( v1, v2, c )
#else
#define mm128_shl2_64( v1, v2, c ) \
_mm_or_si128( _mm_slli_epi64( v1, c ), _mm_srli_epi64( v2, 64 - (c) ) )
#define mm128_shr2_64( v1, v2, c ) \
_mm_or_si128( _mm_srli_epi64( v2, c ), _mm_slli_epi64( v1, 64 - (c) ) )
#define mm128_shl2_32( v1, v2, c ) \
_mm_or_si128( _mm_slli_epi32( v1, c ), _mm_srli_epi32( v2, 32 - (c) ) )
#define mm128_shr2_32( v1, v2, c ) \
_mm_or_si128( _mm_srli_epi32( v2, c ), _mm_slli_epi32( v1, 32 - (c) ) )
#define mm128_shl2_16( v1, v2, c ) \
_mm_or_si128( _mm_slli_epi16( v1, c ), _mm_srli_epi16( v2, 16 - (c) ) )
#define mm128_shr2_16( v1, v2, c ) \
_mm_or_si128( _mm_srli_epi16( v2, c ), _mm_slli_epi16( v1, 16 - (c) ) )
#endif
*/
//
// Bit rotations
@@ -446,6 +394,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
// Cross lane shuffles
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from v1, and the high half from v2.
@@ -457,12 +406,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );
//
// Rotate vector elements across all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
@@ -475,13 +423,11 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#endif
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
// (unlikely but faster), or when SSSE3 is not available (slower).
// Rotate 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) \
@@ -499,6 +445,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif
// Rotate 32 bit lanes
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_swap32_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -506,8 +454,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) \
@@ -522,6 +470,10 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#if defined(__SSSE3__)
#define mm128_bswap_128( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0001020304050607, \
0x08090a0b0c0d0e0f ) )
#define mm128_bswap_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
@@ -583,6 +535,9 @@ static inline __m128i mm128_bswap_16( __m128i v )
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
#define mm128_bswap_128( v ) \
mm128_swap_64( mm128_bswap_64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
{
d[0] = mm128_bswap_64( s[0] );
@@ -617,67 +572,23 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v1 = _mm_xor_si128( v1, v2 );
// alignr for 32 & 64 bit elements is only available with AVX512 but
// emulated here. Shift argument is not needed, it's always 1.
// Behaviour is otherwise consistent with Intel alignr intrinsics.
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
#define mm128_alignr_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
#define mm128_alignr_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#else
#define mm128_alignr_64( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) )
#define mm128_alignr_64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
#define mm128_alignr_32( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 4 ) )
#define mm128_alignr_32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
#endif
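A hedged sketch of the alignr semantics with 64 bit elements; the wrapper name is hypothetical and elements are listed high to low.
// Sketch: with hi = { h1, h0 } and lo = { l1, l0 } (64 bit lanes, listed
// high to low), concatenating hi:lo and shifting right by one element
// yields { h0, l1 }.
static inline __m128i alignr64_example( const __m128i hi, const __m128i lo )
{  return mm128_alignr_64( hi, lo, 1 );  }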
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// vrol & vror are deprecated and do not exist for larger vectors.
// Their only use is by lyra2 blake2b when AVX2 is not available; they are
// grandfathered.
#if defined(__SSSE3__)
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
v1 = t; \
} while(0)
#else // SSE2
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
_mm_slli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
_mm_srli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#endif // SSE4.1 else SSE2
#endif // __SSE2__
#endif // SIMD_128_H__


@@ -15,6 +15,8 @@
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary.
// Full width byte shuffle is available with AVX512VL using the mask version
// with a full mask (-1).
// Instructions that can move data across 128 bit lane boundary incur a
// performance penalty over those that can't.
// Some usage of index vectors may be encoded as if full vector shuffles are
@@ -65,36 +67,34 @@ typedef union
#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) )
#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )
// deprecated
//#define mm256_mov256_64 u64_mov256_64
//#define mm256_mov256_32 u32_mov256_32
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
#define mm256_bcast_m128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
#define mm256_bcast_i128( i ) mm256_bcast_m128( mm128_mov64_128( i ) )
#define mm256_bcast_i64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define mm256_bcast_i32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define mm256_bcast_i16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define mm256_bcast_i8( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
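A quick usage sketch of the new broadcast naming; the constant is arbitrary and the helper name is hypothetical.
// Both lines fill a __m256i with the same 64 bit value in every lane:
// mm256_bcast_i64 is the macro defined above, _mm256_set1_epi64x the
// equivalent Intel intrinsic.
static inline void bcast_example( void )
{
   __m256i a = mm256_bcast_i64( 0x0123456789abcdefULL );
   __m256i b = _mm256_set1_epi64x( 0x0123456789abcdefULL );
   (void)a; (void)b;
}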
// Equivalent of set, move 64 bit integer constants to respective 64 bit
// elements.
static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
{
union { __m256i m256i;
uint64_t u64[4]; } v;
union { __m256i m256i; uint64_t u64[4]; } v;
v.u64[0] = i0; v.u64[1] = i1; v.u64[2] = i2; v.u64[3] = i3;
return v.m256i;
}
// Equivalent of set1.
// 128 bit vector argument
#define m256_const1_128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
// 64 bit integer argument zero extended to 128 bits.
#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) )
#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define m256_const1_8 ( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
// Deprecated
#define m256_const1_128 mm256_bcast_m128
#define m256_const1_i128 mm256_bcast_i128
#define m256_const1_64 mm256_bcast_i64
#define m256_const1_32 mm256_bcast_i32
#define m256_const2_64( i1, i0 ) \
m256_const1_128( m128_const_64( i1, i0 ) )
@@ -103,13 +103,13 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 m256_const1_i128( 1 )
#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) )
#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) )
#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) )
#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 mm256_bcast_i128( 1 )
#define m256_one_64 mm256_bcast_i64( 1 )
#define m256_one_32 mm256_bcast_i32( 1 )
#define m256_one_16 mm256_bcast_i16( 1 )
#define m256_one_8 mm256_bcast_i8 ( 1 )
static inline __m256i mm256_neg1_fn()
{
@@ -120,8 +120,8 @@ static inline __m256i mm256_neg1_fn()
#define m256_neg1 mm256_neg1_fn()
// Consistent naming for similar operations.
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
//
// Memory functions
@@ -151,10 +151,12 @@ static inline __m256i mm256_not( const __m256i v )
#endif
/*
// Unary negation of each element ( -v )
#define mm256_negate_64( v ) _mm256_sub_epi64( m256_zero, v )
#define mm256_negate_32( v ) _mm256_sub_epi32( m256_zero, v )
#define mm256_negate_16( v ) _mm256_sub_epi16( m256_zero, v )
*/
// Add 4 values, fewer dependencies than sequential addition.
@@ -176,44 +178,34 @@ static inline __m256i mm256_not( const __m256i v )
// AVX512 has ternary logic that supports any 3 input boolean expression.
// a ^ b ^ c
#define mm256_xor3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x96 )
#define mm256_xor3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x96 )
// legacy convenience only
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( a, mm256_xor3( b, c, d ) )
#define mm256_xor4( a, b, c, d ) _mm256_xor_si256( a, mm256_xor3( b, c, d ) )
// a & b & c
#define mm256_and3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x80 )
#define mm256_and3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm256_or3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xfe )
#define mm256_or3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm256_xorand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x78 )
#define mm256_xorand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm256_andxor( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x60 )
#define mm256_andxor( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm256_xoror( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x1e )
#define mm256_xoror( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c )
#define mm256_xorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xd2 )
#define mm256_xorandnot( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm256_orand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xf8 )
#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) \
_mm256_ternarylogic_epi64( a, b, b, 0x81 )
#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#else
@@ -251,7 +243,7 @@ static inline __m256i mm256_not( const __m256i v )
// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Returns 4 or 8 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm256_movmask_64( v ) \
@@ -260,76 +252,6 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_movmask_32( v ) \
_mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) )
// Diagonal blending
// Blend 4 64 bit elements from 4 vectors
#define mm256_diagonal_64( v3, v2, v1, v0 ) \
mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \
_mm256_blend_epi32( v1, v0, 0x03 ), 0x0f )
// Blend 8 32 bit elements from 8 vectors
#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \
_mm256_blend_epi32( \
_mm256_blend_epi32( \
_mm256_blend_epi32( v7, v6, 0x40 ), \
_mm256_blend_epi32( v5, v4, 0x10 ) 0x30 ), \
_mm256_blend_epi32( \
_mm256_blend_epi32( v3, v2, 0x04) \
_mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f )
// Blend 4 32 bit elements from each 128 bit lane.
#define mm256_diagonal128_32( v3, v2, v1, v0 ) \
_mm256_blend_epi32( \
_mm256_blend_epi32( v3, v2, 0x44) \
_mm256_blend_epi32( v1, v0, 0x11 ) )
/*
//
// Extended bit shift for concatenated packed elements from 2 vectors.
// Shift right returns low half, shift left return high half.
#if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
#define mm256_shl2_64( v1, v2, c ) _mm256_shldi_epi64( v1, v2, c )
#define mm256_shr2_64( v1, v2, c ) _mm256_shrdi_epi64( v1, v2, c )
#define mm256_shl2_32( v1, v2, c ) _mm256_shldi_epi32( v1, v2, c )
#define mm256_shr2_32( v1, v2, c ) _mm256_shrdi_epi32( v1, v2, c )
#define mm256_shl2_16( v1, v2, c ) _mm256_shldi_epi16( v1, v2, c )
#define mm256_shr2_16( v1, v2, c ) _mm256_shrdi_epi16( v1, v2, c )
#else
#define mm256_shl2i_64( v1, v2, c ) \
_mm256_or_si256( _mm256_slli_epi64( v1, c ), \
_mm256_srli_epi64( v2, 64 - (c) ) )
#define mm512_shr2_64( v1, v2, c ) \
_mm256_or_si256( _mm256_srli_epi64( v2, c ), \
_mm256_slli_epi64( v1, 64 - (c) ) )
#define mm256_shl2_32( v1, v2, c ) \
_mm256_or_si256( _mm256_slli_epi32( v1, c ), \
_mm256_srli_epi32( v2, 32 - (c) ) )
#define mm256_shr2_32( v1, v2, c ) \
_mm256_or_si256( _mm256_srli_epi32( v2, c ), \
_mm256_slli_epi32( v1, 32 - (c) ) )
#define mm256_shl2_16( v1, v2, c ) \
_mm256_or_si256( _mm256_slli_epi16( v1, c ), \
_mm256_srli_epi16( v2, 16 - (c) ) )
#define mm256_shr2_16( v1, v2, c ) \
_mm256_or_si256( _mm256_srli_epi16( v2, c ), \
_mm256_slli_epi16( v1, 16 - (c) ) )
#endif
*/
//
// Bit rotations.
//
@@ -435,19 +357,33 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
//
// Cross lane shuffles
//
// Rotate elements across all lanes.
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
/* Not used
// Rotate 256 bit vector by one 32 bit element.
#if defined(__AVX512VL__)
static inline __m256i mm256_shuflr_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); }
static inline __m256i mm256_shufll_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 15 ); }
#else
#define mm256_shuflr_32( v ) \
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000000000007, 0x0000000600000005, \
@@ -458,6 +394,9 @@ static inline __m256i mm256_not( const __m256i v )
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ) )
#endif
*/
//
// Rotate elements within each 128 bit lane of 256 bit vector.
@@ -480,20 +419,17 @@ static inline __m256i mm256_not( const __m256i v )
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit
// rotations for multiples of 8 bits. Uses faster ror/rol instructions when
// AVX512 is available.
// 64 bit lanes
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
#else
#define mm256_shuflr64_24( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403, \
_mm256_shuffle_epi8( v, m256_const2_64( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#endif
@@ -501,21 +437,21 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
#define mm256_shuflr64_16( v ) _mm256_ror_epi64( v, 16 )
#else
#define mm256_shuflr64_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302, \
_mm256_shuffle_epi8( v, m256_const2_64( \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#endif
// 32 bit lanes
#if defined(__AVX512VL__)
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
#else
#define mm256_swap32_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302, \
_mm256_shuffle_epi8( v, m256_const2_64( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#endif
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16
#if defined(__AVX512VL__)
#define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 )
@@ -526,35 +462,24 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#endif
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports a full vector 8 bit shuffle. The AVX512VL +
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask can be used if
// a shuffle that crosses 128 bit lanes is needed. BSWAP doesn't need to cross
// lanes, therefore the AVX2 version works here. The bswap control vector is
// coded to work with both versions; bit 4 is ignored in AVX2.
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
#define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
#define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1e1f1c1d1a1b1819, 0x1617141512131011, \
0x0e0f0c0d0a0b0809, 0x0607040502030001, ) )
m256_const2_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )
// Source and destination are pointers, may point to same memory.
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes
#define mm256_block_bswap_64( d, s ) do \
{ \
__m256i ctl = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
__m256i ctl = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -568,8 +493,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes
#define mm256_block_bswap_32( d, s ) do \
{ \
__m256i ctl = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
__m256i ctl = m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \


@@ -113,7 +113,17 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_concat_256( hi, lo ) \
_mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )
#define m512_const_128( v3, v2, v1, v0 ) \
// Work in progress.
// modified naming scheme to align more with opcode mnemonics:
// m512_const1 becomes mm512_bcast_m[n] or mm512_bcast_i[n], short for
// broadcast, i indicates integer arg, m is vector. Set1 intrinsics should
// generally be used for integer data.
// mm512_const should only be used with immediate integer arguments, use
// _mm512_set intrinsic instead.
// mm512_set, mm512_set[n] macros may be defined when no intrinsic exists
// for either the arg size or arg count.
#define mm512_set_128( v3, v2, v1, v0 ) \
mm512_concat_256( mm256_concat_128( v3, v2 ), \
mm256_concat_128( v1, v0 ) )
@@ -133,29 +143,35 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
return v.m512i;
}
// Broadcast with vector argument is generally more efficient except for
// integer immediate constants or when data was most recently referenced as
// integer and is still available in an integer register.
/* not used
// Equivalent of set1, broadcast lo element to all elements.
static inline __m512i m512_const1_256( const __m256i v )
{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }
*/
#define m512_const1_128( v ) \
mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
// Integer input argument up to 64 bits
#define m512_const1_i128( i ) \
mm512_perm_128( _mm512_castsi128_si512( mm128_mov64_128( i ) ), 0 )
#define mm512_bcast_m128( v ) mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
// Low 64 bits only, high 64 bits are zeroed.
#define mm512_bcast_i128( i ) mm512_bcast_m128( mm128_mov64_128( i ) )
#define mm512_bcast_i64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
#define mm512_bcast_i32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
#define mm512_bcast_i16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define mm512_bcast_i8( i ) _mm512_broadcastb_epi8( mm128_mov32_128( i ) )
//#define m512_const1_256( v ) _mm512_broadcast_i64x4( v )
//#define m512_const1_128( v ) _mm512_broadcast_i64x2( v )
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
// const1 is deprecated, use bcast instead
#define m512_const1_128 mm512_bcast_m128
#define m512_const1_i128 mm512_bcast_i128
#define m512_const1_64 mm512_bcast_i64
#define m512_const1_32 mm512_bcast_i32
#define m512_const2_128( v1, v0 ) \
m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) )
_mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 )
#define m512_const2_64( i1, i0 ) \
m512_const1_128( m128_const_64( i1, i0 ) )
mm512_bcast_m128( m128_const_64( i1, i0 ) )
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
@@ -179,14 +195,20 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
#define m512_zero _mm512_setzero_si512()
#define m512_one_512 mm512_mov64_512( 1 )
#define m512_one_256 _mm512_inserti64x4( m512_one_512, m256_one_256, 1 )
#define m512_one_128 m512_const1_i128( 1 )
#define m512_one_64 m512_const1_64( 1 )
#define m512_one_32 m512_const1_32( 1 )
#define m512_one_16 m512_const1_16( 1 )
#define m512_one_8 m512_const1_8( 1 )
#define m512_one_128 mm512_bcast_i128( (__uint128_t)1 )
#define m512_one_64 mm512_bcast_i64( (uint64_t)1 )
#define m512_one_32 mm512_bcast_i32( (uint32_t)1 )
#define m512_one_16 mm512_bcast_i16( (uint16_t)1 )
#define m512_one_8 mm512_bcast_i8( (uint8_t)1 )
//#define m512_neg1 m512_const1_64( 0xffffffffffffffff )
#define m512_neg1 _mm512_movm_epi64( 0xff )
// use asm to avoid compiler warning for uninitialized local
static inline __m512i mm512_neg1_fn()
{
__m512i a;
asm( "vpternlogq $0xff, %0, %0, %0\n\t" : "=x"(a) );
return a;
}
#define m512_neg1 mm512_neg1_fn() // 1 clock
//
// Basic operations without SIMD equivalent
@@ -195,11 +217,12 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
static inline __m512i mm512_not( const __m512i x )
{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
/*
// Unary negation: -x
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
*/
//
// Pointer casting
@@ -253,119 +276,43 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// expression using any number or combinations of AND, OR, XOR, NOT.
// a ^ b ^ c
#define mm512_xor3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x96 )
#define mm512_xor3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x96 )
// legacy convenience only
#define mm512_xor4( a, b, c, d ) \
_mm512_xor_si512( a, mm512_xor3( b, c, d ) )
#define mm512_xor4( a, b, c, d ) _mm512_xor_si512( a, mm512_xor3( b, c, d ) )
// a & b & c
#define mm512_and3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x80 )
#define mm512_and3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm512_or3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xfe )
#define mm512_or3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm512_xorand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x78 )
#define mm512_xorand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm512_andxor( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x60 )
#define mm512_andxor( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm512_xoror( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x1e )
#define mm512_xoror( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c ), xor( a, andnot( b, c ) )
#define mm512_xorandnot( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )
#define mm512_xorandnot( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm512_orand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xf8 )
#define mm512_orand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xf8 )
// Some 2 input operations that don't have their own instruction mnemonic.
// Use with caution, args are not expression safe.
// ~( a | b ), (~a) & (~b)
#define mm512_nor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x01 )
#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), (~a) ^ b
#define mm512_xnor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x81 )
#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
// ~( a & b )
#define mm512_nand( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0xef )
/*
// Diagonal blending
// Blend 8 64 bit elements from 8 vectors
#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
_mm512_mask_blend_epi64( 0x0f, \
_mm512_mask_blend_epi64( 0x30, \
_mm512_mask_blend_epi64( 0x40, v7, v6 ), \
_mm512_mask_blend_epi64( 0x40, v5, v4 ) ), \
_mm512_mask_blend_epi64( 0x03, \
_mm512_mask_blend_epi64( 0x04, v3, v2 ) \
_mm512_mask_blend_epi64( 0x01, v1, v0 ) ) )
// Blend 4 32 bit elements from each 128 bit lane.
#define mm512_diagonal128_32( v3, v2, v1, v0 ) \
_mm512_mask_blend_epi32( 0x3333, \
_mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
_mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
*/
/*
//
// Extended bit shift of concatenated packed elements from 2 vectors.
// Shift right returns low half, shift left returns high half.
#if defined(__AVX512VBMI2__)
#define mm512_shl2_64( v1, v2, c ) _mm512_shldi_epi64( v1, v2, c )
#define mm512_shr2_64( v1, v2, c ) _mm512_shrdi_epi64( v1, v2, c )
#define mm512_shl2_32( v1, v2, c ) _mm512_shldi_epi32( v1, v2, c )
#define mm512_shr2_32( v1, v2, c ) _mm512_shrdi_epi32( v1, v2, c )
#define mm512_shl2_16( v1, v2, c ) _mm512_shldi_epi16( v1, v2, c )
#define mm512_shr2_16( v1, v2, c ) _mm512_shrdi_epi16( v1, v2, c )
#else
#define mm512_shl2_64( v1, v2, c ) \
_mm512_or_si512( _mm512_slli_epi64( v1, c ), \
_mm512_srli_epi64( v2, 64 - (c) ) )
#define mm512_shr2_64( v1, v2, c ) \
_mm512_or_si512( _mm512_srli_epi64( v2, c ), \
_mm512_slli_epi64( v1, 64 - (c) ) )
#define mm512_shl2_32( v1, v2, c ) \
_mm512_or_si512( _mm512_slli_epi32( v1, c ), \
_mm512_srli_epi32( v2, 32 - (c) ) )
#define mm512_shr2_32( v1, v2, c ) \
_mm512_or_si512( _mm512_srli_epi32( v2, c ), \
_mm512_slli_epi32( v1, 32 - (c) ) )
#define mm512_shl2_16( v1, v2, c ) \
_mm512_or_si512( _mm512_slli_epi16( v1, c ), \
_mm512_srli_epi16( v2, 16 - (c) ) )
#define mm512_shr2_16( v1, v2, c ) \
_mm512_or_si512( _mm512_srli_epi16( v2, c ), \
_mm512_slli_epi16( v1, 16 - (c) ) )
#endif
*/
#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef )
// Bit rotations.
@@ -382,19 +329,6 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
/*
#if defined(__AVX512VBMI2__)
// Use C inline function in case arg is coded as an expression.
static inline __m512i mm512_ror_16( __m512i v, int c )
{ return _mm512_shrdi_epi16( v, v, c ); }
static inline __m512i mm512_rol_16( __m512i v, int c )
{ return _mm512_shldi_epi16( v, v, c ); }
#endif
*/
//
// Reverse byte order of packed elements, vectorized endian conversion.
@@ -423,10 +357,10 @@ static inline __m512i mm512_rol_16( __m512i v, int c )
// 8 lanes of 64 bytes each
#define mm512_block_bswap_64( d, s ) do \
{ \
__m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
const __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -440,10 +374,10 @@ static inline __m512i mm512_rol_16( __m512i v, int c )
// 16 lanes of 32 bytes each
#define mm512_block_bswap_32( d, s ) do \
{ \
__m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
const __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -455,30 +389,10 @@ static inline __m512i mm512_rol_16( __m512i v, int c )
} while(0)
// Cross-lane shuffles implementing rotate & shift of packed elements.
//
#define mm512_shiftr_256( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 4 )
#define mm512_shiftl_256( v ) mm512_shifr_256
#define mm512_shiftr_128( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 2 )
#define mm512_shiftl_128( v ) \
_mm512_alignr_epi64( v, _mm512_setzero, 6 )
#define mm512_shiftr_64( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 1 )
#define mm512_shiftl_64( v ) \
_mm512_alignr_epi64( v, _mm512_setzero, 7 )
#define mm512_shiftr_32( v ) \
_mm512_alignr_epi32( _mm512_setzero, v, 1 )
#define mm512_shiftl_32( v ) \
_mm512_alignr_epi32( v, _mm512_setzero, 15 )
// Shuffle-rotate elements left or right in 512 bit vector.
// Cross-lane shuffles implementing rotation of packed elements.
//
// Rotate elements across entire vector.
static inline __m512i mm512_swap_256( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 4 ); }
#define mm512_shuflr_256( v ) mm512_swap_256
@@ -512,16 +426,16 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x0000001F001E001D, 0x001C001B001A0019, \
0X0018001700160015, 0X0014001300120011, \
0X0010000F000E000D, 0X000C000B000A0009, \
0X0008000700060005, 0X0004000300020001 ), v )
0x0018001700160015, 0x0014001300120011, \
0x0010000F000E000D, 0x000C000B000A0009, \
0x0008000700060005, 0x0004000300020001 ), v )
#define mm512_shufll_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001E001D001C001B, 0x001A001900180017, \
0X0016001500140013, 0X001200110010000F, \
0X000E000D000C000B, 0X000A000900080007, \
0X0006000500040003, 0X000200010000001F ), v )
0x0016001500140013, 0x001200110010000F, \
0x000E000D000C000B, 0x000A000900080007, \
0x0006000500040003, 0x000200010000001F ), v )
#define mm512_shuflr_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
@@ -537,7 +451,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
0x1E1D1C1B1A191817, 0x161514131211100F, \
0x0E0D0C0B0A090807, 0x060504030201003F ) )
//
// 256 bit lanes used only by lyra2, move these there
// Rotate elements within 256 bit lanes of 512 bit vector.
// Swap hi & lo 128 bits in each 256 bit lane
@@ -549,6 +463,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
/* Not used
// Rotate 256 bit lanes by one 32 bit element
#define mm512_shuflr256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
@@ -591,10 +506,22 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
0x2e2d2c2b2a292827, 0x262524232221203f, \
0x1e1d1c1b1a191817, 0x161514131211100f, \
0x0e0d0c0b0a090807, 0x060504030201001f ) )
*/
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
// Rotate 128 bit lanes right by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source arg.
@@ -606,26 +533,11 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
_mm512_castsi512_ps( v2 ), c ) );
// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
// can be done with ror & rol. Defined only for convenience and consistency
// with AVX2 & SSE2 macros.
// 64 bit lanes
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
@@ -636,12 +548,14 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 )
#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 )
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
// 32 bit lanes
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
#endif // AVX512
#endif // SIMD_512_H__


@@ -34,10 +34,12 @@
//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
#define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) )
/*
// Unary negate elements
#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v )
#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v )
#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v )
*/
// Rotate bits in packed elements of 64 bit vector
#define mm64_rol_64( a, n ) \


@@ -55,6 +55,13 @@
typedef __int128 int128_t;
typedef unsigned __int128 uint128_t;
typedef union
{
uint128_t u128;
uint64_t u64[2];
uint32_t u32[4];
} __attribute__ ((aligned (16))) u128_ovly;
// Extracting the low bits is a trivial cast.
// These specialized functions are optimized while providing a
// consistent interface.

util.c

@@ -44,28 +44,22 @@
#include <libgen.h>
#endif
//#include "miner.h"
#include "elist.h"
#include "algo-gate-api.h"
#include "algo/sha/sha256d.h"
//extern pthread_mutex_t stats_lock;
struct data_buffer {
void *buf;
size_t len;
};
struct upload_buffer {
const void *buf;
size_t len;
size_t pos;
};
struct header_info {
char *lp_path;
char *reason;
char *stratum_url;
size_t content_length;
};
struct data_buffer {
void *buf;
size_t len;
size_t allocated;
struct header_info *headers;
};
struct tq_ent {
@@ -127,7 +121,6 @@ void applog2( int prio, const char *fmt, ... )
int len;
// struct tm tm;
// time_t now = time(NULL);
// localtime_r(&now, &tm);
switch ( prio )
@@ -395,67 +388,53 @@ static void databuf_free(struct data_buffer *db)
static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb,
void *user_data)
{
struct data_buffer *db = (struct data_buffer *) user_data;
struct data_buffer *db = user_data;
size_t len = size * nmemb;
size_t oldlen, newlen;
size_t newalloc, reqalloc;
void *newmem;
static const unsigned char zero = 0;
static const size_t max_realloc_increase = 8 * 1024 * 1024;
static const size_t initial_alloc = 16 * 1024;
oldlen = db->len;
newlen = oldlen + len;
/* minimum required allocation size */
reqalloc = db->len + len + 1;
newmem = realloc(db->buf, newlen + 1);
if (!newmem)
return 0;
if (reqalloc > db->allocated) {
if (db->len > 0) {
newalloc = db->allocated * 2;
} else {
if (db->headers->content_length > 0)
newalloc = db->headers->content_length + 1;
else
newalloc = initial_alloc;
}
db->buf = newmem;
db->len = newlen;
memcpy((uchar*) db->buf + oldlen, ptr, len);
memcpy((uchar*) db->buf + newlen, &zero, 1); /* null terminate */
if (db->headers->content_length == 0) {
/* limit the maximum buffer increase */
if (newalloc - db->allocated > max_realloc_increase)
newalloc = db->allocated + max_realloc_increase;
}
/* ensure we have a big enough allocation */
if (reqalloc > newalloc)
newalloc = reqalloc;
newmem = realloc(db->buf, newalloc);
if (!newmem)
return 0;
db->buf = newmem;
db->allocated = newalloc;
}
memcpy(db->buf + db->len, ptr, len); /* append new data */
memcpy(db->buf + db->len + len, &zero, 1); /* null terminate */
db->len += len;
return len;
}
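The growth policy above can be restated as a standalone helper; this is a hedged sketch, not part of the source, and the function name is hypothetical.
#include <stddef.h>
// Given the current allocation, the minimum required size and the advertised
// Content-Length, return the next allocation size: start at 16 KiB (or
// Content-Length + 1 when known), double on overflow, and cap each step at
// 8 MiB when the length is unknown.
static size_t next_alloc_size( size_t allocated, size_t required,
                               size_t content_length )
{
   const size_t initial  = 16 * 1024;
   const size_t max_step = 8 * 1024 * 1024;
   size_t next;
   if ( allocated == 0 )
      next = content_length ? content_length + 1 : initial;
   else
      next = allocated * 2;
   if ( content_length == 0 && next - allocated > max_step )
      next = allocated + max_step;
   if ( next < required ) next = required;
   return next;
}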
static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb,
void *user_data)
{
struct upload_buffer *ub = (struct upload_buffer *) user_data;
size_t len = size * nmemb;
if (len > ub->len - ub->pos)
len = ub->len - ub->pos;
if (len) {
memcpy(ptr, ((uchar*)ub->buf) + ub->pos, len);
ub->pos += len;
}
return len;
}
#if LIBCURL_VERSION_NUM >= 0x071200
static int seek_data_cb(void *user_data, curl_off_t offset, int origin)
{
struct upload_buffer *ub = (struct upload_buffer *) user_data;
switch (origin) {
case SEEK_SET:
ub->pos = (size_t) offset;
break;
case SEEK_CUR:
ub->pos += (size_t) offset;
break;
case SEEK_END:
ub->pos = ub->len + (size_t) offset;
break;
default:
return 1; /* CURL_SEEKFUNC_FAIL */
}
return 0; /* CURL_SEEKFUNC_OK */
}
#endif
static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
{
struct header_info *hi = (struct header_info *) user_data;
@@ -505,6 +484,9 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
val = NULL;
}
if (!strcasecmp("Content-Length", key))
hi->content_length = strtoul(val, NULL, 10);
out:
free(key);
free(val);
@@ -564,48 +546,38 @@ json_t *json_rpc_call(CURL *curl, const char *url,
int rc;
long http_rc;
struct data_buffer all_data = {0};
struct upload_buffer upload_data;
char *json_buf;
json_error_t err;
struct curl_slist *headers = NULL;
char len_hdr[64];
char curl_err_str[CURL_ERROR_SIZE] = { 0 };
long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30;
struct header_info hi = {0};
all_data.headers = &hi;
/* it is assumed that 'curl' is freshly [re]initialized at this pt */
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
if (opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_URL, url);
if (opt_cert)
curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
//
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
if (opt_cert) curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
curl_easy_setopt(curl, CURLOPT_ENCODING, "");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb);
curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data);
#if LIBCURL_VERSION_NUM >= 0x071200
curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb);
curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data);
#endif
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
if (opt_redirect)
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
if (opt_redirect) curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb);
curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi);
if (opt_proxy) {
if (opt_proxy)
{
curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy);
curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type);
}
if (userpass) {
if (userpass)
{
curl_easy_setopt(curl, CURLOPT_USERPWD, userpass);
curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
}
@@ -613,23 +585,16 @@ json_t *json_rpc_call(CURL *curl, const char *url,
if (flags & JSON_RPC_LONGPOLL)
curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
#endif
curl_easy_setopt(curl, CURLOPT_POST, 1);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, rpc_req);
if (opt_protocol)
applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req);
upload_data.buf = rpc_req;
upload_data.len = strlen(rpc_req);
upload_data.pos = 0;
sprintf(len_hdr, "Content-Length: %lu",
(unsigned long) upload_data.len);
headers = curl_slist_append(headers, "Content-Type: application/json");
headers = curl_slist_append(headers, len_hdr);
headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll reject-reason");
//headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/
//headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/
//headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr
//headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
@@ -786,18 +751,26 @@ err_out:
return cfg;
}
// Segwit BEGIN
void memrev(unsigned char *p, size_t len)
{
unsigned char c, *q;
for (q = p + len - 1; p < q; p++, q--) {
c = *p;
*p = *q;
*q = c;
if ( len == 32 )
{
__m128i *pv = (__m128i*)p;
__m128i t = mm128_bswap_128( pv[0] );
pv[0] = mm128_bswap_128( pv[1] );
pv[1] = t;
}
else
{
unsigned char c, *q;
for (q = p + len - 1; p < q; p++, q--)
{
c = *p;
*p = *q;
*q = c;
}
}
}
// Segwit END
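A minimal usage sketch; the buffer is a placeholder and the 32 byte length takes the vectorized fast path above.
static void memrev_example( void )
{
   unsigned char hash[32] = {0};      // placeholder contents
   memrev( hash, sizeof(hash) );      // reverse byte order in place
}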
void cbin2hex(char *out, const char *in, size_t len)
{
@@ -832,32 +805,42 @@ char *bebin2hex(const unsigned char *p, size_t len)
return s;
}
bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
bool hex2bin( unsigned char *p, const char *hexstr, const size_t len )
{
char hex_byte[3];
char *ep;
if( hexstr == NULL ) return false;
hex_byte[2] = '\0';
while (*hexstr && len) {
if (!hexstr[1]) {
applog(LOG_ERR, "hex2bin str truncated");
return false;
}
hex_byte[0] = hexstr[0];
hex_byte[1] = hexstr[1];
*p = (unsigned char) strtol(hex_byte, &ep, 16);
if (*ep) {
applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte);
return false;
}
p++;
hexstr += 2;
len--;
size_t hexstr_len = strlen( hexstr );
if( ( hexstr_len % 2 ) != 0 )
{
applog( LOG_ERR, "hex2bin string truncated" );
return false;
}
size_t bin_len = hexstr_len / 2;
if ( bin_len > len )
{
applog( LOG_ERR, "hex2bin buffer too small" );
return false;
}
return(!len) ? true : false;
/* return (len == 0 && *hexstr == 0) ? true : false; */
memset( p, 0, len );
size_t i = 0;
while ( i < hexstr_len )
{
char c = hexstr[i];
unsigned char nibble;
if ( c >= '0' && c <= '9' ) nibble = (c - '0');
else if ( c >= 'A' && c <= 'F' ) nibble = ( 10 + (c - 'A') );
else if ( c >= 'a' && c <= 'f' ) nibble = ( 10 + (c - 'a') );
else
{
applog( LOG_ERR, "hex2bin invalid hex" );
return false;
}
p[(i / 2)] |= (nibble << ( (1 - (i % 2) ) * 4) );
i++;
}
return true;
}
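A small usage sketch of the rewritten hex2bin; the string, buffer size and helper name are arbitrary.
static void hex2bin_example( void )
{
   // Decode an 8 character hex string into 4 bytes. The destination may be
   // larger than the decoded data; the remainder is zero filled by the
   // memset above.
   unsigned char bin[8];
   if ( hex2bin( bin, "deadbeef", sizeof(bin) ) )
   {
      // bin = { 0xde, 0xad, 0xbe, 0xef, 0, 0, 0, 0 }
   }
}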
int varint_encode(unsigned char *p, uint64_t n)
@@ -1339,6 +1322,43 @@ inline bool valid_hash( const void *hash, const void *target )
#endif
inline double nbits_to_diff( uint32_t nbits )
{
long double diff;
uint32_t shift = nbits & 0xff;
uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
int shift_off = (int)shift - 29;
// diff = (2**16 - 1) / ( 256**shift_off * bits )
// With uint128 the byte shift is good for 16 <= shift <= 41. Although values
// outside that range are unlikely, check just in case.
if ( shift_off >= -13 && shift_off <= 12 )
{ // fast
if ( shift_off == 0 )
diff = (long double)0xffff / (long double)bits;
else if ( shift_off < 0 ) // shift < 29
diff = (long double)( (uint128_t)0xffff << ( (-shift_off) *8 ) )
/ (long double)bits;
else // ( shift_off > 0 ) // shift > 29
diff = (long double)0xffff
/ (long double)( (uint128_t)bits << ( shift_off*8 ) );
}
else
{ // slow
int m;
diff = (long double)0xffff / (long double)bits;
for ( m = shift; m < 29; m++ ) diff *= 256.0;
for ( m = 29; m < shift; m++ ) diff /= 256.0;
}
if ( opt_debug )
applog( LOG_INFO, "nbits %08x: shift %u(%d), bits %06x, diff %8g",
nbits, shift, shift_off, bits, (double)diff );
return (double)diff;
}
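A worked instance of the fast path arithmetic; the mantissa is chosen by hand (the classic diff-1 compact target uses shift 0x1d and mantissa 0x00ffff) and the helper is hypothetical.
// shift = 29 gives shift_off = 0, so diff = 0xffff / bits.
static double diff1_example( void )
{
   const uint32_t bits = 0x00ffff;           // compact target mantissa
   return (double)0xffff / (double)bits;     // 1.0
}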
#ifdef WIN32
#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK)
#else
@@ -1507,7 +1527,8 @@ out:
return sret;
}
#if LIBCURL_VERSION_NUM >= 0x071101
#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
//#if LIBCURL_VERSION_NUM >= 0x071101
static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose,
struct curl_sockaddr *addr)
{
@@ -1575,7 +1596,8 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
#if LIBCURL_VERSION_NUM >= 0x070f06
curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
#endif
#if LIBCURL_VERSION_NUM >= 0x071101
#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
//#if LIBCURL_VERSION_NUM >= 0x071101
curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb);
curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock);
#endif
@@ -1589,7 +1611,10 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
return false;
}
#if LIBCURL_VERSION_NUM < 0x071101
#if LIBCURL_VERSION_NUM >= 0x072d00
curl_easy_getinfo(curl, CURLINFO_ACTIVESOCKET, &sctx->sock);
#elif LIBCURL_VERSION_NUM < 0x071101
//#if LIBCURL_VERSION_NUM < 0x071101
/* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */
curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock);
#endif
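The new guards compare LIBCURL_VERSION_NUM, which packs the version as 0xXXYYZZ (major/minor/patch): 0x071101 is 7.17.1, where CURLOPT_OPENSOCKETFUNCTION appeared, and 0x072d00 is 7.45.0, where CURLINFO_ACTIVESOCKET appeared. A trivial check of that encoding:

#include <stdio.h>

int main( void )
{
   const unsigned v = 0x072d00;   // the boundary used in the guards above
   printf( "%u.%u.%u\n", (v >> 16) & 0xff, (v >> 8) & 0xff, v & 0xff );  // prints 7.45.0
   return 0;
}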
@@ -1885,7 +1910,8 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
// find 0xffff tag
p = (uint8_t*) sctx->job.coinbase + 32;
m = p + 128;
m = p + sctx->job.coinbase_size - 32 - 2;
// m = p + 128;
while (*p != 0xff && p < m) p++;
while (*p == 0xff && p < m) p++;
if (*(p-1) == 0xff && *(p-2) == 0xff) {
@@ -1992,23 +2018,41 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
}
}
if ( merkle_count )
merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
for ( i = 0; i < merkle_count; i++ )
{
const char *s = json_string_value( json_array_get( merkle_arr, i ) );
if ( !s || strlen(s) != 64 )
{
while ( i-- ) free( merkle[i] );
free( merkle );
applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
goto out;
}
merkle[i] = (uchar*) malloc( 32 );
hex2bin( merkle[i], s, 32 );
}
pthread_mutex_lock( &sctx->work_lock );
pthread_mutex_lock( &sctx->work_lock );
if ( merkle_count )
{
if ( merkle_count > sctx->job.merkle_buf_size )
{
for ( i = 0; i < sctx->job.merkle_count; i++ )
free( sctx->job.merkle[i] );
free( sctx->job.merkle );
merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
for ( i = 0; i < merkle_count; i++ )
merkle[i] = (uchar*) malloc( 32 );
sctx->job.merkle_buf_size = merkle_count;
sctx->job.merkle = merkle;
}
for ( i = 0; i < merkle_count; i++ )
{
const char *s = json_string_value( json_array_get( merkle_arr, i ) );
if ( !s || strlen(s) != 64 )
{
for ( int j = sctx->job.merkle_buf_size; j > 0; j-- )
free( sctx->job.merkle[j - 1] );
free( sctx->job.merkle );
sctx->job.merkle_count =
sctx->job.merkle_buf_size = 0;
pthread_mutex_unlock( &sctx->work_lock );
applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
goto out;
}
hex2bin( sctx->job.merkle[i], s, 32 );
}
}
sctx->job.merkle_count = merkle_count;
coinb1_size = strlen( coinb1 ) / 2;
coinb2_size = strlen( coinb2 ) / 2;
@@ -2041,18 +2085,9 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
}
sctx->block_height = getblocheight( sctx );
for ( i = 0; i < sctx->job.merkle_count; i++ )
free( sctx->job.merkle[i] );
free( sctx->job.merkle );
sctx->job.merkle = merkle;
sctx->job.merkle_count = merkle_count;
hex2bin( sctx->job.nbits, nbits, 4 );
hex2bin( sctx->job.ntime, stime, 4 );
sctx->job.clean = clean;
sctx->job.diff = sctx->next_diff;
pthread_mutex_unlock( &sctx->work_lock );
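The merkle handling above keeps a persistent per-job buffer (sctx->job.merkle with capacity merkle_buf_size) and only reallocates when a notify arrives with more branches than previously seen, instead of freeing and rebuilding the whole array on every job. A minimal sketch of that grow-only pattern, with a hypothetical job_buf_t standing in for the relevant stratum_ctx fields:

#include <stdbool.h>
#include <stdlib.h>

typedef struct
{
   unsigned char **merkle;    // array of 32-byte branch hashes
   size_t merkle_count;       // branches used by the current job
   size_t merkle_buf_size;    // allocated capacity, grows but never shrinks
} job_buf_t;

// Ensure 'count' branches fit; mirrors the reallocation branch above.
static bool job_buf_reserve( job_buf_t *j, size_t count )
{
   if ( count <= j->merkle_buf_size ) return true;   // existing buffers reused
   for ( size_t i = 0; i < j->merkle_buf_size; i++ ) free( j->merkle[i] );
   free( j->merkle );
   j->merkle = malloc( count * sizeof(*j->merkle) );
   if ( !j->merkle ) { j->merkle_buf_size = 0; return false; }
   for ( size_t i = 0; i < count; i++ )
      j->merkle[i] = malloc( 32 );   // per-branch malloc failure handling elided
   j->merkle_buf_size = count;
   return true;
}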

View File

@@ -17,7 +17,9 @@ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
# Support for Windows 7 CPU groups, AES sometimes not included in -march
export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
#export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
export DEFAULT_CFLAGS="-maes -O3 -Wall"
export DEFAULT_CFLAGS_OLD="-O3 -Wall"
# make link to local gmp header file.
@@ -127,7 +129,7 @@ make clean || echo clean
# Native with CPU groups enabled
make clean || echo clean
rm -f config.status
CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe