Compare commits


4 Commits

Author      SHA1        Message  Date
Jay D Dee   de564ccbde  v3.22.2  2023-04-06 13:38:37 -04:00
Jay D Dee   fcd7727b0d  v3.22.1  2023-03-24 18:29:42 -04:00
Jay D Dee   3dd6787531  v3.22.0  2023-03-21 17:12:51 -04:00
Jay D Dee   cae1ce2ab7  v3.21.5  2023-03-15 12:27:04 -04:00
32 changed files with 3786 additions and 5532 deletions


@@ -55,9 +55,6 @@ cpuminer_SOURCES = \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \
algo/blake/blakecoin-4way.c \
algo/blake/decred-gate.c \
algo/blake/decred.c \
algo/blake/decred-4way.c \
algo/blake/pentablake-gate.c \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
@@ -178,6 +175,8 @@ cpuminer_SOURCES = \
algo/sha/sha256t.c \
algo/sha/sha256q-4way.c \
algo/sha/sha256q.c \
algo/sha/sha512256d-4way.c \
algo/sha/sha256dt.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \


@@ -65,7 +65,45 @@ If not what makes it happen or not happen?
Change Log
----------
v3.21.4
v3.22.2
Added sha512256d & sha256dt algos.
Fixed intermittent invalid shares with lyra2v2 on AVX512.
Removed application limits on the number of CPUs and threads; HW and OS limits still apply.
Added a log warning if more threads are defined than active CPUs in the affinity mask.
Improved merkle tree memory management for stratum.
Added transaction count to New Work log.
Other small improvements.
v3.22.1
#393 Fixed segfault in GBT, a regression from v3.22.0.
More efficient 32-bit data interleaving.
v3.22.0
Stratum: faster netdiff calculation.
Merged a few updates from Pooler/cpuminer:
Use CURLOPT_POSTFIELDS in json_rpc_call,
Use CURLINFO_ACTIVESOCKET when supported,
JSONRPC speedup,
Speed up hex2bin function.
Small log improvements, notably more frequent hash rate reports.
Removed decred algo.
v3.21.5
All issues with v3.21.3 & v3.21.4 should be resolved.
Changes since v3.21.2:
#392 #379 #389 Fixed misaligned address segfault when solo mining.
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
#392 Fixed conditional mining.
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
Windows binaries no longer support CPU groups,
Windows binaries support CPUs with up to 64 threads.
Small optimizations to serialized vectoring.
v3.21.4 CANCELLED
Reapply selected changes from v3.21.3.
#392 #379 #389 Fixed misaligned address segfault when solo mining.
@@ -74,7 +112,6 @@ Reapply selected changes from v3.21.3.
Windows binaries no longer support CPU groups,
Windows binaries support CPUs with up to 64 threads.
v3.21.3.1 UNRELEASED
Revert to 3.21.2
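
For context on the v3.22.1 item "More efficient 32-bit data interleaving": the *_4x32 helpers lay four independent input streams out word by word, so one 128-bit vector covers the same word index of all four lanes. A minimal scalar sketch of that layout (function name and loop are illustrative, not the repository's optimized code):

#include <stdint.h>
#include <stddef.h>

// Reference 4-lane 32-bit interleave: lane l's word i lands at dst[i*4 + l].
static void intrlv_4x32_ref( uint32_t *dst, const uint32_t *s0,
                             const uint32_t *s1, const uint32_t *s2,
                             const uint32_t *s3, size_t bitlen )
{
   for ( size_t i = 0; i < bitlen / 32; i++ )
   {
      dst[ i*4     ] = s0[i];
      dst[ i*4 + 1 ] = s1[i];
      dst[ i*4 + 2 ] = s2[i];
      dst[ i*4 + 3 ] = s3[i];
   }
}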

aclocal.m4 (vendored)

@@ -1,6 +1,6 @@
# generated automatically by aclocal 1.16.5 -*- Autoconf -*-
# generated automatically by aclocal 1.16.1 -*- Autoconf -*-
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -14,13 +14,13 @@
m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.71],,
[m4_warning([this file was generated for autoconf 2.71.
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
[m4_warning([this file was generated for autoconf 2.69.
You have another version of autoconf. It may work, but is not guaranteed to.
If you have problems, you may need to regenerate the build system entirely.
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
# Copyright (C) 2002-2021 Free Software Foundation, Inc.
# Copyright (C) 2002-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
[am__api_version='1.16'
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
dnl require some minimum version. Point them to the right macro.
m4_if([$1], [1.16.5], [],
m4_if([$1], [1.16.1], [],
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
])
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
[AM_AUTOMAKE_VERSION([1.16.5])dnl
[AM_AUTOMAKE_VERSION([1.16.1])dnl
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
# Figure out how to run the assembler. -*- Autoconf -*-
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`
# AM_CONDITIONAL -*- Autoconf -*-
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
Usually this means the macro was only invoked conditionally.]])
fi])])
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -352,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
# Generate code to set up dependency tracking. -*- Autoconf -*-
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -391,9 +391,7 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
done
if test $am_rc -ne 0; then
AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
for automatic dependency tracking. If GNU make was not used, consider
re-running the configure script with MAKE="gmake" (or whatever is
necessary). You can also try re-running configure with the
for automatic dependency tracking. Try re-running configure with the
'--disable-dependency-tracking' option to at least be able to build
the package (albeit without support for automatic dependency tracking).])
fi
@@ -420,7 +418,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
# Do all the work for Automake. -*- Autoconf -*-
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -448,10 +446,6 @@ m4_defn([AC_PROG_CC])
# release and drop the old call support.
AC_DEFUN([AM_INIT_AUTOMAKE],
[AC_PREREQ([2.65])dnl
m4_ifdef([_$0_ALREADY_INIT],
[m4_fatal([$0 expanded multiple times
]m4_defn([_$0_ALREADY_INIT]))],
[m4_define([_$0_ALREADY_INIT], m4_expansion_stack)])dnl
dnl Autoconf wants to disallow AM_ names. We explicitly allow
dnl the ones we care about.
m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
@@ -488,7 +482,7 @@ m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
[_AM_SET_OPTIONS([$1])dnl
dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
m4_if(
m4_ifset([AC_PACKAGE_NAME], [ok]):m4_ifset([AC_PACKAGE_VERSION], [ok]),
m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]),
[ok:ok],,
[m4_fatal([AC_INIT should be called with package and version arguments])])dnl
AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
@@ -540,20 +534,6 @@ AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
[m4_define([AC_PROG_OBJCXX],
m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
])
# Variables for tags utilities; see am/tags.am
if test -z "$CTAGS"; then
CTAGS=ctags
fi
AC_SUBST([CTAGS])
if test -z "$ETAGS"; then
ETAGS=etags
fi
AC_SUBST([ETAGS])
if test -z "$CSCOPE"; then
CSCOPE=cscope
fi
AC_SUBST([CSCOPE])
AC_REQUIRE([AM_SILENT_RULES])dnl
dnl The testsuite driver may need to know about EXEEXT, so add the
dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This
@@ -635,7 +615,7 @@ for _am_header in $config_headers :; do
done
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -656,7 +636,7 @@ if test x"${install_sh+set}" != xset; then
fi
AC_SUBST([install_sh])])
# Copyright (C) 2003-2021 Free Software Foundation, Inc.
# Copyright (C) 2003-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -678,7 +658,7 @@ AC_SUBST([am__leading_dot])])
# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
# From Jim Meyering
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -713,7 +693,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
# Check to see how 'make' treats includes. -*- Autoconf -*-
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -756,7 +736,7 @@ AC_SUBST([am__quote])])
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -777,7 +757,12 @@ AC_DEFUN([AM_MISSING_HAS_RUN],
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
AC_REQUIRE_AUX_FILE([missing])dnl
if test x"${MISSING+set}" != xset; then
MISSING="\${SHELL} '$am_aux_dir/missing'"
case $am_aux_dir in
*\ * | *\ *)
MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
*)
MISSING="\${SHELL} $am_aux_dir/missing" ;;
esac
fi
# Use eval to expand $SHELL
if eval "$MISSING --is-lightweight"; then
@@ -790,7 +775,7 @@ fi
# Helper functions for option handling. -*- Autoconf -*-
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -819,7 +804,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
AC_DEFUN([_AM_IF_OPTION],
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -866,7 +851,7 @@ AC_LANG_POP([C])])
# For backward compatibility.
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -885,7 +870,7 @@ AC_DEFUN([AM_RUN_LOG],
# Check to make sure that the build environment is sane. -*- Autoconf -*-
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -966,7 +951,7 @@ AC_CONFIG_COMMANDS_PRE(
rm -f conftest.file
])
# Copyright (C) 2009-2021 Free Software Foundation, Inc.
# Copyright (C) 2009-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1026,7 +1011,7 @@ AC_SUBST([AM_BACKSLASH])dnl
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
])
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1054,7 +1039,7 @@ fi
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
AC_SUBST([INSTALL_STRIP_PROGRAM])])
# Copyright (C) 2006-2021 Free Software Foundation, Inc.
# Copyright (C) 2006-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1073,7 +1058,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
# Check how to create a tarball. -*- Autoconf -*-
# Copyright (C) 2004-2021 Free Software Foundation, Inc.
# Copyright (C) 2004-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,


@@ -263,8 +263,6 @@ void init_algo_gate( algo_gate_t* gate )
gate->build_block_header = (void*)&std_build_block_header;
gate->build_extraheader = (void*)&std_build_extraheader;
gate->set_work_data_endian = (void*)&do_nothing;
gate->calc_network_diff = (void*)&std_calc_network_diff;
gate->ready_to_mine = (void*)&std_ready_to_mine;
gate->resync_threads = (void*)&do_nothing;
gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
@@ -308,7 +306,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_BLAKECOIN: rc = register_blakecoin_algo ( gate ); break;
case ALGO_BMW512: rc = register_bmw512_algo ( gate ); break;
case ALGO_C11: rc = register_c11_algo ( gate ); break;
case ALGO_DECRED: rc = register_decred_algo ( gate ); break;
case ALGO_DEEP: rc = register_deep_algo ( gate ); break;
case ALGO_DMD_GR: rc = register_dmd_gr_algo ( gate ); break;
case ALGO_GROESTL: rc = register_groestl_algo ( gate ); break;
@@ -340,9 +337,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_QUBIT: rc = register_qubit_algo ( gate ); break;
case ALGO_SCRYPT: rc = register_scrypt_algo ( gate ); break;
case ALGO_SHA256D: rc = register_sha256d_algo ( gate ); break;
case ALGO_SHA256DT: rc = register_sha256dt_algo ( gate ); break;
case ALGO_SHA256Q: rc = register_sha256q_algo ( gate ); break;
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
@@ -427,7 +426,6 @@ const char* const algo_alias_map[][2] =
{ "blake256r8", "blakecoin" },
{ "blake256r8vnl", "vanilla" },
{ "blake256r14", "blake" },
{ "blake256r14dcr", "decred" },
{ "diamond", "dmd-gr" },
{ "espers", "hmq1725" },
{ "flax", "c11" },


@@ -144,7 +144,7 @@ void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t,
uint32_t*, uint32_t, uint32_t,
unsigned char* );
// Build mining.submit message
@@ -155,19 +155,13 @@ char* ( *malloc_txs_request ) ( struct work* );
// Big endian or little endian
void ( *set_work_data_endian ) ( struct work* );
double ( *calc_network_diff ) ( struct work* );
// Wait for first work
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( int, struct work* );
// No longer needed
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
set_t optimizations;
int ( *get_work_data_size ) ();
@@ -286,8 +280,6 @@ char* std_malloc_txs_request( struct work *work );
// Default is do_nothing, little endian is assumed
void set_work_data_big_endian( struct work *work );
double std_calc_network_diff( struct work *work );
void std_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits,
@@ -297,9 +289,6 @@ void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );
bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id );
int std_get_work_data_size();
// Gate admin functions


@@ -1,74 +0,0 @@
#include "decred-gate.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#include <unistd.h>
#if defined (DECRED_4WAY)
static __thread blake256_4way_context blake_mid;
void decred_hash_4way( void *state, const void *input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
// uint32_t hash0[8] __attribute__ ((aligned (32)));
// uint32_t hash1[8] __attribute__ ((aligned (32)));
// uint32_t hash2[8] __attribute__ ((aligned (32)));
// uint32_t hash3[8] __attribute__ ((aligned (32)));
const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
int tail_len = 180 - DECRED_MIDSTATE_LEN;
blake256_4way_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way_update( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
// copy to buffer guaranteed to be aligned.
memcpy( edata, pdata, 180 );
// use the old way until new way updated for size.
mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {
* noncep = n;
*(noncep+1) = n+1;
*(noncep+2) = n+2;
*(noncep+3) = n+3;
decred_hash_4way( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= HTarget )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[DECRED_NONCE_INDEX] = n+i;
submit_solution( work, hash+(i<<3), mythr );
}
n += 4;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif


@@ -1,171 +0,0 @@
#include "decred-gate.h"
#include <unistd.h>
#include <memory.h>
#include <string.h>
uint32_t *decred_get_nonceptr( uint32_t *work_data )
{
return &work_data[ DECRED_NONCE_INDEX ];
}
long double decred_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
int m;
long double d = (long double)0x0000ffff / (long double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
for ( m = 29; m < shift; m++ )
d /= 256.0;
if ( shift == 28 )
d *= 256.0; // testnet
if ( opt_debug_diff )
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", (double)d,
shift, bits );
return net_diff;
}
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
{
// some random extradata to make the work unique
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
work->height = work->data[32];
if (!have_longpoll && work->height > *net_blocks + 1)
{
char netinfo[64] = { 0 };
if ( net_diff > 0. )
{
if (net_diff != work->targetdiff)
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
work->targetdiff);
else
sprintf(netinfo, ", diff %.3f", net_diff);
}
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
netinfo);
*net_blocks = work->height - 1;
}
}
void decred_be_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
sctx->xnonce1_size );
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
#if !defined(min)
#define min(a,b) (a>b ? (b) :(a))
#endif
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
uchar merkle_root[64] = { 0 };
uint32_t extraheader[32] = { 0 };
int headersize = 0;
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
int i;
// getwork over stratum, getwork merkle + header passed in coinb1
memcpy(merkle_root, sctx->job.coinbase, 32);
headersize = min((int)sctx->job.coinbase_size - 32,
sizeof(extraheader) );
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = swab32(
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
// for ( i = 0; i < 8; i++ ) // prevhash
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
// for ( i = 0; i < 8; i++ ) // merkle
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
for ( i = 0; i < headersize/4; i++ ) // header
g_work->data[17 + i] = extraheader[i];
// extradata
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
g_work->data[i] = 0;
g_work->data[37] = (rand()*4) << 8;
// block header suffix from coinb2 (stake version)
memcpy( &g_work->data[44],
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
sctx->block_height = g_work->data[32];
//applog_hex(work->data, 180);
//applog_hex(&work->data[36], 36);
}
#undef min
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
// need to regen g_work..
return false;
if ( have_stratum && !work->data[0] && !opt_benchmark )
{
sleep(1);
return false;
}
// extradata: prevent duplicates
work->data[ DECRED_XNONCE_INDEX ] += 1;
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
return true;
}
int decred_get_work_data_size() { return DECRED_DATA_SIZE; }
bool register_decred_algo( algo_gate_t* gate )
{
#if defined(DECRED_4WAY)
four_way_not_tested();
gate->scanhash = (void*)&scanhash_decred_4way;
gate->hash = (void*)&decred_hash_4way;
#else
gate->scanhash = (void*)&scanhash_decred;
gate->hash = (void*)&decred_hash;
#endif
gate->optimizations = AVX2_OPT;
// gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->decode_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->build_extraheader = (void*)&decred_build_extraheader;
gate->ready_to_mine = (void*)&decred_ready_to_mine;
gate->nbits_index = DECRED_NBITS_INDEX;
gate->ntime_index = DECRED_NTIME_INDEX;
gate->nonce_index = DECRED_NONCE_INDEX;
gate->get_work_data_size = (void*)&decred_get_work_data_size;
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
allow_mininginfo = false;
have_gbt = false;
return true;
}


@@ -1,36 +0,0 @@
#ifndef __DECRED_GATE_H__
#define __DECRED_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#define DECRED_NBITS_INDEX 29
#define DECRED_NTIME_INDEX 34
#define DECRED_NONCE_INDEX 35
#define DECRED_XNONCE_INDEX 36
#define DECRED_DATA_SIZE 192
#define DECRED_WORK_COMPARE_SIZE 140
#define DECRED_MIDSTATE_LEN 128
#if defined (__AVX2__)
//void blakehash_84way(void *state, const void *input);
//int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
#endif
#if defined(__SSE4_2__)
#define DECRED_4WAY
#endif
#if defined (DECRED_4WAY)
void decred_hash_4way(void *state, const void *input);
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void decred_hash( void *state, const void *input );
int scanhash_decred( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif


@@ -1,282 +0,0 @@
#include "decred-gate.h"
#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
#include "sph_blake.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#include <unistd.h>
/*
#ifndef min
#define min(a,b) (a>b ? b : a)
#endif
#ifndef max
#define max(a,b) (a<b ? b : a)
#endif
*/
/*
#define DECRED_NBITS_INDEX 29
#define DECRED_NTIME_INDEX 34
#define DECRED_NONCE_INDEX 35
#define DECRED_XNONCE_INDEX 36
#define DECRED_DATA_SIZE 192
#define DECRED_WORK_COMPARE_SIZE 140
*/
static __thread sph_blake256_context blake_mid;
static __thread bool ctx_midstate_done = false;
void decred_hash(void *state, const void *input)
{
// #define MIDSTATE_LEN 128
sph_blake256_context ctx __attribute__ ((aligned (64)));
uint8_t *ending = (uint8_t*) input;
ending += DECRED_MIDSTATE_LEN;
if (!ctx_midstate_done) {
sph_blake256_init(&blake_mid);
sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
ctx_midstate_done = true;
}
memcpy(&ctx, &blake_mid, sizeof(blake_mid));
sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
sph_blake256_close(&ctx, state);
}
void decred_hash_simple(void *state, const void *input)
{
sph_blake256_context ctx;
sph_blake256_init(&ctx);
sph_blake256(&ctx, input, 180);
sph_blake256_close(&ctx, state);
}
int scanhash_decred( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) endiandata[48];
uint32_t _ALIGN(64) hash32[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
// #define DCR_NONCE_OFT32 35
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t n = first_nonce;
ctx_midstate_done = false;
#if 1
memcpy(endiandata, pdata, 180);
#else
for (int k=0; k < (180/4); k++)
be32enc(&endiandata[k], pdata[k]);
#endif
do {
//be32enc(&endiandata[DCR_NONCE_OFT32], n);
endiandata[DECRED_NONCE_INDEX] = n;
decred_hash(hash32, endiandata);
if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
{
pdata[DECRED_NONCE_INDEX] = n;
submit_solution( work, hash32, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[DECRED_NONCE_INDEX] = n;
return 0;
}
/*
uint32_t *decred_get_nonceptr( uint32_t *work_data )
{
return &work_data[ DECRED_NONCE_INDEX ];
}
double decred_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
int m;
double d = (double)0x0000ffff / (double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
for ( m = 29; m < shift; m++ )
d /= 256.0;
if ( shift == 28 )
d *= 256.0; // testnet
if ( opt_debug_diff )
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
shift, bits );
return net_diff;
}
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
{
// some random extradata to make the work unique
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
work->height = work->data[32];
if (!have_longpoll && work->height > *net_blocks + 1)
{
char netinfo[64] = { 0 };
if (net_diff > 0.)
{
if (net_diff != work->targetdiff)
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
work->targetdiff);
else
sprintf(netinfo, ", diff %.3f", net_diff);
}
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
netinfo);
*net_blocks = work->height - 1;
}
}
void decred_be_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
sctx->xnonce1_size );
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
*/
/*
// data shared between gen_merkle_root and build_extraheader.
__thread uint32_t decred_extraheader[32] = { 0 };
__thread int decred_headersize = 0;
void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
{
// getwork over stratum, getwork merkle + header passed in coinb1
memcpy(merkle_root, sctx->job.coinbase, 32);
decred_headersize = min((int)sctx->job.coinbase_size - 32,
sizeof(decred_extraheader) );
memcpy( decred_extraheader, &sctx->job.coinbase[32], decred_headersize);
}
*/
/*
#define min(a,b) (a>b ? (b) :(a))
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
uchar merkle_root[64] = { 0 };
uint32_t extraheader[32] = { 0 };
int headersize = 0;
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
size_t t;
int i;
// getwork over stratum, getwork merkle + header passed in coinb1
memcpy(merkle_root, sctx->job.coinbase, 32);
headersize = min((int)sctx->job.coinbase_size - 32,
sizeof(extraheader) );
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = swab32(
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
// for ( i = 0; i < 8; i++ ) // prevhash
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
// for ( i = 0; i < 8; i++ ) // merkle
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
for ( i = 0; i < headersize/4; i++ ) // header
g_work->data[17 + i] = extraheader[i];
// extradata
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
g_work->data[i] = 0;
g_work->data[37] = (rand()*4) << 8;
// block header suffix from coinb2 (stake version)
memcpy( &g_work->data[44],
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
sctx->bloc_height = g_work->data[32];
//applog_hex(work->data, 180);
//applog_hex(&work->data[36], 36);
}
#undef min
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
// need to regen g_work..
return false;
if ( have_stratum && !work->data[0] && !opt_benchmark )
{
sleep(1);
return false;
}
// extradata: prevent duplicates
work->data[ DECRED_XNONCE_INDEX ] += 1;
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
return true;
}
bool register_decred_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_decred;
gate->hash = (void*)&decred_hash;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->decode_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->build_extraheader = (void*)&decred_build_extraheader;
gate->ready_to_mine = (void*)&decred_ready_to_mine;
gate->nbits_index = DECRED_NBITS_INDEX;
gate->ntime_index = DECRED_NTIME_INDEX;
gate->nonce_index = DECRED_NONCE_INDEX;
gate->work_data_size = DECRED_DATA_SIZE;
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
allow_mininginfo = false;
have_gbt = false;
return true;
}
*/
#endif


@@ -103,16 +103,16 @@
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2] ); \
V3 = mm128_alignr_64( V[2], V[3] ); \
V6 = mm128_alignr_64( V[6], V[7] ); \
V7 = mm128_alignr_64( V[7], V[6] ); \
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3 ); \
V[3] = mm128_alignr_64( V3, V2 ); \
V[6] = mm128_alignr_64( V7, V6 ); \
V[7] = mm128_alignr_64( V6, V7 ); \
V[2] = mm128_alignr_64( V2, V3, 1 ); \
V[3] = mm128_alignr_64( V3, V2, 1 ); \
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
#else
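
The third argument added throughout this hunk is an explicit shift count in 64-bit lanes. A definition consistent with that usage, shown only as an assumption (the real helper lives in the project's simd-utils):

#include <tmmintrin.h>   // SSSE3 _mm_alignr_epi8

// Assumed shape: concatenate hi:lo and shift right by c 64-bit lanes,
// i.e. palignr with the byte count scaled by 8.
#define mm128_alignr_64_sketch( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )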


@@ -73,11 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
be32enc(&endiandata[19], nonce);
myriad_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget))
if (hash[7] <= Htarg )
if ( fulltest(hash, ptarget) && !opt_benchmark )
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
submit_solution( work, hash, mythr );
}
nonce++;


@@ -554,20 +554,10 @@ int luffa_4way_update_close( luffa_4way_context *state,
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
/*
#define MULT2( a0, a1, mask ) \
do { \
__m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
*/
#define MULT2( a0, a1, mask ) \
#define MULT2( a0, a1 ) \
{ \
__m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256( a1, mask ), 16 ) ); \
__m256i b = _mm256_xor_si256( a0, _mm256_shuffle_epi32( \
_mm256_blend_epi32( a1, m256_zero, 0xee ), 16 ) ); \
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
}
@@ -682,7 +672,6 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
__m256i *chainv = state->chainv;
__m256i msg0, msg1;
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
const __m256i MASK = m256_const1_i128( 0xffffffff );
t0 = chainv[0];
t1 = chainv[1];
@@ -696,7 +685,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] );
MULT2( t0, t1, MASK );
MULT2( t0, t1 );
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
@@ -715,66 +704,66 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1], MASK );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3], MASK );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5], MASK );
MULT2( chainv[4], chainv[5] );
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7], MASK );
MULT2( chainv[6], chainv[7] );
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9], MASK );
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
MULT2( chainv[8], chainv[9], MASK );
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7], MASK );
MULT2( chainv[6], chainv[7] );
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5], MASK );
MULT2( chainv[4], chainv[5] );
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3], MASK );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1], MASK );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[3] = mm256_rol_32( chainv[3], 1 );
chainv[5] = mm256_rol_32( chainv[5], 2 );
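
Why the MASK parameter disappears from MULT2 here: on AVX2, the AND-with-constant that isolated the low 32-bit word of each 128-bit lane can be done with an immediate blend against zero, so no mask constant has to live in a register. A minimal equivalence sketch (assumes AVX2; function names are illustrative):

#include <immintrin.h>

// mask = low 32 bits set in each 128-bit lane, i.e. words 0 and 4.
__m256i keep_lane_lo_and( __m256i a1, __m256i mask )
{  return _mm256_and_si256( a1, mask );  }

// 0xee = 0b11101110: take zero for words 1-3 and 5-7, keep a1's words 0 and 4.
__m256i keep_lane_lo_blend( __m256i a1 )
{  return _mm256_blend_epi32( a1, _mm256_setzero_si256(), 0xee );  }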


@@ -19,26 +19,34 @@
*/
#include <string.h>
#include <emmintrin.h>
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#if defined(__SSE4_1__)
#if defined(__AVX512VL__)
#define MULT2( a0, a1 ) \
{ \
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi32( a1, b, 1 ); \
a1 = _mm_alignr_epi32( b, a1, 1 ); \
}
#elif defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0)
#else
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif
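
The SSE4.1 path above replaces the or/srli/slli triple with a single palignr (an SSSE3 instruction). A standalone check of that equivalence for the 4-byte shift used here (illustrative only; compile with -mssse3):

#include <stdio.h>
#include <string.h>
#include <tmmintrin.h>

int main(void)
{
   __m128i b  = _mm_set_epi32( 0x44444444, 0x33333333, 0x22222222, 0x11111111 );
   __m128i a1 = _mm_set_epi32( 0x88888888, 0x77777777, 0x66666666, 0x55555555 );
   // old form: bytes b[4..15] in the low 12 bytes, a1[0..3] in the top 4
   __m128i old_way = _mm_or_si128( _mm_srli_si128( b, 4 ),
                                   _mm_slli_si128( a1, 12 ) );
   // new form: ((a1:b) >> 4 bytes), low 128 bits -- the same byte pattern
   __m128i new_way = _mm_alignr_epi8( a1, b, 4 );
   printf( "%s\n", memcmp( &old_way, &new_way, 16 ) ? "differ" : "equal" );
   return 0;
}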


@@ -75,7 +75,7 @@ void lyra2rev2_16way_hash( void *state, const void *input )
keccak256_8way_close( &ctx.keccak, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash5, vhash, 256 );
hash12, hash13, hash14, hash15, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );


@@ -146,14 +146,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
__m128i t; \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_vrol256_64( s6, s7 ); \
mm128_vror256_64( s2, s3 ); \
t = mm128_alignr_64( s7, s6, 1 ); \
s6 = mm128_alignr_64( s6, s7, 1 ); \
s7 = t; \
t = mm128_alignr_64( s2, s3, 1 ); \
s2 = mm128_alignr_64( s3, s2, 1 ); \
s3 = t; \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
mm128_vror256_64( s6, s7 ); \
mm128_vrol256_64( s2, s3 ); \
t = mm128_alignr_64( s6, s7, 1 ); \
s6 = mm128_alignr_64( s7, s6, 1 ); \
s7 = t; \
t = mm128_alignr_64( s3, s2, 1 ); \
s2 = mm128_alignr_64( s2, s3, 1 ); \
s3 = t; \
}
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \


@@ -4,24 +4,6 @@
#include <string.h>
#include <stdio.h>
long double lbry_calc_network_diff( struct work *work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
long double d = (long double)0x0000ffff / (long double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;
if (opt_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
return d;
}
// std_le should work but it doesn't
void lbry_le_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
@@ -41,31 +23,6 @@ void lbry_le_build_stratum_request( char *req, struct work *work,
free(xnonce2str);
}
/*
void lbry_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits )
{
int i;
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = version;
if ( have_stratum )
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( prevhash + i );
else
for (i = 0; i < 8; i++)
g_work->data[ 8-i ] = le32dec( prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( merkle_root + i );
g_work->data[ LBRY_NTIME_INDEX ] = ntime;
g_work->data[ LBRY_NBITS_INDEX ] = nbits;
g_work->data[28] = 0x80000000;
}
*/
void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
unsigned char merkle_root[64] = { 0 };
@@ -112,9 +69,7 @@ bool register_lbry_algo( algo_gate_t* gate )
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#endif
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
// gate->build_block_header = (void*)&build_block_header;
gate->build_extraheader = (void*)&lbry_build_extraheader;
gate->ntime_index = LBRY_NTIME_INDEX;
gate->nbits_index = LBRY_NBITS_INDEX;

algo/sha/sha256dt.c (new file)

@@ -0,0 +1,268 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha-hash-4way.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256DT_16WAY 1
#elif defined(__AVX2__)
#define SHA256DT_8WAY 1
#else
#define SHA256DT_4WAY 1
#endif
#if defined(SHA256DT_16WAY)
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i vdata[32] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i initstate[8] __attribute__ ((aligned (64)));
__m512i midstate1[8] __attribute__ ((aligned (64)));
__m512i midstate2[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm512_bcast_i32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = mm512_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = mm512_bcast_i32( 0x300 );
initstate[0] = mm512_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm512_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm512_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm512_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm512_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm512_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm512_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm512_bcast_i64( 0x9f75c9ad9f75c9ad );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
do
{
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
sha256_16way_transform_le( hash32, block, initstate );
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_8WAY)
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m256i vdata[32] __attribute__ ((aligned (64)));
__m256i block[16] __attribute__ ((aligned (32)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm256_bcast_i32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = mm256_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = mm256_bcast_i32( 0x300 );
// initialize state
initstate[0] = mm256_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm256_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm256_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm256_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm256_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm256_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm256_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm256_bcast_i64( 0x9f75c9ad9f75c9ad );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
do
{
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
sha256_8way_transform_le( hash32, block, initstate );
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_4WAY)
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm128_bcast_i32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = mm128_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = mm128_bcast_i32( 0x300 );
// initialize state
initstate[0] = mm128_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm128_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm128_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm128_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm128_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm128_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm128_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm128_bcast_i64( 0x9f75c9ad9f75c9ad );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate, vdata, initstate );
do
{
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( hash32, block, initstate );
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
bool register_sha256dt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256DT_16WAY)
gate->scanhash = (void*)&scanhash_sha256dt_16way;
#elif defined(SHA256DT_8WAY)
gate->scanhash = (void*)&scanhash_sha256dt_8way;
#else
gate->scanhash = (void*)&scanhash_sha256dt_4way;
#endif
return true;
}
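
The structure common to all three widths above, restated as a scalar sketch. It assumes an in-place compression primitive void sha256_transform_le( uint32_t state[8], const uint32_t block[16] ) on little-endian words, analogous to the *_transform_le helpers used here; byte-order handling of the final digest is omitted. Note the nonstandard IV and the nonstandard length words 0x480 and 0x300:

#include <stdint.h>
#include <string.h>

void sha256_transform_le( uint32_t state[8], const uint32_t block[16] ); // assumed

static const uint32_t sha256dt_iv[8] =
{
   0xdfa9bf2c, 0xb72074d4, 0x6bb01122, 0xd338e869,
   0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
};

void sha256dt_ref( uint32_t hash[8], const uint32_t data[20] ) // 80-byte header
{
   uint32_t s[8], block[16] = { 0 };

   memcpy( s, sha256dt_iv, 32 );
   sha256_transform_le( s, data );      // first 64 header bytes
   memcpy( block, data + 16, 16 );      // last 16 header bytes, nonce in block[3]
   block[ 4] = 0x80000000;              // padding marker
   block[15] = 0x480;                   // nonstandard length word
   sha256_transform_le( s, block );     // first hash

   memset( block, 0, 64 );
   memcpy( block, s, 32 );              // hash the 32-byte digest
   block[ 8] = 0x80000000;
   block[15] = 0x300;                   // nonstandard length word
   memcpy( s, sha256dt_iv, 32 );
   sha256_transform_le( s, block );     // second hash
   memcpy( hash, s, 32 );
}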

algo/sha/sha512256d-4way.c (new file)

@@ -0,0 +1,221 @@
#include "algo-gate-api.h"
#include "sha-hash-4way.h"
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1
#endif
#if defined(SHA512256D_8WAY)
static void sha512256d_8way_init( sha512_8way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = mm512_bcast_i64( 0x22312194FC2BF72C );
ctx->val[1] = mm512_bcast_i64( 0x9F555FA3C84C64C2 );
ctx->val[2] = mm512_bcast_i64( 0x2393B86B6F53B151 );
ctx->val[3] = mm512_bcast_i64( 0x963877195940EABD );
ctx->val[4] = mm512_bcast_i64( 0x96283EE2A88EFFE3 );
ctx->val[5] = mm512_bcast_i64( 0xBE5E1E2553863992 );
ctx->val[6] = mm512_bcast_i64( 0x2B0199FC2C85B8AA );
ctx->val[7] = mm512_bcast_i64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
sha512_8way_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*8]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i eight = mm512_bcast_i64( 0x0000000800000000 );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do
{
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, vdata, 80 );
sha512_8way_close( &ctx, hash );
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, hash, 32 );
sha512_8way_close( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(SHA512256D_4WAY)
static void sha512256d_4way_init( sha512_4way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = mm256_bcast_i64( 0x22312194FC2BF72C );
ctx->val[1] = mm256_bcast_i64( 0x9F555FA3C84C64C2 );
ctx->val[2] = mm256_bcast_i64( 0x2393B86B6F53B151 );
ctx->val[3] = mm256_bcast_i64( 0x963877195940EABD );
ctx->val[4] = mm256_bcast_i64( 0x96283EE2A88EFFE3 );
ctx->val[5] = mm256_bcast_i64( 0xBE5E1E2553863992 );
ctx->val[6] = mm256_bcast_i64( 0x2B0199FC2C85B8AA );
ctx->val[7] = mm256_bcast_i64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
sha512_4way_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = mm256_bcast_i64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, vdata, 80 );
sha512_4way_close( &ctx, hash );
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, hash, 32 );
sha512_4way_close( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#else
#include "sph_sha2.h"
static const uint64_t H512_256[8] =
{
0x22312194FC2BF72C, 0x9F555FA3C84C64C2,
0x2393B86B6F53B151, 0x963877195940EABD,
0x96283EE2A88EFFE3, 0xBE5E1E2553863992,
0x2B0199FC2C85B8AA, 0x0EB72DDC81C52CA2,
};
static void sha512256d_init( sph_sha512_context *ctx )
{
memcpy( ctx->val, H512_256, sizeof H512_256 );
ctx->count = 0;
}
int scanhash_sha512256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t hash64[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
sph_sha512_context ctx;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
do {
be32enc( &endiandata[19], n );
sha512256d_init( &ctx );
sph_sha512( &ctx, endiandata, 80 );
sph_sha512_close( &ctx, hash64 );
sha512256d_init( &ctx );
sph_sha512( &ctx, hash64, 32 );
sph_sha512_close( &ctx, hash64 );
if ( hash64[7] <= Htarg )
if ( fulltest( hash64, ptarget ) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
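// The H512_256 values above are the FIPS 180-4 SHA-512/256 initial hash
// values, so each pass is SHA-512 restarted from the SHA-512/256 IVs with
// the digest truncated to 256 bits. A minimal reference sketch of what all
// three code paths compute (helper name hypothetical, assumes <string.h>):
// sha512256d(x) = trunc256( H( trunc256( H( x ) ) ) ).
static void sha512256d_ref( void *out32, const void *data, size_t len )
{
   uint64_t h[8];
   sph_sha512_context c;
   sha512256d_init( &c );
   sph_sha512( &c, data, len );
   sph_sha512_close( &c, h );    // full 512 bit digest
   sha512256d_init( &c );
   sph_sha512( &c, h, 32 );      // second pass hashes only the first 256 bits
   sph_sha512_close( &c, h );
   memcpy( out32, h, 32 );       // final 256 bit truncation
}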
#endif
bool register_sha512256d_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(SHA512256D_8WAY)
gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)
gate->scanhash = (void*)&scanhash_sha512256d_4way;
#else
gate->scanhash = (void*)&scanhash_sha512256d;
#endif
return true;
};


@@ -31,18 +31,19 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);
skeinhash(hash64, endiandata);
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
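// Note: this rewrite, here and in skein2 below, matches the pattern already
// used by the scalar sha512256d above. The old code returned on the first
// qualifying hash, ending the scan early; the new pattern submits the share
// and keeps iterating nonces until max_nonce or a work restart, updating
// pdata[19] and *hashes_done once on exit.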


@@ -34,31 +34,31 @@ void skein2hash(void *output, const void *input)
sph_skein512_close(&ctx_skein, hash);
memcpy(output, hash, 32);
}
int scanhash_skein2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t hash64[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);
skein2hash(hash64, endiandata);
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);


@@ -257,6 +257,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
const __m512i eight = m512_const1_64( 8 );
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
@@ -470,6 +471,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
const __m256i four = m256_const1_64( 4 );
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );

configure (vendored): file diff suppressed because it is too large.


@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.21.4])
AC_INIT([cpuminer-opt], [3.22.2])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM


@@ -3,7 +3,7 @@
* Copyright 2012-2014 pooler
* Copyright 2014 Lucas Jones
* Copyright 2014-2016 Tanguy Pruvot
* Copyright 2016-2021 Jay D Dee
* Copyright 2016-2023 Jay D Dee
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
@@ -37,7 +37,7 @@
#include <curl/curl.h>
#include <jansson.h>
#include <openssl/sha.h>
#include <mm_malloc.h>
//#include <mm_malloc.h>
#include "sysinfos.c"
#include "algo/sha/sha256d.h"
@@ -121,7 +121,6 @@ static uint64_t opt_affinity = 0xFFFFFFFFFFFFFFFFULL; // default, use all cores
int opt_priority = 0; // deprecated
int num_cpus = 1;
int num_cpugroups = 1; // For Windows
#define max_cpus 256 // max for affinity
char *rpc_url = NULL;
char *rpc_userpass = NULL;
char *rpc_user, *rpc_pass;
@@ -224,8 +223,7 @@ char* lp_id;
static void workio_cmd_free(struct workio_cmd *wc);
// array mapping thread to cpu
static uint8_t thread_affinity_map[ max_cpus ];
static int *thread_affinity_map;
// display affinity mask graphically
static void format_affinity_mask( char *mask_str, uint64_t mask )
@@ -432,20 +430,18 @@ static bool work_decode( const json_t *val, struct work *work )
if ( unlikely( !algo_gate.work_decode( work ) ) )
return false;
if ( !allow_mininginfo )
net_diff = algo_gate.calc_network_diff( work );
else
net_diff = hash_to_diff( work->target );
work->targetdiff = net_diff;
stratum_diff = last_targetdiff = work->targetdiff;
// Many of these values aren't used when solo mining.
net_diff =
work->targetdiff =
stratum_diff =
last_targetdiff = hash_to_diff( work->target );
work->sharediff = 0;
algo_gate.decode_extra_data( work, &net_blocks );
return true;
}
// good alternative for wallet mining, difficulty and net hashrate
// Only used for net_hashrate with GBT/getwork, data is from previous block.
static const char *info_req =
"{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n";
@@ -471,17 +467,14 @@ static bool get_mininginfo( CURL *curl, struct work *work )
// "networkhashps": 56475980
if ( res )
{
// net_diff is a global that is set from the work hash target by
// both getwork and GBT. Don't overwrite it, define a local to override
// the global.
double net_diff = 0.;
double difficulty = 0.;
json_t *key = json_object_get( res, "difficulty" );
if ( key )
{
if ( json_is_object( key ) )
key = json_object_get( key, "proof-of-work" );
if ( json_is_real( key ) )
net_diff = json_real_value( key );
difficulty = json_real_value( key );
}
key = json_object_get( res, "networkhashps" );
@@ -498,12 +491,13 @@ static bool get_mininginfo( CURL *curl, struct work *work )
net_blocks = json_integer_value( key );
if ( opt_debug )
applog(LOG_INFO,"Mining info: diff %.5g, net_hashrate %f, height %d",
net_diff, net_hashrate, net_blocks );
applog( LOG_INFO,"getmininginfo: difficulty %.5g, networkhashps %.5g, blocks %d", difficulty, net_hashrate, net_blocks );
if ( !work->height )
{
// complete missing data from getwork
if ( opt_debug )
applog( LOG_DEBUG, "work height set by getmininginfo" );
work->height = (uint32_t) net_blocks + 1;
if ( work->height > g_work.height )
restart_threads();
@@ -535,9 +529,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
json_t *tmp, *txa;
bool rc = false;
int i, n;
// Segwit BEGIN
bool segwit = false;
tmp = json_object_get( val, "rules" );
if ( tmp && json_is_array( tmp ) )
{
@@ -555,8 +548,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
}
}
}
// Segwit END
tmp = json_object_get( val, "mutable" );
if ( tmp && json_is_array( tmp ) )
{
@@ -638,7 +630,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
goto out;
}
}
/* find count and size of transactions */
txa = json_object_get(val, "transactions" );
if ( !txa || !json_is_array( txa ) )
@@ -713,12 +705,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
cbtx[41] = cbtx_size - 42; /* scriptsig length */
le32enc( (uint32_t *)( cbtx+cbtx_size ), 0xffffffff ); /* sequence */
cbtx_size += 4;
// Segwit BEGIN
//cbtx[cbtx_size++] = 1; /* out-counter */
cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
// Segwit END
cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
le32enc( (uint32_t *)( cbtx+cbtx_size) , (uint32_t)cbvalue ); /* value */
le32enc( (uint32_t *)( cbtx+cbtx_size+4 ), cbvalue >> 32 );
cbtx_size += 8;
@@ -726,7 +713,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
memcpy( cbtx+cbtx_size, pk_script, pk_script_size );
cbtx_size += (int) pk_script_size;
// Segwit BEGIN
if ( segwit )
{
unsigned char (*wtree)[32] = calloc(tx_count + 2, 32);
@@ -761,12 +747,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
for ( i = 0; i < n; i++ )
sha256d( wtree[i], wtree[2*i], 64 );
}
memset( wtree[1], 0, 32 ); /* witness reserved value = 0 */
memset( wtree[1], 0, 32 ); // witness reserved value = 0
sha256d( cbtx+cbtx_size, wtree[0], 64 );
cbtx_size += 32;
free( wtree );
}
// Segwit END
le32enc( (uint32_t *)( cbtx+cbtx_size ), 0 ); /* lock time */
cbtx_size += 4;
@@ -785,10 +770,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
xsig_len += n;
}
else
{
applog( LOG_WARNING,
"Signature does not fit in coinbase, skipping" );
}
}
tmp = json_object_get( val, "coinbaseaux" );
if ( tmp && json_is_object( tmp ) )
@@ -815,8 +798,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
if ( xsig_len )
{
unsigned char *ssig_end = cbtx + 42 + cbtx[41];
int push_len = cbtx[41] + xsig_len < 76 ? 1 :
cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
int push_len = cbtx[41] + xsig_len < 76
? 1 : cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
n = xsig_len + push_len;
memmove( ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41] );
cbtx[41] += n;
@@ -843,7 +826,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
const char *tx_hex = json_string_value( json_object_get( tmp, "data" ) );
const int tx_size = tx_hex ? (int) ( strlen( tx_hex ) / 2 ) : 0;
// Segwit BEGIN
if ( segwit )
{
const char *txid = json_string_value( json_object_get( tmp, "txid" ) );
@@ -856,8 +838,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
}
else
{
// Segwit END
unsigned char *tx = (uchar*) malloc( tx_size );
if ( !tx_hex || !hex2bin( tx, tx_hex, tx_size ) )
{
@@ -867,10 +847,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
}
sha256d( merkle_tree[1 + i], tx, tx_size );
free( tx );
// Segwit BEGIN
}
// Segwit END
if ( !submit_coinbase )
strcat( work->txs, tx_hex );
@@ -888,6 +865,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
sha256d( merkle_tree[i], merkle_tree[2*i], 64 );
}
work->tx_count = tx_count;
/* assemble block header */
algo_gate.build_block_header( work, swab32( version ),
(uint32_t*) prevhash, (uint32_t*) merkle_tree,
@@ -900,21 +879,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
goto out;
}
// See git issue https://github.com/JayDDee/cpuminer-opt/issues/379
#if defined(__AVX2__)
if ( opt_debug )
{
if ( (uint64_t)target % 32 )
applog( LOG_ERR, "Misaligned target %p", target );
if ( (uint64_t)(work->target) % 32 )
applog( LOG_ERR, "Misaligned work->target %p", work->target );
}
#endif
for ( i = 0; i < 8; i++ )
work->target[7 - i] = be32dec( target + i );
// reverse the bytes in target
casti_m128i( work->target, 0 ) = mm128_bswap_128( casti_m128i( target, 1 ) );
casti_m128i( work->target, 1 ) = mm128_bswap_128( casti_m128i( target, 0 ) );
net_diff = work->targetdiff = hash_to_diff( work->target );
tmp = json_object_get( val, "workid" );
if ( tmp )
{
@@ -1090,12 +1059,11 @@ void report_summary_log( bool force )
timeval_subtract( &et, &now, &start_time );
timeval_subtract( &uptime, &total_hashes_time, &session_start );
double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6;
double share_time = (double)et.tv_sec + (double)et.tv_usec * 1e-6;
double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. );
double target_diff = exp32 * last_targetdiff;
double shrate = safe_div( target_diff * (double)(accepts),
share_time, 0. );
// global_hashrate = ghrate;
double sess_hrate = safe_div( exp32 * norm_diff_sum,
(double)uptime.tv_sec, 0. );
double submit_rate = safe_div( (double)submits * 60., share_time, 0. );
@@ -1116,7 +1084,7 @@ void report_summary_log( bool force )
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
submit_rate, safe_div( (double)submitted_share_count*60.,
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
( (double)uptime.tv_sec + (double)uptime.tv_usec * 1e-6 ), 0. ) );
applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)",
shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );
@@ -1563,7 +1531,6 @@ const char *getwork_req =
#define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]"
// Segwit BEGIN
#define GBT_RULES "[\"segwit\"]"
static const char *gbt_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
@@ -1572,16 +1539,6 @@ const char *gbt_lp_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES ", \"rules\": " GBT_RULES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
/*
static const char *gbt_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES "}], \"id\":0}\r\n";
const char *gbt_lp_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
*/
// Segwit END
static bool get_upstream_work( CURL *curl, struct work *work )
{
json_t *val;
@@ -1656,49 +1613,49 @@ start:
last_block_height = work->height;
last_targetdiff = net_diff;
applog( LOG_BLUE, "New Block %d, Net Diff %.5g, Ntime %08x",
work->height, net_diff,
applog( LOG_BLUE, "New Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
if ( !opt_quiet )
{
double miner_hr = 0.;
double net_hr = net_hashrate;
double nd = net_diff * exp32;
char net_hr_units[4] = {0};
char miner_hr_units[4] = {0};
char net_ttf[32];
char miner_ttf[32];
pthread_mutex_lock( &stats_lock );
for ( int i = 0; i < opt_n_threads; i++ )
miner_hr += thr_hashrates[i];
global_hashrate = miner_hr;
pthread_mutex_unlock( &stats_lock );
if ( net_hr > 0. )
sprintf_et( net_ttf, nd / net_hr );
else
sprintf( net_ttf, "NA" );
if ( miner_hr > 0. )
sprintf_et( miner_ttf, nd / miner_hr );
else
sprintf( miner_ttf, "NA" );
scale_hash_for_display ( &miner_hr, miner_hr_units );
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO,
"Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
miner_hr, miner_hr_units, miner_ttf, net_hr,
net_hr_units, net_ttf );
}
} // work->height > last_block_height
}
else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
applog( LOG_BLUE, "New Work: Block %d, Net Diff %.5g, Ntime %08x",
work->height, net_diff,
work->data[ algo_gate.ntime_index ] );
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
if ( !opt_quiet )
{
double miner_hr = 0.;
double net_hr = net_hashrate;
double nd = net_diff * exp32;
char net_hr_units[4] = {0};
char miner_hr_units[4] = {0};
char net_ttf[32];
char miner_ttf[32];
pthread_mutex_lock( &stats_lock );
for ( int i = 0; i < opt_n_threads; i++ )
miner_hr += thr_hashrates[i];
global_hashrate = miner_hr;
pthread_mutex_unlock( &stats_lock );
if ( net_hr > 0. )
sprintf_et( net_ttf, nd / net_hr );
else
sprintf( net_ttf, "NA" );
if ( miner_hr > 0. )
sprintf_et( miner_ttf, nd / miner_hr );
else
sprintf( miner_ttf, "NA" );
scale_hash_for_display ( &miner_hr, miner_hr_units );
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO,
"Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
miner_hr, miner_hr_units, miner_ttf, net_hr,
net_hr_units, net_ttf );
}
} // rc
return rc;
@@ -1724,20 +1681,19 @@ static void workio_cmd_free(struct workio_cmd *wc)
static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
{
struct work *ret_work;
struct work *work_heap;
int failures = 0;
ret_work = (struct work*) _mm_malloc( sizeof(*ret_work), 32 );
if ( !ret_work ) return false;
memset( ret_work, 0, sizeof(*ret_work) );
work_heap = calloc( 1, sizeof(struct work) );
if ( !work_heap ) return false;
/* obtain new work from bitcoin via JSON-RPC */
while ( !get_upstream_work( curl, ret_work ) )
while ( !get_upstream_work( curl, work_heap ) )
{
if ( unlikely( ( opt_retries >= 0 ) && ( ++failures > opt_retries ) ) )
{
applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" );
free( ret_work );
free( work_heap );
return false;
}
@@ -1748,8 +1704,8 @@ static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
}
/* send work to requesting thread */
if ( !tq_push(wc->thr->q, ret_work ) )
free( ret_work );
if ( !tq_push(wc->thr->q, work_heap ) )
free( work_heap );
return true;
}
@@ -1825,7 +1781,7 @@ static void *workio_thread(void *userdata)
static bool get_work(struct thr_info *thr, struct work *work)
{
struct workio_cmd *wc;
struct work *work_heap;
struct work *work_heap;
if unlikely( opt_benchmark )
{
@@ -1850,17 +1806,16 @@ static bool get_work(struct thr_info *thr, struct work *work)
wc->thr = thr;
/* send work request to workio thread */
if (!tq_push(thr_info[work_thr_id].q, wc))
{
{
workio_cmd_free(wc);
return false;
}
/* wait for response, a unit of work */
work_heap = (struct work*) tq_pop(thr->q, NULL);
if (!work_heap)
return false;
/* copy returned work into storage provided by caller */
memcpy(work, work_heap, sizeof(*work));
free(work_heap);
if ( !work_heap ) return false;
/* copy returned work into storage provided by caller */
memcpy( work, work_heap, sizeof(*work) );
free( work_heap );
return true;
}
@@ -1910,9 +1865,9 @@ static void update_submit_stats( struct work *work, const void *hash )
bool submit_solution( struct work *work, const void *hash,
struct thr_info *thr )
{
// Job went stale during hashing of a valid share.
if ( !opt_quiet && work_restart[ thr->id ].restart )
applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
// Job went stale during hashing of a valid share.
// if ( !opt_quiet && work_restart[ thr->id ].restart )
// applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
work->sharediff = hash_to_diff( hash );
if ( likely( submit_work( thr, work ) ) )
@@ -1930,32 +1885,34 @@ bool submit_solution( struct work *work, const void *hash,
if ( !opt_quiet )
{
if ( have_stratum )
{
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s",
submitted_share_count, work->sharediff, work->height,
work->job_id );
if ( opt_debug && opt_extranonce )
{
unsigned char *xnonce2str = abin2hex( work->xnonce2,
work->xnonce2_len );
applog( LOG_INFO, "Xnonce2 %s", xnonce2str );
free( xnonce2str );
}
}
else
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x",
submitted_share_count, work->sharediff, work->height,
work->data[ algo_gate.ntime_index ] );
}
if ( opt_debug )
{
uint32_t* h = (uint32_t*)hash;
uint32_t* t = (uint32_t*)work->target;
uint32_t* d = (uint32_t*)work->data;
if ( opt_debug )
{
uint32_t* h = (uint32_t*)hash;
uint32_t* t = (uint32_t*)work->target;
uint32_t* d = (uint32_t*)work->data;
unsigned char *xnonce2str = abin2hex( work->xnonce2,
work->xnonce2_len );
applog(LOG_INFO,"Thread %d, Nonce %08x, Xnonce2 %s", thr->id,
work->data[ algo_gate.nonce_index ], xnonce2str );
free( xnonce2str );
applog(LOG_INFO,"Data[0:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
applog(LOG_INFO," : %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19]);
applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]);
applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0]);
applog( LOG_INFO, "Data[ 0: 9]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
applog( LOG_INFO, "Data[10:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19] );
applog( LOG_INFO, "Hash[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0] );
applog( LOG_INFO, "Targ[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0] );
}
}
return true;
}
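// sharediff rates the share by its own hash rather than by the job target:
// read as a 256 bit number, a smaller hash means a proportionally higher
// difficulty, with a hash at the conventional difficulty-1 target scoring 1.0.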
@@ -2031,33 +1988,6 @@ void set_work_data_big_endian( struct work *work )
be32enc( work->data + i, work->data[i] );
}
// calculate net diff from nbits.
double std_calc_network_diff( struct work* work )
{
uint32_t nbits = work->data[ algo_gate.nbits_index ];
uint32_t shift = nbits & 0xff;
uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
/*
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
int nbits_index = algo_gate.nbits_index;
uint32_t nbits = have_longpoll ? work->data[ nbits_index]
: swab32( work->data[ nbits_index ] );
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
*/
int m;
long double d = (long double)0x0000ffff / (long double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
for ( m = 29; m < shift; m++ )
d /= 256.0;
if ( opt_debug_diff )
applog(LOG_DEBUG, "net diff: %8f -> shift %u, bits %08x", (double)d, shift, bits);
return (double)d;
}
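// For reference, a sketch of the compact-bits conversion the deleted function
// performed; the consolidated nbits_to_diff() declared in miner.h is assumed
// to keep the same semantics.
static double nbits_to_diff_sketch( uint32_t nbits )
{
   uint32_t shift = nbits & 0xff;                    // compact exponent
   uint32_t bits  = bswap_32( nbits ) & 0x00ffffff;  // compact mantissa
   long double d = (long double)0x0000ffff / (long double)bits;
   for ( int m = shift; m < 29; m++ )   d *= 256.0;
   for ( int m = 29; m < shift; m++ )   d /= 256.0;
   return (double)d;
}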
void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr )
{
@@ -2081,17 +2011,6 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
++(*nonceptr);
}
bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && !work->data[0] && !opt_benchmark )
{
sleep(1);
return false;
}
return true;
}
static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
{
bool new_job;
@@ -2108,7 +2027,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size );
memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size );
algo_gate.build_extraheader( g_work, sctx );
net_diff = algo_gate.calc_network_diff( g_work );
net_diff = nbits_to_diff( g_work->data[ algo_gate.nbits_index ] );
algo_gate.set_work_data_endian( g_work );
g_work->height = sctx->block_height;
g_work->targetdiff = sctx->job.diff
@@ -2137,15 +2056,18 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
pthread_mutex_unlock( &stats_lock );
if ( stratum_diff != sctx->job.diff )
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s",
sctx->job.diff, sctx->block_height, g_work->job_id );
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Tx %d, Job %s",
sctx->job.diff, sctx->block_height,
sctx->job.merkle_count, g_work->job_id );
else if ( last_block_height != sctx->block_height )
applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s",
sctx->block_height, net_diff, g_work->job_id );
applog( LOG_BLUE, "New Block %d, Tx %d, Netdiff %.5g, Job %s",
sctx->block_height, sctx->job.merkle_count,
net_diff, g_work->job_id );
else if ( g_work->job_id && new_job )
applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
sctx->block_height, net_diff, g_work->job_id );
else if ( opt_debug )
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Netdiff %.5g, Job %s",
sctx->block_height, sctx->job.merkle_count,
net_diff, g_work->job_id );
else if ( !opt_quiet )
{
unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
g_work->xnonce2_len );
@@ -2158,8 +2080,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
if ( ( stratum_diff != sctx->job.diff )
|| ( last_block_height != sctx->block_height ) )
{
static bool multipool = false;
if ( stratum.block_height < last_block_height ) multipool = true;
if ( unlikely( !session_first_block ) )
session_first_block = stratum.block_height;
last_block_height = stratum.block_height;
@@ -2167,58 +2087,47 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
last_targetdiff = g_work->targetdiff;
if ( lowest_share < last_targetdiff )
lowest_share = 9e99;
}
if ( !opt_quiet )
{
applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
net_diff, stratum_diff, g_work->targetdiff );
if ( !opt_quiet )
{
applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
net_diff, stratum_diff, g_work->targetdiff );
if ( likely( hr > 0. ) )
{
double nd = net_diff * exp32;
char hr_units[4] = {0};
char block_ttf[32];
char share_ttf[32];
if ( likely( hr > 0. ) )
{
double nd = net_diff * exp32;
char hr_units[4] = {0};
char block_ttf[32];
char share_ttf[32];
static bool multipool = false;
if ( stratum.block_height < last_block_height ) multipool = true;
sprintf_et( block_ttf, nd / hr );
sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
scale_hash_for_display ( &hr, hr_units );
applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
hr, hr_units, block_ttf, share_ttf );
sprintf_et( block_ttf, nd / hr );
sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
scale_hash_for_display ( &hr, hr_units );
applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
hr, hr_units, block_ttf, share_ttf );
if ( !multipool && last_block_height > session_first_block )
{
struct timeval now, et;
gettimeofday( &now, NULL );
timeval_subtract( &et, &now, &session_start );
uint64_t net_ttf =
( last_block_height - session_first_block ) == 0 ? 0
: et.tv_sec / ( last_block_height - session_first_block );
if ( net_diff > 0. && net_ttf )
{
double net_hr = nd / net_ttf;
char net_hr_units[4] = {0};
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
net_hr, net_hr_units );
}
}
} // hr > 0
} // !quiet
} // new diff/block
/*
if ( new_job && !( opt_quiet || stratum_errors ) )
{
int mismatch = submitted_share_count - ( accepted_share_count
+ stale_share_count
+ rejected_share_count );
if ( mismatch )
applog( LOG_INFO,
CL_LBL "%d Submitted share pending, maybe stale" CL_N,
submitted_share_count );
}
*/
if ( !multipool && last_block_height > session_first_block )
{
struct timeval now, et;
gettimeofday( &now, NULL );
timeval_subtract( &et, &now, &session_start );
uint64_t net_ttf = safe_div( et.tv_sec,
last_block_height - session_first_block, 0 );
if ( net_diff > 0. && net_ttf )
{
double net_hr = safe_div( nd, net_ttf, 0. );
char net_hr_units[4] = {0};
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
net_hr, net_hr_units );
}
}
} // hr > 0
} // !quiet
}
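// Worked example of the estimate above (numbers assumed): 10 blocks seen in
// a 6000 s session gives net_ttf = 600 s; at net_diff 50000,
// nd = 50000 * 2^32 ~= 2.15e14 hashes per block, so the estimated network
// rate is nd / 600 ~= 3.6e11 h/s, displayed as roughly 358 Gh/s.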
static void *miner_thread( void *userdata )
@@ -2356,9 +2265,6 @@ static void *miner_thread( void *userdata )
} // do_this_thread
algo_gate.resync_threads( thr_id, &work );
if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
continue;
// conditional mining
if ( unlikely( !wanna_mine( thr_id ) ) )
{
@@ -3738,7 +3644,6 @@ int main(int argc, char *argv[])
if ( opt_time_limit )
time_limit_stop = (unsigned int)time(NULL) + opt_time_limit;
// need to register to get algo optimizations for cpu capabilities
// but that causes registration logs before cpu capabilities is output.
// Would need to split register function into 2 parts. First part sets algo
@@ -3867,24 +3772,29 @@ int main(int argc, char *argv[])
#endif
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( !opt_quiet )
applog( LOG_INFO, "Found %d CPUs in %d groups", num_cpus, num_cpugroups );
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#endif
if ( opt_affinity && num_cpus > max_cpus )
const int map_size = opt_n_threads < num_cpus ? num_cpus : opt_n_threads;
thread_affinity_map = malloc( map_size * (sizeof (int)) );
if ( !thread_affinity_map )
{
applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
max_cpus );
applog( LOG_ERR, "CPU Affinity disabled, memory allocation failed" );
opt_affinity = 0ULL;
}
}
if ( opt_affinity )
{
for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
int active_cpus = 0; // total CPUs available using rolling affinity mask
for ( int thr = 0, cpu = 0; thr < map_size; thr++, cpu++ )
{
while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;
thread_affinity_map[ thr ] = cpu % num_cpus;
if ( cpu < num_cpus ) active_cpus++;
}
if ( opt_n_threads > active_cpus )
applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
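// Worked example of the rolling mask (values assumed): with num_cpus = 4,
// 4 threads and --cpu-affinity 0x5 (bits 0 and 2 set), threads 0 and 1 map
// to CPUs 0 and 2. For threads 2 and 3 the cpu counter runs off the top of
// the 64 bit mask, wraps via cpu & 63, and the modulo maps them back to
// CPUs 0 and 2. Only bits 0 and 2 fall below num_cpus, so active_cpus = 2
// and the warning above fires for the 4 threads.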
if ( !opt_quiet )
{
char affinity_mask[64];

miner.h

@@ -24,6 +24,11 @@
#endif /* _MSC_VER */
// prevent questions from ARM users who don't read the requirements.
#if !defined(__x86_64__)
#error "CPU architecture not supported. Consult the requirements for supported CPUs."
#endif
#include <stdbool.h>
#include <inttypes.h>
#include <sys/time.h>
@@ -91,6 +96,19 @@ enum {
LOG_PINK = 0x14 };
#endif
#define WORK_ALIGNMENT 64
// Used with dynamically allocated memory to guarantee data alignment for
// large vectors. The physical block must be allocated with alignment extra
// bytes. free() should use the physical pointer returned by malloc(), not
// the aligned pointer. All other code should use the logical, aligned
// pointer returned by this function.
static inline void *align_ptr( const void *ptr, const uint64_t alignment )
{
const uint64_t mask = alignment - 1;
return (void*)( ( ((const uint64_t)ptr) + mask ) & (~mask) );
}
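// Usage sketch under the rules above (buffer size assumed, <stdlib.h> for
// malloc/free): allocate with headroom, work through the aligned pointer,
// free the physical pointer.
static inline uint64_t *alloc_aligned_example( void **raw )
{
   *raw = malloc( 1024 + WORK_ALIGNMENT );  // extend physical size by alignment
   return *raw ? (uint64_t*) align_ptr( *raw, WORK_ALIGNMENT ) : NULL;
   // the caller eventually calls free( *raw ), never free() on the aligned ptr
}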
extern bool is_power_of_2( int n );
static inline bool is_windows(void)
@@ -317,7 +335,7 @@ extern void cbin2hex(char *out, const char *in, size_t len);
void bin2hex( char *s, const unsigned char *p, size_t len );
char *abin2hex( const unsigned char *p, size_t len );
char *bebin2hex( const unsigned char *p, size_t len );
bool hex2bin( unsigned char *p, const char *hexstr, size_t len );
bool hex2bin( unsigned char *p, const char *hexstr, const size_t len );
bool jobj_binary( const json_t *obj, const char *key, void *buf,
size_t buflen );
int varint_encode( unsigned char *p, uint64_t n );
@@ -333,10 +351,7 @@ extern void memrev(unsigned char *p, size_t len);
// number of hashes.
//
// https://en.bitcoin.it/wiki/Difficulty
//
// hash = diff * 2**32
//
// diff_to_hash = 2**32 = 0x100000000 = 4294967296 = exp32;
#define EXP16 65536.
#define EXP32 4294967296.
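// Sanity check of the relationship above: difficulty 1 averages
// EXP32 ~= 4.29e9 hashes per block, so net_diff 43.281 (the old sample in
// cpu-miner.c) is 43.281 * 2^32 ~= 1.86e11 expected hashes, or roughly
// 186,000 s at 1 Mh/s, the same nd / hashrate arithmetic the TTF logs use.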
@@ -350,8 +365,9 @@ extern const long double exp160; // 2**160
bool fulltest( const uint32_t *hash, const uint32_t *target );
bool valid_hash( const void*, const void* );
double hash_to_diff( const void* );
extern double hash_to_diff( const void* );
extern void diff_to_hash( uint32_t*, const double );
extern double nbits_to_diff( uint32_t );
double hash_target_ratio( uint32_t* hash, uint32_t* target );
void work_set_target_ratio( struct work* work, const void *hash );
@@ -399,13 +415,14 @@ struct work
double stratum_diff;
int height;
char *txs;
char *workid;
int tx_count;
char *workid;
char *job_id;
size_t xnonce2_len;
unsigned char *xnonce2;
bool sapling;
bool stale;
} __attribute__ ((aligned (64)));
} __attribute__ ((aligned (WORK_ALIGNMENT)));
struct stratum_job
{
@@ -416,7 +433,8 @@ struct stratum_job
unsigned char *coinbase;
unsigned char *xnonce2;
int merkle_count;
unsigned char **merkle;
int merkle_buf_size;
unsigned char **merkle;
unsigned char version[4];
unsigned char nbits[4];
unsigned char ntime[4];
@@ -540,7 +558,6 @@ enum algos {
ALGO_BMW,
ALGO_BMW512,
ALGO_C11,
ALGO_DECRED,
ALGO_DEEP,
ALGO_DMD_GR,
ALGO_GROESTL,
@@ -572,9 +589,11 @@ enum algos {
ALGO_QUBIT,
ALGO_SCRYPT,
ALGO_SHA256D,
ALGO_SHA256DT,
ALGO_SHA256Q,
ALGO_SHA256T,
ALGO_SHA3D,
ALGO_SHA512256D,
ALGO_SHAVITE3,
ALGO_SKEIN,
ALGO_SKEIN2,
@@ -634,7 +653,6 @@ static const char* const algo_names[] = {
"bmw",
"bmw512",
"c11",
"decred",
"deep",
"dmd-gr",
"groestl",
@@ -666,9 +684,11 @@ static const char* const algo_names[] = {
"qubit",
"scrypt",
"sha256d",
"sha256dt",
"sha256q",
"sha256t",
"sha3d",
"sha512256d",
"shavite3",
"skein",
"skein2",
@@ -795,7 +815,6 @@ Options:\n\
bmw BMW 256\n\
bmw512 BMW 512\n\
c11 Chaincoin\n\
decred Blake256r14dcr\n\
deep Deepcoin (DCN)\n\
dmd-gr Diamond\n\
groestl Groestl coin\n\
@@ -829,9 +848,11 @@ Options:\n\
scrypt:N scrypt(N, 1, 1)\n\
scryptn2 scrypt(1048576, 1,1)\n\
sha256d Double SHA-256\n\
sha256dt Modified sha256d (Novo)\n\
sha256q Quad SHA-256, Pyrite (PYE)\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
sha3d Double Keccak256 (BSHA3)\n\
sha512256d Double SHA-512 (Radiant)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\

File diff suppressed because it is too large.


@@ -93,10 +93,15 @@ static inline uint32_t u32_mov128_32( const __m128i a )
return n;
}
// Equivalent of set1, broadcast integer to all elements.
#define m128_const_i128( i ) mm128_mov64_128( i )
#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
// Emulate broadcast & insert instructions not available in SSE2
#define mm128_bcast_i64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define mm128_bcast_i32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
#define m128_const_i128( i ) mm128_mov64_128( i )
// deprecated
#define m128_const1_64 mm128_bcast_i64
#define m128_const1_32 mm128_bcast_i32
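// Usage sketch: the emulated broadcasts behave like _mm_set1_epi64x /
// _mm_set1_epi32, built from a move plus a shuffle (values assumed).
static inline __m128i mm128_bcast_example( void )
{
   __m128i a = mm128_bcast_i64( 0x0123456789abcdef );  // both 64 bit lanes
   __m128i b = mm128_bcast_i32( 0x9e3779b9 );          // all four 32 bit lanes
   return _mm_xor_si128( a, b );
}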
#if defined(__SSE4_1__)
@@ -104,7 +109,7 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#define m128_const_64( hi, lo ) \
_mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 )
#else // No insert in SSE2
#else
#define m128_const_64 _mm_set_epi64x
@@ -114,12 +119,10 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
#define m128_one_64 _mm_shuffle_epi32( mm128_mov64_128( 1 ), 0x44 )
#define m128_one_32 _mm_shuffle_epi32( mm128_mov32_128( 1 ), 0x00 )
#define m128_one_16 _mm_shuffle_epi32( \
mm128_mov32_128( 0x00010001 ), 0x00 )
#define m128_one_8 _mm_shuffle_epi32( \
mm128_mov32_128( 0x01010101 ), 0x00 )
#define m128_one_64 mm128_bcast_i64( 1 )
#define m128_one_32 mm128_bcast_i32( 1 )
#define m128_one_16 mm128_bcast_i32( 0x00010001 )
#define m128_one_8 mm128_bcast_i32( 0x01010101 )
// ASM avoids the need to initialize the return variable, silencing the
// compiler warning. The macro hides the function parentheses so the call
// reads like an identifier.
@@ -149,7 +152,7 @@ static inline __m128i mm128_neg1_fn()
// sizing. It's unique.
//
// It can:
// - zero 32 bit elements of a 128 bit vector.
// - zero any number of 32 bit elements of a 128 bit vector.
// - extract any 32 bit element from one 128 bit vector and insert the
// data to any 32 bit element of another 128 bit vector, or the same vector.
// - do both simultaneously.
@@ -162,14 +165,21 @@ static inline __m128i mm128_neg1_fn()
// c[5:4] destination element selector
// c[7:6] source element selector
// Convert type and abbreviate name: e"x"tract "i"nsert "m"ask
// Convert type and abbreviate name: eXtract Insert Mask = XIM
#define mm128_xim_32( v1, v2, c ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) )
// Some examples of simple operations:
/* Another way to do it with individual arguments.
#define mm128_xim_32( v1, i1, v2, i2, mask ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), \
(mask) | ((i1)<<4) | ((i2)<<6) ) )
*/
// Insert 32 bit integer into v at element c and return modified v.
// Examples of simple operations using xim:
// Insert 32 bit integer into v at element c and return updated v.
static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
const int c )
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
@@ -178,13 +188,12 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
static inline uint32_t mm128_extract_32( const __m128i v, const int c )
{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
// Zero 32 bit elements when bit in mask is set.
static inline __m128i mm128_mask_32( const __m128i v, const int m )
{ return mm128_xim_32( v, v, m ); }
// Move element i2 of v2 to element i1 of v1. For reference and convenience,
// it's faster to precalculate the index.
#define mm128_shuflmov_32( v1, i1, v2, i2 ) \
// Move element i2 of v2 to element i1 of v1 and return updated v1.
#define mm128_mov32_32( v1, i1, v2, i2 ) \
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
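// Worked example: move element 2 of v2 into element 1 of v1. The macro
// builds c = (1<<4) | (2<<6) = 0x90, so _mm_insert_ps reads source element
// c[7:6] = 2 and writes destination element c[5:4] = 1 (values assumed).
static inline __m128i mm128_mov32_32_example( void )
{
   const __m128i v1 = _mm_set_epi32( 3, 2, 1, 0 );
   const __m128i v2 = _mm_set_epi32( 7, 6, 5, 4 );
   return mm128_mov32_32( v1, 1, v2, 2 );   // ( 3, 2, 6, 0 ), element 1 is now 6
}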
#endif // SSE4_1
@@ -280,7 +289,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm_movmask_64( v ) \
@@ -385,6 +394,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
// Cross lane shuffles
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from v1, and the high half from v2.
@@ -396,12 +406,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );
//
// Rotate vector elements across all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
@@ -414,13 +423,11 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#endif
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
// (unlikely but faster), or when SSSE3 is not available (slower).
// Rotate 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) \
@@ -438,6 +445,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif
// Rotate 32 bit lanes
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_swap32_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -445,8 +454,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) \
@@ -461,6 +470,10 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#if defined(__SSSE3__)
#define mm128_bswap_128( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0001020304050607, \
0x08090a0b0c0d0e0f ) )
#define mm128_bswap_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
@@ -522,6 +535,9 @@ static inline __m128i mm128_bswap_16( __m128i v )
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
#define mm128_bswap_128( v ) \
mm128_swap_64( mm128_bswap_64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
{
d[0] = mm128_bswap_64( s[0] );
@@ -556,67 +572,23 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v1 = _mm_xor_si128( v1, v2 );
// alignr for 32 & 64 bit elements is only available with AVX512 but
// emulated here. Shift argument is not needed, it's always 1.
// Behaviour is otherwise consistent with Intel alignr intrinsics.
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
#define mm128_alignr_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
#define mm128_alignr_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#else
#define mm128_alignr_64( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) )
#define mm128_alignr_64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, 16-(c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
#define mm128_alignr_32( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 4 ) )
#define mm128_alignr_32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, 16-(c)*4 ), _mm_srli_si128( lo, (c)*4 ) )
#endif
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// vrol & vror are deprecated and do not exist for larger vectors.
// Their only use is by lyra2 blake2b when AVX2 is not available and is
// grandfathered.
#if defined(__SSSE3__)
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
v1 = t; \
} while(0)
#else // SSE2
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
_mm_slli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
_mm_srli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#endif // SSE4.1 else SSE2
#endif // __SSE2__
#endif // SIMD_128_H__


@@ -15,6 +15,8 @@
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary.
// Full width byte shuffle is available with AVX512VL using the mask version
// with a full mask (-1).
// Instructions that can move data across 128 bit lane boundary incur a
// performance penalty over those that can't.
// Some usage of index vectors may be encoded as if full vector shuffles are
@@ -66,31 +68,33 @@ typedef union
#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
#define mm256_bcast_m128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
#define mm256_bcast_i128( i ) mm256_bcast_m128( mm128_mov64_128( i ) )
#define mm256_bcast_i64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define mm256_bcast_i32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define mm256_bcast_i16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define mm256_bcast_i8( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
// Equivalent of set, move 64 bit integer constants to respective 64 bit
// elements.
static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
{
union { __m256i m256i;
uint64_t u64[4]; } v;
union { __m256i m256i; uint64_t u64[4]; } v;
v.u64[0] = i0; v.u64[1] = i1; v.u64[2] = i2; v.u64[3] = i3;
return v.m256i;
}
// Equivalent of set1.
// 128 bit vector argument
#define m256_const1_128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
// 64 bit integer argument zero extended to 128 bits.
#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) )
#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define m256_const1_8 ( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
// Deprecated
#define m256_const1_128 mm256_bcast_m128
#define m256_const1_i128 mm256_bcast_i128
#define m256_const1_64 mm256_bcast_i64
#define m256_const1_32 mm256_bcast_i32
#define m256_const2_64( i1, i0 ) \
m256_const1_128( m128_const_64( i1, i0 ) )
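// Usage sketch: m256_const2_64 fills both 128 bit lanes with the same
// { i1, i0 } pair, which is how the shortened bswap control vectors in the
// hunks below are built.
static inline __m256i m256_const2_64_example( void )
{  // 64 bit byte-swap pattern repeated in both 128 bit lanes
   return m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 );
}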
@@ -99,13 +103,13 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 m256_const1_i128( 1 )
#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) )
#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) )
#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) )
#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 mm256_bcast_i128( 1 )
#define m256_one_64 mm256_bcast_i64( 1 )
#define m256_one_32 mm256_bcast_i32( 1 )
#define m256_one_16 mm256_bcast_i16( 1 )
#define m256_one_8 mm256_bcast_i8 ( 1 )
static inline __m256i mm256_neg1_fn()
{
@@ -116,8 +120,8 @@ static inline __m256i mm256_neg1_fn()
#define m256_neg1 mm256_neg1_fn()
// Consistent naming for similar operations.
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
//
// Memory functions
@@ -239,7 +243,7 @@ static inline __m256i mm256_not( const __m256i v )
// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Returns 4 or 8 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm256_movmask_64( v ) \
@@ -353,18 +357,22 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
//
// Cross lane shuffles
//
// Rotate elements across all lanes.
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
/* Not used
// Rotate 256 bit vector by one 32 bit element.
#if defined(__AVX512VL__)
@@ -387,6 +395,7 @@ static inline __m256i mm256_shufll_32( const __m256i v )
0x0000000200000001, 0x0000000000000007 ) )
#endif
*/
//
// Rotate elements within each 128 bit lane of 256 bit vector.
@@ -410,20 +419,17 @@ static inline __m256i mm256_shufll_32( const __m256i v )
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit
// rotations for multiples of 8 bits. Uses faster ror/rol instructions when
// AVX512 is available.
// 64 bit lanes
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
#else
#define mm256_shuflr64_24( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403, \
_mm256_shuffle_epi8( v, m256_const2_64( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#endif
@@ -431,21 +437,21 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
#define mm256_shuflr64_16( v ) _mm256_ror_epi64( v, 16 )
#else
#define mm256_shuflr64_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302, \
_mm256_shuffle_epi8( v, m256_const2_64( \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#endif
// 32 bit lanes
#if defined(__AVX512VL__)
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
#else
#define mm256_swap32_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302, \
_mm256_shuffle_epi8( v, m256_const2_64( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#endif
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16
#if defined(__AVX512VL__)
#define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 )
@@ -456,35 +462,24 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#endif
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
// needed for a shuffle that crosses 128 bit lanes. BSWAP doesn't therefore the
// AVX2 version will work here. The bswap control vector is coded to work
// with both versions, bit 4 is ignored in AVX2.
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
#define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
#define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1e1f1c1d1a1b1819, 0x1617141512131011, \
0x0e0f0c0d0a0b0809, 0x0607040502030001, ) )
m256_const2_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )
// Source and destination are pointers, may point to same memory.
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes
#define mm256_block_bswap_64( d, s ) do \
{ \
__m256i ctl = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
__m256i ctl = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -498,8 +493,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes
#define mm256_block_bswap_32( d, s ) do \
{ \
__m256i ctl = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
__m256i ctl = m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \


@@ -113,7 +113,17 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_concat_256( hi, lo ) \
_mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )
#define m512_const_128( v3, v2, v1, v0 ) \
// Work in progress.
// Modified naming scheme to align more closely with opcode mnemonics:
// m512_const1 becomes mm512_bcast_m[n] or mm512_bcast_i[n], short for
// broadcast; i indicates an integer arg, m a vector arg. Set1 intrinsics
// should generally be used for integer data.
// mm512_const should only be used with immediate integer arguments; for
// anything else use the _mm512_set intrinsics instead.
// mm512_set, mm512_set[n] macros may be defined when no intrinsic exists
// for either the arg size or arg count.
#define mm512_set_128( v3, v2, v1, v0 ) \
mm512_concat_256( mm256_concat_128( v3, v2 ), \
mm256_concat_128( v1, v0 ) )
@@ -133,29 +143,35 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
return v.m512i;
}
// Broadcast with vector argument is generally more efficient except for
// integer immediate constants or when data was most recently referenced as
// integer and is still available in an integer register.
/* not used
// Equivalent of set1, broadcast lo element to all elements.
static inline __m512i m512_const1_256( const __m256i v )
{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }
*/
#define m512_const1_128( v ) \
mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
// Integer input argument up to 64 bits
#define m512_const1_i128( i ) \
mm512_perm_128( _mm512_castsi128_si512( mm128_mov64_128( i ) ), 0 )
#define mm512_bcast_m128( v ) mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
// Low 64 bits only, high 64 bits are zeroed.
#define mm512_bcast_i128( i ) mm512_bcast_m128( mm128_mov64_128( i ) )
#define mm512_bcast_i64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
#define mm512_bcast_i32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
#define mm512_bcast_i16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define mm512_bcast_i8( i ) _mm512_broadcastb_epi8( mm128_mov32_128( i ) )
//#define m512_const1_256( v ) _mm512_broadcast_i64x4( v )
//#define m512_const1_128( v ) _mm512_broadcast_i64x2( v )
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
// const1 is deprecated, use bcast instead
#define m512_const1_128 mm512_bcast_m128
#define m512_const1_i128 mm512_bcast_i128
#define m512_const1_64 mm512_bcast_i64
#define m512_const1_32 mm512_bcast_i32
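// Sketch of the conventions above (example values assumed):
static inline void mm512_bcast_naming_example( void )
{
   __m128i k = m128_const_64( 0x0001020304050607, 0x08090a0b0c0d0e0f );
   __m512i a = mm512_bcast_m128( k );                  // vector arg: _m suffix
   __m512i b = mm512_bcast_i64( 0x0123456789abcdef );  // integer arg: _i suffix
   __m512i c = _mm512_set1_epi64( 42 );  // set1 preferred for plain integer data
   (void)a; (void)b; (void)c;
}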
#define m512_const2_128( v1, v0 ) \
m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) )
_mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 )
#define m512_const2_64( i1, i0 ) \
m512_const1_128( m128_const_64( i1, i0 ) )
mm512_bcast_m128( m128_const_64( i1, i0 ) )
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
@@ -179,11 +195,11 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
#define m512_zero _mm512_setzero_si512()
#define m512_one_512 mm512_mov64_512( 1 )
#define m512_one_256 _mm512_inserti64x4( m512_one_512, m256_one_256, 1 )
#define m512_one_128 m512_const1_i128( 1 )
#define m512_one_64 m512_const1_64( 1 )
#define m512_one_32 m512_const1_32( 1 )
#define m512_one_16 m512_const1_16( 1 )
#define m512_one_8 m512_const1_8( 1 )
#define m512_one_128 mm512_bcast_i128( (__uint128_t)1 )
#define m512_one_64 mm512_bcast_i64( (uint64_t)1 )
#define m512_one_32 mm512_bcast_i32( (uint32_t)1 )
#define m512_one_16 mm512_bcast_i16( (uint16_t)1 )
#define m512_one_8 mm512_bcast_i8( (uint8_t)1 )
// use asm to avoid compiler warning for uninitialized local
static inline __m512i mm512_neg1_fn()
@@ -193,8 +209,6 @@ static inline __m512i mm512_neg1_fn()
return a;
}
#define m512_neg1 mm512_neg1_fn() // 1 clock
//#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) // 5 clocks
//#define m512_neg1 _mm512_movm_epi64( 0xff ) // 2 clocks
//
// Basic operations without SIMD equivalent
@@ -343,10 +357,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// 8 lanes of 64 bytes each
#define mm512_block_bswap_64( d, s ) do \
{ \
__m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
const __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -360,10 +374,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// 16 lanes of 32 bytes each
#define mm512_block_bswap_32( d, s ) do \
{ \
__m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
const __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
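// A standalone sketch of the core idea behind both block macros: VPSHUFB
// with a byte-reversal pattern swaps the endianness of every 64 bit lane
// in one instruction (illustrative helper, not part of this file):
static inline __m512i bswap64_512_sketch( const __m512i v )
{
   const __m512i ctl = _mm512_set_epi64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
                                         0x28292a2b2c2d2e2f, 0x2021222324252627,
                                         0x18191a1b1c1d1e1f, 0x1011121314151617,
                                         0x08090a0b0c0d0e0f, 0x0001020304050607 );
   return _mm512_shuffle_epi8( v, ctl );
}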
@@ -412,16 +426,16 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x0000001F001E001D, 0x001C001B001A0019, \
0X0018001700160015, 0X0014001300120011, \
0X0010000F000E000D, 0X000C000B000A0009, \
0X0008000700060005, 0X0004000300020001 ), v )
0x0018001700160015, 0x0014001300120011, \
0x0010000F000E000D, 0x000C000B000A0009, \
0x0008000700060005, 0x0004000300020001 ), v )
#define mm512_shufll_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001E001D001C001B, 0x001A001900180017, \
0X0016001500140013, 0X001200110010000F, \
0X000E000D000C000B, 0X000A000900080007, \
0X0006000500040003, 0X000200010000001F ), v )
0x0016001500140013, 0x001200110010000F, \
0x000E000D000C000B, 0x000A000900080007, \
0x0006000500040003, 0x000200010000001F ), v )
#define mm512_shuflr_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
@@ -449,7 +463,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
/*
/* Not used
// Rotate 256 bit lanes by one 32 bit element
#define mm512_shuflr256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
@@ -496,6 +510,18 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
// Rotate 128 bit lanes right by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
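// Consistency sketch: a 4 byte rotate within each 128 bit lane matches the
// one-element 32 bit rotate above, i.e.
//    mm512_shuflr128_8( v, 4 ) == mm512_shuflr128_32( v )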
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source arg.
@@ -507,26 +533,11 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
_mm512_castsi512_ps( v2 ), c ) );
// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
// can be done with ror & rol. Defined only for convenience and consistency
// with AVX2 & SSE2 macros.
// 64 bit lanes
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
@@ -537,12 +548,14 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 )
#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 )
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
// 32 bit lanes
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
#endif // AVX512
#endif // SIMD_512_H__
@@ -55,6 +55,13 @@
typedef __int128 int128_t;
typedef unsigned __int128 uint128_t;
typedef union
{
uint128_t u128;
uint64_t u64[2];
uint32_t u32[4];
} __attribute__ ((aligned (16))) u128_ovly;
// Extracting the low bits is a trivial cast.
// These specialized functions are optimized while providing a
// consistent interface.
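// Usage sketch (illustrative helper name): split a 128 bit value into its
// 64 bit halves through the overlay instead of shift & cast gymnastics.
static inline uint64_t u128_hi64_sketch( const uint128_t x )
{
   u128_ovly v;
   v.u128 = x;
   return v.u64[1];   // high half on little-endian targets
}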
util.c
@@ -44,28 +44,22 @@
#include <libgen.h>
#endif
//#include "miner.h"
#include "elist.h"
#include "algo-gate-api.h"
#include "algo/sha/sha256d.h"
//extern pthread_mutex_t stats_lock;
struct data_buffer {
void *buf;
size_t len;
};
struct upload_buffer {
const void *buf;
size_t len;
size_t pos;
};
struct header_info {
char *lp_path;
char *reason;
char *stratum_url;
size_t content_length;
};
struct data_buffer {
void *buf;
size_t len;
size_t allocated;
struct header_info *headers;
};
struct tq_ent {
@@ -127,7 +121,6 @@ void applog2( int prio, const char *fmt, ... )
int len;
// struct tm tm;
// time_t now = time(NULL);
// localtime_r(&now, &tm);
switch ( prio )
@@ -395,67 +388,53 @@ static void databuf_free(struct data_buffer *db)
static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb,
void *user_data)
{
struct data_buffer *db = (struct data_buffer *) user_data;
struct data_buffer *db = user_data;
size_t len = size * nmemb;
size_t oldlen, newlen;
size_t newalloc, reqalloc;
void *newmem;
static const unsigned char zero = 0;
static const size_t max_realloc_increase = 8 * 1024 * 1024;
static const size_t initial_alloc = 16 * 1024;
oldlen = db->len;
newlen = oldlen + len;
/* minimum required allocation size */
reqalloc = db->len + len + 1;
newmem = realloc(db->buf, newlen + 1);
if (!newmem)
return 0;
if (reqalloc > db->allocated) {
if (db->len > 0) {
newalloc = db->allocated * 2;
} else {
if (db->headers->content_length > 0)
newalloc = db->headers->content_length + 1;
else
newalloc = initial_alloc;
}
db->buf = newmem;
db->len = newlen;
memcpy((uchar*) db->buf + oldlen, ptr, len);
memcpy((uchar*) db->buf + newlen, &zero, 1); /* null terminate */
if (db->headers->content_length == 0) {
/* limit the maximum buffer increase */
if (newalloc - db->allocated > max_realloc_increase)
newalloc = db->allocated + max_realloc_increase;
}
/* ensure we have a big enough allocation */
if (reqalloc > newalloc)
newalloc = reqalloc;
newmem = realloc(db->buf, newalloc);
if (!newmem)
return 0;
db->buf = newmem;
db->allocated = newalloc;
}
memcpy(db->buf + db->len, ptr, len); /* append new data */
memcpy(db->buf + db->len + len, &zero, 1); /* null terminate */
db->len += len;
return len;
}
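// The growth policy in isolation, as a simplified sketch assuming no
// Content-Length hint (helper name is illustrative): double the current
// allocation, cap each step at 8 MiB, and never return less than required.
static size_t next_alloc_sketch( size_t allocated, size_t required )
{
   static const size_t max_step = 8 * 1024 * 1024;
   static const size_t initial  = 16 * 1024;
   size_t n = allocated ? allocated * 2 : initial;
   if ( n - allocated > max_step ) n = allocated + max_step;
   if ( n < required ) n = required;
   return n;
}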
static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb,
void *user_data)
{
struct upload_buffer *ub = (struct upload_buffer *) user_data;
size_t len = size * nmemb;
if (len > ub->len - ub->pos)
len = ub->len - ub->pos;
if (len) {
memcpy(ptr, ((uchar*)ub->buf) + ub->pos, len);
ub->pos += len;
}
return len;
}
#if LIBCURL_VERSION_NUM >= 0x071200
static int seek_data_cb(void *user_data, curl_off_t offset, int origin)
{
struct upload_buffer *ub = (struct upload_buffer *) user_data;
switch (origin) {
case SEEK_SET:
ub->pos = (size_t) offset;
break;
case SEEK_CUR:
ub->pos += (size_t) offset;
break;
case SEEK_END:
ub->pos = ub->len + (size_t) offset;
break;
default:
return 1; /* CURL_SEEKFUNC_FAIL */
}
return 0; /* CURL_SEEKFUNC_OK */
}
#endif
static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
{
struct header_info *hi = (struct header_info *) user_data;
@@ -505,6 +484,9 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
val = NULL;
}
if (!strcasecmp("Content-Length", key))
hi->content_length = strtoul(val, NULL, 10);
out:
free(key);
free(val);
@@ -564,48 +546,38 @@ json_t *json_rpc_call(CURL *curl, const char *url,
int rc;
long http_rc;
struct data_buffer all_data = {0};
struct upload_buffer upload_data;
char *json_buf;
json_error_t err;
struct curl_slist *headers = NULL;
char len_hdr[64];
char curl_err_str[CURL_ERROR_SIZE] = { 0 };
long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30;
struct header_info hi = {0};
all_data.headers = &hi;
/* it is assumed that 'curl' is freshly [re]initialized at this pt */
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
if (opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_URL, url);
if (opt_cert)
curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
//
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
if (opt_cert) curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
curl_easy_setopt(curl, CURLOPT_ENCODING, "");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb);
curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data);
#if LIBCURL_VERSION_NUM >= 0x071200
curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb);
curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data);
#endif
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
if (opt_redirect)
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
if (opt_redirect) curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb);
curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi);
if (opt_proxy) {
if (opt_proxy)
{
curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy);
curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type);
}
if (userpass) {
if (userpass)
{
curl_easy_setopt(curl, CURLOPT_USERPWD, userpass);
curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
}
@@ -613,23 +585,16 @@ json_t *json_rpc_call(CURL *curl, const char *url,
if (flags & JSON_RPC_LONGPOLL)
curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
#endif
curl_easy_setopt(curl, CURLOPT_POST, 1);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, rpc_req);
if (opt_protocol)
applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req);
upload_data.buf = rpc_req;
upload_data.len = strlen(rpc_req);
upload_data.pos = 0;
sprintf(len_hdr, "Content-Length: %lu",
(unsigned long) upload_data.len);
headers = curl_slist_append(headers, "Content-Type: application/json");
headers = curl_slist_append(headers, len_hdr);
headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll reject-reason");
//headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/
//headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/
//headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr
//headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
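// A minimal sketch of the simplified upload path (hypothetical request
// string, error handling omitted): with CURLOPT_POSTFIELDS libcurl reads
// the body directly from the string and computes Content-Length itself,
// so the old READFUNCTION/SEEKFUNCTION plumbing and the manual length
// header are no longer needed.
static CURLcode post_rpc_sketch( CURL *curl, const char *url )
{
   const char *req = "{\"method\": \"getwork\", \"params\": [], \"id\": 0}";
   curl_easy_setopt( curl, CURLOPT_URL, url );
   curl_easy_setopt( curl, CURLOPT_POST, 1L );
   curl_easy_setopt( curl, CURLOPT_POSTFIELDS, req );
   return curl_easy_perform( curl );
}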
@@ -786,18 +751,26 @@ err_out:
return cfg;
}
// Segwit BEGIN
void memrev(unsigned char *p, size_t len)
{
unsigned char c, *q;
for (q = p + len - 1; p < q; p++, q--) {
c = *p;
*p = *q;
*q = c;
if ( len == 32 )
{
__m128i *pv = (__m128i*)p;
__m128i t = mm128_bswap_128( pv[0] );
pv[0] = mm128_bswap_128( pv[1] );
pv[1] = t;
}
else
{
unsigned char c, *q;
for (q = p + len - 1; p < q; p++, q--)
{
c = *p;
*p = *q;
*q = c;
}
}
}
// Segwit END
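// mm128_bswap_128 is defined elsewhere in this tree; a sketch of the
// assumed behaviour with plain SSSE3 intrinsics, reversing all 16 bytes:
static inline __m128i bswap_128_sketch( const __m128i v )
{
   const __m128i ctl = _mm_set_epi8( 0, 1,  2,  3,  4,  5,  6,  7,
                                     8, 9, 10, 11, 12, 13, 14, 15 );
   return _mm_shuffle_epi8( v, ctl );
}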
void cbin2hex(char *out, const char *in, size_t len)
{
@@ -832,32 +805,42 @@ char *bebin2hex(const unsigned char *p, size_t len)
return s;
}
bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
bool hex2bin( unsigned char *p, const char *hexstr, const size_t len )
{
char hex_byte[3];
char *ep;
if( hexstr == NULL ) return false;
hex_byte[2] = '\0';
while (*hexstr && len) {
if (!hexstr[1]) {
applog(LOG_ERR, "hex2bin str truncated");
return false;
}
hex_byte[0] = hexstr[0];
hex_byte[1] = hexstr[1];
*p = (unsigned char) strtol(hex_byte, &ep, 16);
if (*ep) {
applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte);
return false;
}
p++;
hexstr += 2;
len--;
size_t hexstr_len = strlen( hexstr );
if( ( hexstr_len % 2 ) != 0 )
{
applog( LOG_ERR, "hex2bin string truncated" );
return false;
}
size_t bin_len = hexstr_len / 2;
if ( bin_len > len )
{
applog( LOG_ERR, "hex2bin buffer too small" );
return false;
}
return(!len) ? true : false;
/* return (len == 0 && *hexstr == 0) ? true : false; */
memset( p, 0, len );
size_t i = 0;
while ( i < hexstr_len )
{
char c = hexstr[i];
unsigned char nibble;
if ( c >= '0' && c <= '9' ) nibble = (c - '0');
else if ( c >= 'A' && c <= 'F' ) nibble = ( 10 + (c - 'A') );
else if ( c >= 'a' && c <= 'f' ) nibble = ( 10 + (c - 'a') );
else
{
applog( LOG_ERR, "hex2bin invalid hex" );
return false;
}
p[(i / 2)] |= (nibble << ( (1 - (i % 2) ) * 4) );
i++;
}
return true;
}
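// Usage sketch (hypothetical hex string): the destination is zero-filled
// first, so a short string leaves the trailing bytes zero.
//    unsigned char ntime[4];
//    if ( !hex2bin( ntime, "64a0c2f1", sizeof(ntime) ) )
//       applog( LOG_ERR, "bad ntime" );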
int varint_encode(unsigned char *p, uint64_t n)
@@ -1339,6 +1322,43 @@ inline bool valid_hash( const void *hash, const void *target )
#endif
inline double nbits_to_diff( uint32_t nbits )
{
long double diff;
uint32_t shift = nbits & 0xff;
uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
int shift_off = (int)shift - 29;
   // diff = ( 2**16 - 1 ) / ( 256**shift_off * bits )

   // With uint128, a byte shift is good for 16 <= shift <= 41. However
   // unlikely it is that this range is ever exceeded, check just in case.
if ( shift_off >= -13 && shift_off <= 12 )
{ // fast
if ( shift_off == 0 )
diff = (long double)0xffff / (long double)bits;
else if ( shift_off < 0 ) // shift < 29
diff = (long double)( (uint128_t)0xffff << ( (-shift_off) *8 ) )
/ (long double)bits;
else // ( shift_off > 0 ) // shift > 29
diff = (long double)0xffff
/ (long double)( (uint128_t)bits << ( shift_off*8 ) );
}
else
{ // slow
int m;
      diff = (long double)0xffff / (long double)bits;
for ( m = shift; m < 29; m++ ) diff *= 256.0;
for ( m = 29; m < shift; m++ ) diff /= 256.0;
}
if ( opt_debug )
applog( LOG_INFO, "nbits %08x: shift %u(%d), bits %06x, diff %8g",
nbits, shift, shift_off, bits, (double)diff );
return (double)diff;
}
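// Worked example: the difficulty-1 target, compact form 0x1d00ffff, which
// arrives here byte-reversed as nbits = 0xffff001d:
//    shift = 0x1d = 29, so shift_off = 0
//    bits  = bswap_32( 0xffff001d ) & 0x00ffffff = 0x00ffff
//    diff  = 0xffff / 0xffff = 1.0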
#ifdef WIN32
#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK)
#else
@@ -1507,7 +1527,8 @@ out:
return sret;
}
#if LIBCURL_VERSION_NUM >= 0x071101
#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
//#if LIBCURL_VERSION_NUM >= 0x071101
static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose,
struct curl_sockaddr *addr)
{
@@ -1575,7 +1596,8 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
#if LIBCURL_VERSION_NUM >= 0x070f06
curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
#endif
#if LIBCURL_VERSION_NUM >= 0x071101
#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
//#if LIBCURL_VERSION_NUM >= 0x071101
curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb);
curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock);
#endif
@@ -1589,7 +1611,10 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
return false;
}
#if LIBCURL_VERSION_NUM < 0x071101
#if LIBCURL_VERSION_NUM >= 0x072d00
curl_easy_getinfo(curl, CURLINFO_ACTIVESOCKET, &sctx->sock);
#elif LIBCURL_VERSION_NUM < 0x071101
//#if LIBCURL_VERSION_NUM < 0x071101
/* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */
curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock);
#endif
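// For reference (assumed from libcurl release history): 0x071101 is 7.17.1,
// where CURLOPT_OPENSOCKETFUNCTION appeared; 0x072d00 is 7.45.0, where
// CURLINFO_ACTIVESOCKET made the socket-grabbing workaround unnecessary.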
@@ -1885,7 +1910,8 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
// find 0xffff tag
p = (uint8_t*) sctx->job.coinbase + 32;
m = p + 128;
m = p + sctx->job.coinbase_size - 32 - 2;
// m = p + 128;
while (*p != 0xff && p < m) p++;
while (*p == 0xff && p < m) p++;
if (*(p-1) == 0xff && *(p-2) == 0xff) {
@@ -1992,23 +2018,41 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
}
}
if ( merkle_count )
merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
for ( i = 0; i < merkle_count; i++ )
{
const char *s = json_string_value( json_array_get( merkle_arr, i ) );
if ( !s || strlen(s) != 64 )
{
while ( i-- ) free( merkle[i] );
free( merkle );
applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
goto out;
}
merkle[i] = (uchar*) malloc( 32 );
hex2bin( merkle[i], s, 32 );
}
pthread_mutex_lock( &sctx->work_lock );
pthread_mutex_lock( &sctx->work_lock );
if ( merkle_count )
{
if ( merkle_count > sctx->job.merkle_buf_size )
{
for ( i = 0; i < sctx->job.merkle_count; i++ )
free( sctx->job.merkle[i] );
free( sctx->job.merkle );
merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
for ( i = 0; i < merkle_count; i++ )
merkle[i] = (uchar*) malloc( 32 );
sctx->job.merkle_buf_size = merkle_count;
sctx->job.merkle = merkle;
}
for ( i = 0; i < merkle_count; i++ )
{
const char *s = json_string_value( json_array_get( merkle_arr, i ) );
if ( !s || strlen(s) != 64 )
{
               for ( int j = sctx->job.merkle_buf_size; j > 0; j-- )
                  free( sctx->job.merkle[j-1] );
free( sctx->job.merkle );
sctx->job.merkle_count =
sctx->job.merkle_buf_size = 0;
pthread_mutex_unlock( &sctx->work_lock );
applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
goto out;
}
hex2bin( sctx->job.merkle[i], s, 32 );
}
}
sctx->job.merkle_count = merkle_count;
coinb1_size = strlen( coinb1 ) / 2;
coinb2_size = strlen( coinb2 ) / 2;
@@ -2041,18 +2085,9 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
}
sctx->block_height = getblocheight( sctx );
for ( i = 0; i < sctx->job.merkle_count; i++ )
free( sctx->job.merkle[i] );
free( sctx->job.merkle );
sctx->job.merkle = merkle;
sctx->job.merkle_count = merkle_count;
hex2bin( sctx->job.nbits, nbits, 4 );
hex2bin( sctx->job.ntime, stime, 4 );
sctx->job.clean = clean;
sctx->job.diff = sctx->next_diff;
pthread_mutex_unlock( &sctx->work_lock );
@@ -129,7 +129,7 @@ make clean || echo clean
# Native with CPU groups enabled
make clean || echo clean
rm -f config.status
CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe