v3.22.2

v3.22.1
v3.22.0
2025-09-17 23:44:27 +00:00 · 2023-04-06 13:38:37 -04:00 · 2023-03-24 18:29:42 -04:00 · 2023-03-21 17:12:51 -04:00 · 2023-03-15 12:27:04 -04:00 · 2023-03-13 14:54:38 -04:00
56 changed files with 4713 additions and 6422 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -55,9 +55,6 @@ cpuminer_SOURCES = \
  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
  algo/blake/blakecoin-4way.c \
-  algo/blake/decred-gate.c \
-  algo/blake/decred.c \
-  algo/blake/decred-4way.c \
  algo/blake/pentablake-gate.c \
  algo/blake/pentablake-4way.c \
  algo/blake/pentablake.c \
@@ -178,6 +175,8 @@ cpuminer_SOURCES = \
  algo/sha/sha256t.c \
  algo/sha/sha256q-4way.c \
  algo/sha/sha256q.c \
+  algo/sha/sha512256d-4way.c \
+  algo/sha/sha256dt.c \
  algo/shabal/sph_shabal.c \
  algo/shabal/shabal-hash-4way.c \
  algo/shavite/sph_shavite.c \
@@ -264,6 +263,8 @@ cpuminer_SOURCES = \
  algo/x16/x16r-4way.c \
  algo/x16/x16rv2.c \
  algo/x16/x16rv2-4way.c \
+  algo/x16/x16rt.c \
+  algo/x16/x16rt-4way.c \
  algo/x16/hex.c \
  algo/x16/x21s-4way.c \
  algo/x16/x21s.c \
--- a/57
+++ b/57
@@ -65,7 +65,58 @@ If not what makes it happen or not happen?
 Change Log
 ----------

-v3.22.3
+v3.22.2
+
+Added sha512256d & sha256dt algos.
+Fixed intermittent invalid shares lyra2v2 AVX512.
+Removed application limits on the number of CPUs and threads; HW and OS limits still apply.
+Added a log warning if more threads are defined than active CPUs in affinity mask.
+Improved merkle tree memory management for stratum.
+Added transaction count to New Work log.
+Other small improvements.
+
+v3.22.1
+
+#393 fixed segfault in GBT, regression from v3.22.0.
+More efficient 32 bit data interleaving.
+
+v3.22.0
+
+Stratum: faster netdiff calculation.
+Merged a few updates from Pooler/cpuminer:
+   Use CURLOPT_POSTFIELDS in json_rpc_call,
+   Use CURLINFO_ACTIVESOCKET when supported,
+   JSONRPC speedup,
+   Speed up hex2bin function.  
+Small log improvements, notably more frequent hash rate reports.
+Removed decred algo.
+
+v3.21.5
+
+All issues with v3.21.3 & v3.21.4 should be resolved.
+Changes since v3.21.2:
+#392 #379 #389 Fixed misaligned address segfault solo mining.
+#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
+#392 Fixed conditional mining.
+#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
+     Windows binaries no longer support CPU groups,
+     Windows binaries support CPUs with up to 64 threads.
+Small optimizations to serialized vectoring.
+
+v3.21.4 CANCELLED
+
+Reapply selected changes from v3.21.3.
+#392 #379 #389 Fixed misaligned address segfault solo mining.
+#392 Fixed conditional mining.
+#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
+     Windows binaries no longer support CPU groups,
+     Windows binaries support CPUs with up to 64 threads.
+
+v3.21.3.1 UNRELEASED
+
+Revert to 3.21.2
+
+v3.21.3 CANCELLED

 #392 #379 #389 Fixed misaligned address segfault solo mining.
 #392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
@@ -74,10 +125,10 @@ v3.22.3
     Windows binaries no longer support CPU groups,
     Windows binaries support CPUs with up to 64 threads.
 Midstate prehash is now centralized, done only once instead of by every thread
-for selected algos. 
+for selected algos.
 Small optimizations to serialized vectoring.

-v3.22.2
+v3.21.2 

 Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
 Fixed a couple of compiler warnings with gcc-12.
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.16.5 -*- Autoconf -*-
+# generated automatically by aclocal 1.16.1 -*- Autoconf -*-

-# Copyright (C) 1996-2021 Free Software Foundation, Inc.
+# Copyright (C) 1996-2018 Free Software Foundation, Inc.

 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -14,13 +14,13 @@
 m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
 m4_ifndef([AC_AUTOCONF_VERSION],
  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
-m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.71],,
-[m4_warning([this file was generated for autoconf 2.71.
+m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
+[m4_warning([this file was generated for autoconf 2.69.
 You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
 To do so, use the procedure documented by the package, typically 'autoreconf'.])])

-# Copyright (C) 2002-2021 Free Software Foundation, Inc.
+# Copyright (C) 2002-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
 [am__api_version='1.16'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.16.5], [],
+m4_if([$1], [1.16.1], [],
      [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])

@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.16.5])dnl
+[AM_AUTOMAKE_VERSION([1.16.1])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])

 # Figure out how to run the assembler.                      -*- Autoconf -*-

-# Copyright (C) 2001-2021 Free Software Foundation, Inc.
+# Copyright (C) 2001-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl

 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-

-# Copyright (C) 2001-2021 Free Software Foundation, Inc.
+# Copyright (C) 2001-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`

 # AM_CONDITIONAL                                            -*- Autoconf -*-

-# Copyright (C) 1997-2021 Free Software Foundation, Inc.
+# Copyright (C) 1997-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
 Usually this means the macro was only invoked conditionally.]])
 fi])])

-# Copyright (C) 1999-2021 Free Software Foundation, Inc.
+# Copyright (C) 1999-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -352,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl

 # Generate code to set up dependency tracking.              -*- Autoconf -*-

-# Copyright (C) 1999-2021 Free Software Foundation, Inc.
+# Copyright (C) 1999-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -391,9 +391,7 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
  done
  if test $am_rc -ne 0; then
    AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
-    for automatic dependency tracking.  If GNU make was not used, consider
-    re-running the configure script with MAKE="gmake" (or whatever is
-    necessary).  You can also try re-running configure with the
+    for automatic dependency tracking.  Try re-running configure with the
    '--disable-dependency-tracking' option to at least be able to build
    the package (albeit without support for automatic dependency tracking).])
  fi
@@ -420,7 +418,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],

 # Do all the work for Automake.                             -*- Autoconf -*-

-# Copyright (C) 1996-2021 Free Software Foundation, Inc.
+# Copyright (C) 1996-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -448,10 +446,6 @@ m4_defn([AC_PROG_CC])
 # release and drop the old call support.
 AC_DEFUN([AM_INIT_AUTOMAKE],
 [AC_PREREQ([2.65])dnl
-m4_ifdef([_$0_ALREADY_INIT],
-  [m4_fatal([$0 expanded multiple times
-]m4_defn([_$0_ALREADY_INIT]))],
-  [m4_define([_$0_ALREADY_INIT], m4_expansion_stack)])dnl
 dnl Autoconf wants to disallow AM_ names.  We explicitly allow
 dnl the ones we care about.
 m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
@@ -488,7 +482,7 @@ m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
 [_AM_SET_OPTIONS([$1])dnl
 dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
 m4_if(
-  m4_ifset([AC_PACKAGE_NAME], [ok]):m4_ifset([AC_PACKAGE_VERSION], [ok]),
+  m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]),
  [ok:ok],,
  [m4_fatal([AC_INIT should be called with package and version arguments])])dnl
 AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
@@ -540,20 +534,6 @@ AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
 		  [m4_define([AC_PROG_OBJCXX],
 			     m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
 ])
-# Variables for tags utilities; see am/tags.am
-if test -z "$CTAGS"; then
-  CTAGS=ctags
-fi
-AC_SUBST([CTAGS])
-if test -z "$ETAGS"; then
-  ETAGS=etags
-fi
-AC_SUBST([ETAGS])
-if test -z "$CSCOPE"; then
-  CSCOPE=cscope
-fi
-AC_SUBST([CSCOPE])
-
 AC_REQUIRE([AM_SILENT_RULES])dnl
 dnl The testsuite driver may need to know about EXEEXT, so add the
 dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen.  This
@@ -635,7 +615,7 @@ for _am_header in $config_headers :; do
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])

-# Copyright (C) 2001-2021 Free Software Foundation, Inc.
+# Copyright (C) 2001-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -656,7 +636,7 @@ if test x"${install_sh+set}" != xset; then
 fi
 AC_SUBST([install_sh])])

-# Copyright (C) 2003-2021 Free Software Foundation, Inc.
+# Copyright (C) 2003-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -678,7 +658,7 @@ AC_SUBST([am__leading_dot])])
 # Add --enable-maintainer-mode option to configure.         -*- Autoconf -*-
 # From Jim Meyering

-# Copyright (C) 1996-2021 Free Software Foundation, Inc.
+# Copyright (C) 1996-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -713,7 +693,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])

 # Check to see how 'make' treats includes.	            -*- Autoconf -*-

-# Copyright (C) 2001-2021 Free Software Foundation, Inc.
+# Copyright (C) 2001-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -756,7 +736,7 @@ AC_SUBST([am__quote])])

 # Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-

-# Copyright (C) 1997-2021 Free Software Foundation, Inc.
+# Copyright (C) 1997-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -777,7 +757,12 @@ AC_DEFUN([AM_MISSING_HAS_RUN],
 [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
 AC_REQUIRE_AUX_FILE([missing])dnl
 if test x"${MISSING+set}" != xset; then
-  MISSING="\${SHELL} '$am_aux_dir/missing'"
+  case $am_aux_dir in
+  *\ * | *\	*)
+    MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
+  *)
+    MISSING="\${SHELL} $am_aux_dir/missing" ;;
+  esac
 fi
 # Use eval to expand $SHELL
 if eval "$MISSING --is-lightweight"; then
@@ -790,7 +775,7 @@ fi

 # Helper functions for option handling.                     -*- Autoconf -*-

-# Copyright (C) 2001-2021 Free Software Foundation, Inc.
+# Copyright (C) 2001-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -819,7 +804,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
 AC_DEFUN([_AM_IF_OPTION],
 [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])

-# Copyright (C) 1999-2021 Free Software Foundation, Inc.
+# Copyright (C) 1999-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -866,7 +851,7 @@ AC_LANG_POP([C])])
 # For backward compatibility.
 AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])

-# Copyright (C) 2001-2021 Free Software Foundation, Inc.
+# Copyright (C) 2001-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -885,7 +870,7 @@ AC_DEFUN([AM_RUN_LOG],

 # Check to make sure that the build environment is sane.    -*- Autoconf -*-

-# Copyright (C) 1996-2021 Free Software Foundation, Inc.
+# Copyright (C) 1996-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -966,7 +951,7 @@ AC_CONFIG_COMMANDS_PRE(
 rm -f conftest.file
 ])

-# Copyright (C) 2009-2021 Free Software Foundation, Inc.
+# Copyright (C) 2009-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1026,7 +1011,7 @@ AC_SUBST([AM_BACKSLASH])dnl
 _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
 ])

-# Copyright (C) 2001-2021 Free Software Foundation, Inc.
+# Copyright (C) 2001-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1054,7 +1039,7 @@ fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])

-# Copyright (C) 2006-2021 Free Software Foundation, Inc.
+# Copyright (C) 2006-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1073,7 +1058,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])

 # Check how to create a tarball.                            -*- Autoconf -*-

-# Copyright (C) 2004-2021 Free Software Foundation, Inc.
+# Copyright (C) 2004-2018 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -253,7 +253,6 @@ void init_algo_gate( algo_gate_t* gate )
   gate->miner_thread_init       = (void*)&return_true;
   gate->scanhash                = (void*)&scanhash_generic;
   gate->hash                    = (void*)&null_hash;
-   gate->prehash                 = (void*)&return_true;
   gate->get_new_work            = (void*)&std_get_new_work;
   gate->work_decode             = (void*)&std_le_work_decode;
   gate->decode_extra_data       = (void*)&do_nothing;
@@ -264,8 +263,6 @@ void init_algo_gate( algo_gate_t* gate )
   gate->build_block_header      = (void*)&std_build_block_header;
   gate->build_extraheader       = (void*)&std_build_extraheader;
   gate->set_work_data_endian    = (void*)&do_nothing;
-   gate->calc_network_diff       = (void*)&std_calc_network_diff;
-   gate->ready_to_mine           = (void*)&std_ready_to_mine;
   gate->resync_threads          = (void*)&do_nothing;
   gate->do_this_thread          = (void*)&return_true;
   gate->longpoll_rpc_call       = (void*)&std_longpoll_rpc_call;
@@ -309,7 +306,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_BLAKECOIN:    rc = register_blakecoin_algo     ( gate ); break;
    case ALGO_BMW512:       rc = register_bmw512_algo        ( gate ); break;
    case ALGO_C11:          rc = register_c11_algo           ( gate ); break;
-    case ALGO_DECRED:       rc = register_decred_algo        ( gate ); break;
    case ALGO_DEEP:         rc = register_deep_algo          ( gate ); break;
    case ALGO_DMD_GR:       rc = register_dmd_gr_algo        ( gate ); break;
    case ALGO_GROESTL:      rc = register_groestl_algo       ( gate ); break;
@@ -341,9 +337,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_QUBIT:        rc = register_qubit_algo         ( gate ); break;
    case ALGO_SCRYPT:       rc = register_scrypt_algo        ( gate ); break;
    case ALGO_SHA256D:      rc = register_sha256d_algo       ( gate ); break;
+    case ALGO_SHA256DT:     rc = register_sha256dt_algo      ( gate ); break;
    case ALGO_SHA256Q:      rc = register_sha256q_algo       ( gate ); break;
    case ALGO_SHA256T:      rc = register_sha256t_algo       ( gate ); break;
    case ALGO_SHA3D:        rc = register_sha3d_algo         ( gate ); break;
+    case ALGO_SHA512256D:   rc = register_sha512256d_algo    ( gate ); break;
    case ALGO_SHAVITE3:     rc = register_shavite_algo       ( gate ); break;
    case ALGO_SKEIN:        rc = register_skein_algo         ( gate ); break;
    case ALGO_SKEIN2:       rc = register_skein2_algo        ( gate ); break;
@@ -428,7 +426,6 @@ const char* const algo_alias_map[][2] =
  { "blake256r8",        "blakecoin"      },
  { "blake256r8vnl",     "vanilla"        },
  { "blake256r14",       "blake"          },
-  { "blake256r14dcr",    "decred"         },
  { "diamond",           "dmd-gr"         },
  { "espers",            "hmq1725"        },
  { "flax",              "c11"            },
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -119,7 +119,7 @@ typedef struct
 // to be registered with the gate. 
 int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );

-int ( *hash )     ( void*, const void*, const int );
+int ( *hash )     ( void*, const void*, int );

 //optional, safe to use default in most cases

@@ -127,9 +127,6 @@ int ( *hash )     ( void*, const void*, const int );
 // other initialization specific to miner threads.
 bool ( *miner_thread_init )     ( int );

-// Perform prehash after receiving new work
-int ( *prehash )                ( struct work* );
-
 // Get thread local copy of blockheader with unique nonce.
 void ( *get_new_work )          ( struct work*, struct work*, int, uint32_t* );

@@ -147,7 +144,7 @@ void ( *gen_merkle_root )       ( char*, struct stratum_ctx* );
 void ( *build_extraheader )     ( struct work*, struct stratum_ctx* );

 void ( *build_block_header )    ( struct work*, uint32_t, uint32_t*,
-	                                uint32_t*, uint32_t, uint32_t,
+	                                uint32_t*,   uint32_t, uint32_t,
                                   unsigned char* );

 // Build mining.submit message
@@ -158,19 +155,13 @@ char* ( *malloc_txs_request )   ( struct work* );
 // Big endian or little endian
 void ( *set_work_data_endian )  ( struct work* );

-double ( *calc_network_diff )   ( struct work* );
-
-// Wait for first work
-bool ( *ready_to_mine )         ( struct work*, struct stratum_ctx*, int );
-
 // Diverge mining threads
 bool ( *do_this_thread )        ( int );

 // After do_this_thread
 void ( *resync_threads )        ( int, struct work* );

-// No longer needed
-json_t* (*longpoll_rpc_call)      ( CURL*, int*, char* );
+json_t* ( *longpoll_rpc_call )  ( CURL*, int*, char* );

 set_t optimizations;
 int  ( *get_work_data_size )     ();
@@ -289,8 +280,6 @@ char* std_malloc_txs_request( struct work *work );
 // Default is do_nothing, little endian is assumed
 void set_work_data_big_endian( struct work *work );

-double std_calc_network_diff( struct work *work );
-
 void std_build_block_header( struct work* g_work, uint32_t version,
 	                          uint32_t *prevhash,  uint32_t *merkle_root,
   	                       uint32_t ntime,      uint32_t nbits,
@@ -300,9 +289,6 @@ void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );

 json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );

-bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
-                        int thr_id );
-
 int std_get_work_data_size();

 // Gate admin functions
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -1,6 +1,5 @@
 #include "blake2s-gate.h"
 #include "blake2s-hash-4way.h"
-//#include "sph-blake2s.h"
 #include <string.h>
 #include <stdint.h>

@@ -8,43 +7,6 @@

 static __thread blake2s_16way_state blake2s_16w_ctx;

-/*
-static blake2s_16way_state blake2s_16w_ctx;
-static uint32_t blake2s_16way_vdata[20*16] __attribute__ ((aligned (64)));
-*/
-/*
-int blake2s_16way_prehash( struct work *work )
-{
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, work->data );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
-   intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-   intrlv_16x32( blake2s_16way_vdata, edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata, 640 );
-   blake2s_16w_ctx.t[0] = 64;
-   return 1;
-}
-*/
-/*
-int blake2s_16way_prehash( struct work *work )
-{
-   mm512_bswap32_intrlv80_16x32( blake2s_16way_vdata, work->data );
-   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
-   blake2s_16way_update( &blake2s_16w_ctx, blake2s_16way_vdata, 64 );
-   return 1;
-}
-*/
-
 void blake2s_16way_hash( void *output, const void *input )
 {
   blake2s_16way_state ctx;
@@ -68,40 +30,10 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   int thr_id = mythr->id;  

-/*   
-//   pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( (__m512i*)vdata +16, (__m512i*)blake2s_16way_vdata +16, 3*4*16 );
-//     casti_m512i( vdata, 16 ) = casti_m512i( blake2s_16way_vdata, 16 );
-//     casti_m512i( vdata, 17 ) = casti_m512i( blake2s_16way_vdata, 17 );
-//     casti_m512i( vdata, 18 ) = casti_m512i( blake2s_16way_vdata, 18 );
-       
-//   pthread_rwlock_unlock( &g_work_lock );
-*/
-/*
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, pdata );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
-   intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-   intrlv_16x32( blake2s_16way_blake2s_16way_vdata, edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata, 640 );
-   blake2s_16w_ctx.t[0] = 64;
-*/
-   
   mm512_bswap32_intrlv80_16x32( vdata, pdata );
   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
   blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );

-
   do {
      *noncev = mm512_bswap_32( _mm512_set_epi32(
 	                  n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
@@ -131,36 +63,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,

 static __thread blake2s_8way_state blake2s_8w_ctx;

-/*
-static blake2s_8way_state blake2s_8w_ctx;
-static uint32_t blake2s_8way_vdata[20*8] __attribute__ ((aligned (32)));
-
-int blake2s_8way_prehash( struct work *work )
-{
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, work->data );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
-
-   for ( int i = 0; i < 8; i++ )
-      casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
-
-   casti_m256i( blake2s_8way_vdata, 16 ) = _mm256_set1_epi32( edata[16] );
-   casti_m256i( blake2s_8way_vdata, 17 ) = _mm256_set1_epi32( edata[17] );
-   casti_m256i( blake2s_8way_vdata, 18 ) = _mm256_set1_epi32( edata[18] );
-
-//   intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-//                                  ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-//   intrlv_8x32( blake2s_8way_vdata, edata, edata, edata, edata,
-//                                    edata, edata, edata, edata, 640 );
-   blake2s_8w_ctx.t[0] = 64;
-}
-*/
-
 void blake2s_8way_hash( void *output, const void *input )
 {
   blake2s_8way_state ctx;
@@ -184,41 +86,10 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   int thr_id = mythr->id; 

-/*   
-//   pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( &vdata[16*8], &blake2s_8way_vdata[16*8], 3*4*8 );
-//   pthread_rwlock_unlock( &g_work_lock );
-*/
-/*
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, pdata );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
-   for ( int i = 0; i < 8; i++ )
-      casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
-
-   casti_m256i( vdata, 16 ) = _mm256_set1_epi32( edata[16] );
-   casti_m256i( vdata, 17 ) = _mm256_set1_epi32( edata[17] );
-   casti_m256i( vdata, 18 ) = _mm256_set1_epi32( edata[18] );
-
-
-//  intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-//                                  ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-//   intrlv_8x32( vdata, edata, edata, edata, edata,
-//                                    edata, edata, edata, edata, 640 );
-
-   blake2s_8w_ctx.t[0] = 64;
-*/
-   
   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
   blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );

-
   do {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
                                                  n+3, n+2, n+1, n ) );
@@ -246,25 +117,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
 #elif defined(BLAKE2S_4WAY)

 static __thread blake2s_4way_state blake2s_4w_ctx;
-/*
-static blake2s_4way_state blake2s_4w_ctx;
-static uint32_t blake2s_4way_vdata[20*4] __attribute__ ((aligned (32)));

-int blake2s_4way_prehash( struct work *work )
-{
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, work->data );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
-   intrlv_4x32( blake2s_4w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-   intrlv_4x32( blake2s_4way_vdata, edata, edata, edata, edata, 640 );
-   blake2s_4w_ctx.t[0] = 64;
-}
-*/
 void blake2s_4way_hash( void *output, const void *input )
 {
   blake2s_4way_state ctx;
@@ -287,15 +140,11 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
   int thr_id = mythr->id; 
-/*
-   pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( vdata, blake2s_4way_vdata, sizeof vdata );
-   pthread_rwlock_unlock( &g_work_lock );
-*/
+
   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
   blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
-   
+
   do {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -5,15 +5,13 @@ bool register_blake2s_algo( algo_gate_t* gate )
 #if defined(BLAKE2S_16WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_16way;
  gate->hash      = (void*)&blake2s_16way_hash;
-//  gate->prehash   = (void*)&blake2s_16way_prehash;
 #elif defined(BLAKE2S_8WAY)
+//#if defined(BLAKE2S_8WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_8way;
  gate->hash      = (void*)&blake2s_8way_hash;
-//  gate->prehash   = (void*)&blake2s_8way_prehash;
 #elif defined(BLAKE2S_4WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_4way;
  gate->hash      = (void*)&blake2s_4way_hash;
-//  gate->prehash   = (void*)&blake2s_4way_prehash;
 #else
  gate->scanhash  = (void*)&scanhash_blake2s;
  gate->hash      = (void*)&blake2s_hash;
--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -23,22 +23,18 @@ bool register_blake2s_algo( algo_gate_t* gate );
 void blake2s_16way_hash( void *state, const void *input );
 int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int blake2s_16way_prehash( struct work * );

 #elif defined (BLAKE2S_8WAY)

 void blake2s_8way_hash( void *state, const void *input );
 int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int blake2s_8way_prehash( struct work * );

 #elif defined (BLAKE2S_4WAY)

 void blake2s_4way_hash( void *state, const void *input );
 int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int blake2s_4way_prehash( struct work * );
-
 #else

 void blake2s_hash( void *state, const void *input );
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -105,8 +105,8 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )

 #define G4W( sigma0, sigma1, a, b, c, d ) \
 do { \
-   const uint8_t s0 = sigma0; \
-   const uint8_t s1 = sigma1; \
+   uint8_t s0 = sigma0; \
+   uint8_t s1 = sigma1; \
   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
   d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
   c = _mm_add_epi32( c, d ); \
@@ -120,7 +120,7 @@ do { \

 #define ROUND4W(r)  \
 do { \
-   const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
+   uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
   G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
   G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
   G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -317,8 +317,8 @@ do { \

 #define G8W( sigma0, sigma1, a, b, c, d) \
 do { \
-   const uint8_t s0 = sigma0; \
-   const uint8_t s1 = sigma1; \
+   uint8_t s0 = sigma0; \
+   uint8_t s1 = sigma1; \
   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
   d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
   c = _mm256_add_epi32( c, d ); \
@@ -331,7 +331,7 @@ do { \

 #define ROUND8W(r)  \
 do { \
-   const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
+   uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
   G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
   G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
   G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -529,8 +529,8 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )

 #define G16W( sigma0, sigma1, a, b, c, d) \
 do { \
-   const uint8_t s0 = sigma0; \
-   const uint8_t s1 = sigma1; \
+   uint8_t s0 = sigma0; \
+   uint8_t s1 = sigma1; \
   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
   d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
   c = _mm512_add_epi32( c, d ); \
@@ -543,7 +543,7 @@ do { \

 #define ROUND16W(r)  \
 do { \
-   const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
+   uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
   G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
   G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
   G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -20,7 +20,6 @@

 #include <stddef.h>
 #include <stdint.h>
-//#include "sph-blake2s.h"

 #if defined(_MSC_VER)
 #include <inttypes.h>
@@ -34,7 +33,7 @@
 #if defined(__cplusplus)
 extern "C" {
 #endif
-/*
+
 enum blake2s_constant
 {
   BLAKE2S_BLOCKBYTES = 64,
@@ -43,13 +42,6 @@ enum blake2s_constant
   BLAKE2S_SALTBYTES  = 8,
   BLAKE2S_PERSONALBYTES = 8
 };
-*/
-
-#define BLAKE2S_BLOCKBYTES  64
-#define BLAKE2S_OUTBYTES    32
-#define BLAKE2S_KEYBYTES    32
-#define BLAKE2S_SALTBYTES   8
-#define BLAKE2S_PERSONALBYTES  8

 #pragma pack(push, 1)
 typedef struct __blake2s_nway_param
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -8,6 +8,8 @@
 #include "sph-blake2s.h"

 static __thread blake2s_state blake2s_ctx;
+//static __thread blake2s_state s_ctx;
+#define MIDLEN 76

 void blake2s_hash( void *output, const void *input )
 {
@@ -17,27 +19,37 @@ void blake2s_hash( void *output, const void *input )
   memcpy( &ctx, &blake2s_ctx, sizeof ctx );
   blake2s_update( &ctx, input+64, 16 );
 
+//	blake2s_init(&ctx, BLAKE2S_OUTBYTES);
+//	blake2s_update(&ctx, input, 80);
 	blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );

 	memcpy(output, hash, 32);
 }
-
+/*
+static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
+{
+	s_ctx.buflen = MIDLEN;
+	memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
+	blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
+	blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
+}
+*/
 int scanhash_blake2s( struct work *work,
 	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;

 	uint32_t _ALIGN(64) hash64[8];
 	uint32_t _ALIGN(64) endiandata[20];
-   int thr_id = mythr->id;  
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];

 	uint32_t n = first_nonce;

-   swab32_array( endiandata, pdata, 20 );
+        swab32_array( endiandata, pdata, 20 );

 	// midstate
 	blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
@@ -46,12 +58,11 @@ int scanhash_blake2s( struct work *work,
 	do {
 		be32enc(&endiandata[19], n);
 		blake2s_hash( hash64, endiandata );
-      if (hash64[7] <= Htarg )
-      if ( fulltest(hash64, ptarget) && !opt_benchmark )
-      {
-         pdata[19] = n;
-         submit_solution( work, hash64, mythr );
-      }
+		if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1;
+			pdata[19] = n;
+			return true;
+		}
 		n++;

 	} while (n < max_nonce && !work_restart[thr_id].restart);
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -1,74 +0,0 @@
-#include "decred-gate.h"
-#include "blake-hash-4way.h"
-#include <string.h>
-#include <stdint.h>
-#include <memory.h>
-#include <unistd.h>
-
-#if defined (DECRED_4WAY)
-
-static __thread blake256_4way_context blake_mid;
-
-void decred_hash_4way( void *state, const void *input )
-{
-     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-//     uint32_t hash0[8] __attribute__ ((aligned (32)));
-//     uint32_t hash1[8] __attribute__ ((aligned (32)));
-//     uint32_t hash2[8] __attribute__ ((aligned (32)));
-//     uint32_t hash3[8] __attribute__ ((aligned (32)));
-     const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
-     int tail_len = 180 - DECRED_MIDSTATE_LEN; 
-     blake256_4way_context ctx __attribute__ ((aligned (64)));
-
-     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
-     blake256_4way_update( &ctx, tail, tail_len );
-     blake256_4way_close( &ctx, vhash );
-     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
-}
-
-int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t vdata[48*4] __attribute__ ((aligned (64)));
-   uint32_t hash[8*4] __attribute__ ((aligned (32)));
-   uint32_t _ALIGN(64) edata[48];
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
-   uint32_t n = first_nonce;
-   const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-   // copy to buffer guaranteed to be aligned.
-   memcpy( edata, pdata, 180 );
-
-   // use the old way until  new way updated for size.
-   mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
-
-   blake256_4way_init( &blake_mid );
-   blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
-
-   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
-   do {
-      * noncep    = n;
-      *(noncep+1) = n+1;
-      *(noncep+2) = n+2;
-      *(noncep+3) = n+3;
-
-      decred_hash_4way( hash, vdata );
-
-      for ( int i = 0; i < 4; i++ )
-      if (  (hash+(i<<3))[7] <= HTarget )
-      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-      {
-          pdata[DECRED_NONCE_INDEX] = n+i;
-          submit_solution( work, hash+(i<<3), mythr );
-      }
-      n += 4;
-  } while ( (n < max_nonce) && !work_restart[thr_id].restart );
-
-  *hashes_done = n - first_nonce + 1;
-  return 0;
-}
-
-#endif
--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -1,171 +0,0 @@
-#include "decred-gate.h"
-#include <unistd.h>
-#include <memory.h>
-#include <string.h>
-
-uint32_t *decred_get_nonceptr( uint32_t *work_data )
-{
-   return &work_data[ DECRED_NONCE_INDEX ];
-}
-
-long double decred_calc_network_diff( struct work* work )
-{
-   // sample for diff 43.281 : 1c05ea29
-   // todo: endian reversed on longpoll could be zr5 specific...
-   uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
-   uint32_t bits = ( nbits & 0xffffff );
-   int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
-   int m;
-   long double d = (long double)0x0000ffff / (long double)bits;
-
-   for ( m = shift; m < 29; m++ )
-       d *= 256.0;
-   for ( m = 29; m < shift; m++ )
-       d /= 256.0;
-   if ( shift == 28 )
-       d *= 256.0; // testnet
-   if ( opt_debug_diff )
-       applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", (double)d,
-                           shift, bits );
-   return net_diff;
-}
-
-void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
-{
-   // some random extradata to make the work unique
-   work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
-   work->height = work->data[32];
-   if (!have_longpoll && work->height > *net_blocks + 1)
-   {
-      char netinfo[64] = { 0 };
-      if ( net_diff > 0. )
-      {
-         if (net_diff != work->targetdiff)
-            sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
-                   work->targetdiff);
-         else
-             sprintf(netinfo, ", diff %.3f", net_diff);
-       }
-       applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
-                       netinfo);
-       *net_blocks = work->height - 1;
-   }
-}
-
-void decred_be_build_stratum_request( char *req, struct work *work,
-                                      struct stratum_ctx *sctx )
-{
-   unsigned char *xnonce2str;
-   uint32_t ntime, nonce;
-   char ntimestr[9], noncestr[9];
-
-   be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
-   be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
-   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
-   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
-   xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
-                                     sctx->xnonce1_size );
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
-   free(xnonce2str);
-}
-
-#if !defined(min)
-#define min(a,b) (a>b ? (b) :(a))
-#endif
-
-void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
-{
-   uchar merkle_root[64] = { 0 };
-   uint32_t extraheader[32] = { 0 };
-   int headersize = 0;
-   uint32_t* extradata = (uint32_t*) sctx->xnonce1;
-   int i;
-
-   // getwork over stratum, getwork merkle + header passed in coinb1
-   memcpy(merkle_root, sctx->job.coinbase, 32);
-   headersize = min((int)sctx->job.coinbase_size - 32,
-                  sizeof(extraheader) );
-   memcpy( extraheader, &sctx->job.coinbase[32], headersize );
-
-   // Assemble block header 
-   memset( g_work->data, 0, sizeof(g_work->data) );
-   g_work->data[0] = le32dec( sctx->job.version );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[1 + i] = swab32(
-                              le32dec( (uint32_t *) sctx->job.prevhash + i ) );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
-
-//   for ( i = 0; i < 8; i++ ) // prevhash
-//      g_work->data[1 + i] = swab32( g_work->data[1 + i] );
-//   for ( i = 0; i < 8; i++ ) // merkle
-//      g_work->data[9 + i] = swab32( g_work->data[9 + i] );
-
-   for ( i = 0; i < headersize/4; i++ ) // header
-      g_work->data[17 + i] = extraheader[i];
-   // extradata
-
-   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
-      g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
-   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
-      g_work->data[i] = 0;
-   g_work->data[37] = (rand()*4) << 8;
-   // block header suffix from coinb2 (stake version)
-   memcpy( &g_work->data[44],
-           &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
-   sctx->block_height = g_work->data[32];
-   //applog_hex(work->data, 180);
-   //applog_hex(&work->data[36], 36);
-}
-
-#undef min
-
-bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
-                           int thr_id )
-{
-   if ( have_stratum && strcmp(stratum->job.job_id, work->job_id)  )
-      // need to regen g_work..
-      return false;
-   if ( have_stratum && !work->data[0] && !opt_benchmark )
-   {
-      sleep(1);
-      return false;
-   }
-   // extradata: prevent duplicates
-   work->data[ DECRED_XNONCE_INDEX     ] += 1;
-   work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
-   return true;
-}
-
-int decred_get_work_data_size() { return DECRED_DATA_SIZE; }
-
-bool register_decred_algo( algo_gate_t* gate )
-{
-#if defined(DECRED_4WAY)
-  four_way_not_tested();
-  gate->scanhash  = (void*)&scanhash_decred_4way;
-  gate->hash      = (void*)&decred_hash_4way;
-#else
-  gate->scanhash  = (void*)&scanhash_decred;
-  gate->hash      = (void*)&decred_hash;
-#endif
-  gate->optimizations = AVX2_OPT;
-//  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
-  gate->decode_extra_data     = (void*)&decred_decode_extradata;
-  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
-  gate->work_decode           = (void*)&std_be_work_decode;
-  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
-  gate->build_extraheader     = (void*)&decred_build_extraheader;
-  gate->ready_to_mine         = (void*)&decred_ready_to_mine;
-  gate->nbits_index           = DECRED_NBITS_INDEX;
-  gate->ntime_index           = DECRED_NTIME_INDEX;
-  gate->nonce_index           = DECRED_NONCE_INDEX;
-  gate->get_work_data_size    = (void*)&decred_get_work_data_size;
-  gate->work_cmp_size         = DECRED_WORK_COMPARE_SIZE;
-  allow_mininginfo            = false;
-  have_gbt                    = false;
-  return true;
-}
-
--- a/algo/blake/decred-gate.h
+++ b/algo/blake/decred-gate.h
@@ -1,36 +0,0 @@
-#ifndef __DECRED_GATE_H__
-#define __DECRED_GATE_H__
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-#define DECRED_NBITS_INDEX 29
-#define DECRED_NTIME_INDEX 34
-#define DECRED_NONCE_INDEX 35
-#define DECRED_XNONCE_INDEX 36
-#define DECRED_DATA_SIZE 192
-#define DECRED_WORK_COMPARE_SIZE 140
-#define DECRED_MIDSTATE_LEN 128
-
-#if defined (__AVX2__) 
-//void blakehash_84way(void *state, const void *input);
-//int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
-//                         uint64_t *hashes_done );
-#endif
-
-#if defined(__SSE4_2__)
-  #define DECRED_4WAY
-#endif
-
-#if defined (DECRED_4WAY)
-void decred_hash_4way(void *state, const void *input);
-int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done, struct thr_info *mythr );
-#endif
-
-void decred_hash( void *state, const void *input );
-int scanhash_decred( struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
-
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -1,282 +0,0 @@
-#include "decred-gate.h"
-
-#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
-
-#include "sph_blake.h"
-
-#include <string.h>
-#include <stdint.h>
-#include <memory.h>
-#include <unistd.h>
-
-/*
-#ifndef min
-#define min(a,b) (a>b ? b : a)
-#endif
-#ifndef max 
-#define max(a,b) (a<b ? b : a)
-#endif
-*/
-/*
-#define DECRED_NBITS_INDEX 29
-#define DECRED_NTIME_INDEX 34
-#define DECRED_NONCE_INDEX 35
-#define DECRED_XNONCE_INDEX 36
-#define DECRED_DATA_SIZE 192
-#define DECRED_WORK_COMPARE_SIZE 140
-*/
-static __thread sph_blake256_context blake_mid;
-static __thread bool ctx_midstate_done = false;
-
-void decred_hash(void *state, const void *input)
-{
-//        #define MIDSTATE_LEN 128
-        sph_blake256_context ctx __attribute__ ((aligned (64)));
-
-        uint8_t *ending = (uint8_t*) input;
-        ending += DECRED_MIDSTATE_LEN;
-
-        if (!ctx_midstate_done) {
-                sph_blake256_init(&blake_mid);
-                sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
-                ctx_midstate_done = true;
-        }
-        memcpy(&ctx, &blake_mid, sizeof(blake_mid));
-
-        sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
-        sph_blake256_close(&ctx, state);
-}
-
-void decred_hash_simple(void *state, const void *input)
-{
-        sph_blake256_context ctx;
-        sph_blake256_init(&ctx);
-        sph_blake256(&ctx, input, 180);
-        sph_blake256_close(&ctx, state);
-}
-
-int scanhash_decred( struct work *work, uint32_t max_nonce,
-               uint64_t *hashes_done, struct thr_info *mythr )
-{
-        uint32_t _ALIGN(64) endiandata[48];
-        uint32_t _ALIGN(64) hash32[8];
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-//        #define DCR_NONCE_OFT32 35
-
-        const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
-        const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
-
-        uint32_t n = first_nonce;
-
-        ctx_midstate_done = false;
-
-#if 1
-        memcpy(endiandata, pdata, 180);
-#else
-        for (int k=0; k < (180/4); k++)
-                be32enc(&endiandata[k], pdata[k]);
-#endif
-
-        do {
-                //be32enc(&endiandata[DCR_NONCE_OFT32], n);
-                endiandata[DECRED_NONCE_INDEX] = n;
-                decred_hash(hash32, endiandata);
-
-                if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
-                {
-                   pdata[DECRED_NONCE_INDEX] = n;
-                   submit_solution( work, hash32, mythr );
-                }
-
-                n++;
-
-        } while (n < max_nonce && !work_restart[thr_id].restart);
-
-        *hashes_done = n - first_nonce + 1;
-        pdata[DECRED_NONCE_INDEX] = n;
-        return 0;
-}
-
-/*
-uint32_t *decred_get_nonceptr( uint32_t *work_data )
-{
-   return &work_data[ DECRED_NONCE_INDEX ];
-}
-
-double decred_calc_network_diff( struct work* work )
-{
-   // sample for diff 43.281 : 1c05ea29
-   // todo: endian reversed on longpoll could be zr5 specific...
-   uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
-   uint32_t bits = ( nbits & 0xffffff );
-   int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
-   int m;
-   double d = (double)0x0000ffff / (double)bits;
-
-   for ( m = shift; m < 29; m++ )
-       d *= 256.0;
-   for ( m = 29; m < shift; m++ )
-       d /= 256.0;
-   if ( shift == 28 )
-       d *= 256.0; // testnet
-   if ( opt_debug_diff )
-       applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
-                           shift, bits );
-   return net_diff;
-}
-
-void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
-{
-   // some random extradata to make the work unique
-   work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
-   work->height = work->data[32];
-   if (!have_longpoll && work->height > *net_blocks + 1)
-   {
-      char netinfo[64] = { 0 };
-      if (net_diff > 0.)
-      {
-         if (net_diff != work->targetdiff)
-	    sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
-                   work->targetdiff);
-	 else
-	     sprintf(netinfo, ", diff %.3f", net_diff);
-       }
-       applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
-                       netinfo);
-       *net_blocks = work->height - 1;
-   }
-}
-
-void decred_be_build_stratum_request( char *req, struct work *work,
-                                      struct stratum_ctx *sctx )
-{
-   unsigned char *xnonce2str;
-   uint32_t ntime, nonce;
-   char ntimestr[9], noncestr[9];
-
-   be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
-   be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
-   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
-   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
-   xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
-                                     sctx->xnonce1_size );
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
-   free(xnonce2str);
-}
-*/
-/*
-// data shared between gen_merkle_root and build_extraheader.
-__thread uint32_t decred_extraheader[32] = { 0 };
-__thread int decred_headersize = 0;
-
-void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
-{
-   // getwork over stratum, getwork merkle + header passed in coinb1
-   memcpy(merkle_root, sctx->job.coinbase, 32);
-   decred_headersize = min((int)sctx->job.coinbase_size - 32, 
-                  sizeof(decred_extraheader) );
-   memcpy( decred_extraheader, &sctx->job.coinbase[32], decred_headersize);
-}
-*/
-
-/*
-#define min(a,b) (a>b ? (b) :(a))
-
-void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
-{
-   uchar merkle_root[64] = { 0 };
-   uint32_t extraheader[32] = { 0 };
-   int headersize = 0;
-   uint32_t* extradata = (uint32_t*) sctx->xnonce1;
-   size_t t;
-   int i;
-
-   // getwork over stratum, getwork merkle + header passed in coinb1
-   memcpy(merkle_root, sctx->job.coinbase, 32);
-   headersize = min((int)sctx->job.coinbase_size - 32,
-                  sizeof(extraheader) );
-   memcpy( extraheader, &sctx->job.coinbase[32], headersize );
-
-   // Increment extranonce2 
-   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
-
-   // Assemble block header 
-   memset( g_work->data, 0, sizeof(g_work->data) );
-   g_work->data[0] = le32dec( sctx->job.version );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[1 + i] = swab32(
-                              le32dec( (uint32_t *) sctx->job.prevhash + i ) );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
-
-//   for ( i = 0; i < 8; i++ ) // prevhash
-//      g_work->data[1 + i] = swab32( g_work->data[1 + i] );
-//   for ( i = 0; i < 8; i++ ) // merkle
-//      g_work->data[9 + i] = swab32( g_work->data[9 + i] );
-
-   for ( i = 0; i < headersize/4; i++ ) // header
-      g_work->data[17 + i] = extraheader[i];
-   // extradata
-
-   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
-      g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
-   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
-      g_work->data[i] = 0;
-   g_work->data[37] = (rand()*4) << 8;
-   // block header suffix from coinb2 (stake version)
-   memcpy( &g_work->data[44],
-           &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
-   sctx->bloc_height = g_work->data[32];
-   //applog_hex(work->data, 180);
-   //applog_hex(&work->data[36], 36);
-}
-
-#undef min
-
-bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
-                           int thr_id )
-{
-   if ( have_stratum && strcmp(stratum->job.job_id, work->job_id)  )
-      // need to regen g_work..
-      return false;
-   if ( have_stratum && !work->data[0] && !opt_benchmark )
-   {
-      sleep(1);
-      return false;
-   }      
-   // extradata: prevent duplicates
-   work->data[ DECRED_XNONCE_INDEX     ] += 1;
-   work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
-   return true;
-}
-
-
-bool register_decred_algo( algo_gate_t* gate )
-{
-  gate->optimizations         = SSE2_OPT;
-  gate->scanhash              = (void*)&scanhash_decred;
-  gate->hash                  = (void*)&decred_hash;
-  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
-  gate->decode_extra_data     = (void*)&decred_decode_extradata;
-  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
-  gate->work_decode           = (void*)&std_be_work_decode;
-  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
-  gate->build_extraheader     = (void*)&decred_build_extraheader;
-  gate->ready_to_mine         = (void*)&decred_ready_to_mine;
-  gate->nbits_index           = DECRED_NBITS_INDEX;
-  gate->ntime_index           = DECRED_NTIME_INDEX;
-  gate->nonce_index           = DECRED_NONCE_INDEX;
-  gate->work_data_size        = DECRED_DATA_SIZE;
-  gate->work_cmp_size         = DECRED_WORK_COMPARE_SIZE; 
-  allow_mininginfo            = false;
-  have_gbt                    = false;
-  return true;
-}
-*/
-
-#endif
--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -17,7 +17,6 @@

 #include "algo/sha/sph_types.h"
 #include "sph-blake2s.h"
-#include "simd-utils.h"

 static const uint32_t blake2s_IV[8] =
 {
@@ -226,71 +225,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
 	v[13] = S->t[1] ^ blake2s_IV[5];
 	v[14] = S->f[0] ^ blake2s_IV[6];
 	v[15] = S->f[1] ^ blake2s_IV[7];
-
-#if 0    
-//#if defined(__SSE2__) // always true
-
-The only application for this is to do a prehash for the blake2s algorithm.
-SSE2 also supports 4 way parallel hashing so that is preferred in most cases.
-Testing has found that using this serial SIMD code for prehash is slower than
-doing a parallel hash. A parallel hash has more instructions and uses more
-data. The serial hash uses fewer instructions and data and only needs to
-interleave the final hash into parallel streams. This has shown negligible
-improvement on other algos, notably blake256 which is almost identical.
-Considering the low frequency of prehash no statistically valid change
-was expected. It was simply better on paper.
-
-Furthermore, simply defining this macro has an additional negative effect on
-blake2s as a whole. There are no references to this macro, blake2s-4way does
-not include it in any header files, it's just another unused macro which should
-have no effect beyond the preprocessor. But just being visible to the compiler
-changes things in a dramatic way.
-
-These 2 things combined reduced the hash rate for blake2s by more than 5% when
-using serial SIMD for the blake2s prehash over 16way parallel prehash.
-16way parallel hashing was used in the high frequency nonce loop in both cases.
-Comsidering the prehash represents 50% of the algorithm and is done once vs
-the high frequency second half that is done mega, maybe giga, times more it's
-hard to imagine that big of an effect in either direction.
-
-#define ROUND( r ) \
-{ \
-   __m128i *V = (__m128i*)v; \
-   const uint8_t *sigma = blake2s_sigma[r]; \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                       _mm_set_epi32( m[ sigma[ 6 ] ], m[ sigma[ 4 ] ], \
-                                      m[ sigma[ 2 ] ], m[ sigma[ 0 ] ] ) ) ); \
-   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                        _mm_set_epi32( m[ sigma[ 7 ] ], m[ sigma[ 5 ] ], \
-                                       m[ sigma[ 3 ] ], m[ sigma[ 1 ] ] ) ) ); \
-   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
-   V[3] = mm128_shufll_32( V[3] ); \
-   V[2] = mm128_swap_64( V[2] ); \
-   V[1] = mm128_shuflr_32( V[1] ); \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                        _mm_set_epi32( m[ sigma[14] ], m[ sigma[12] ], \
-                                       m[ sigma[10] ], m[ sigma[ 8] ] ) ) ); \
-   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                        _mm_set_epi32( m[ sigma[15] ], m[ sigma[13] ], \
-                                       m[ sigma[11] ], m[ sigma[ 9] ] ) ) ); \
-   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
-   V[3] = mm128_shuflr_32( V[3] ); \
-   V[2] = mm128_swap_64( V[2] ); \
-   V[1] = mm128_shufll_32( V[1] ); \
-}
-
-#else
-
 #define G(r,i,a,b,c,d) \
 	do { \
 		a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -313,10 +247,7 @@ hard to imagine that big of an effect in either direction.
 		G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
 		G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
 	} while(0)
-
-#endif
-
-   ROUND( 0 );
+	ROUND( 0 );
 	ROUND( 1 );
 	ROUND( 2 );
 	ROUND( 3 );
--- a/algo/blake/sph-blake2s.h
+++ b/algo/blake/sph-blake2s.h
@@ -91,7 +91,6 @@ static inline void secure_zero_memory(void *v, size_t n)
 extern "C" {
 #endif

-/*   
 	enum blake2s_constant
 	{
 		BLAKE2S_BLOCKBYTES = 64,
@@ -100,13 +99,6 @@ extern "C" {
 		BLAKE2S_SALTBYTES  = 8,
 		BLAKE2S_PERSONALBYTES = 8
 	};
-*/
-
-#define BLAKE2S_BLOCKBYTES  64
-#define BLAKE2S_OUTBYTES    32
-#define BLAKE2S_KEYBYTES    32
-#define BLAKE2S_SALTBYTES   8
-#define BLAKE2S_PERSONALBYTES  8

 #pragma pack(push, 1)
 	typedef struct __blake2s_param
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -554,20 +554,10 @@ int luffa_4way_update_close( luffa_4way_context *state,
    a = _mm256_xor_si256( a, c0 ); \
    b = _mm256_xor_si256( b, c1 );

-/*
-#define MULT2( a0, a1, mask ) \
-do { \
-  __m256i b = _mm256_xor_si256( a0, \
-                   _mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
-  a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
-  a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) );  \
-} while(0)
-*/
-
-#define MULT2( a0, a1, mask ) \
+#define MULT2( a0, a1 ) \
 { \
-  __m256i b = _mm256_xor_si256( a0, \
-                 _mm256_shuffle_epi32( _mm256_and_si256( a1, mask ), 16 ) ); \
+  __m256i b = _mm256_xor_si256( a0, _mm256_shuffle_epi32( \
+                         _mm256_blend_epi32( a1, m256_zero, 0xee ), 16 ) ); \
  a0 = _mm256_alignr_epi8( a1,  b, 4 ); \
  a1 = _mm256_alignr_epi8(  b, a1, 4 ); \
 }
@@ -682,7 +672,6 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
    __m256i *chainv = state->chainv;
    __m256i msg0, msg1;
    __m256i x0, x1, x2, x3, x4, x5, x6, x7;
-    const __m256i MASK = m256_const1_i128( 0xffffffff );

    t0 = chainv[0];
    t1 = chainv[1];
@@ -696,7 +685,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
    t0 = _mm256_xor_si256( t0, chainv[8] );
    t1 = _mm256_xor_si256( t1, chainv[9] );

-    MULT2( t0, t1, MASK );
+    MULT2( t0, t1 );

    msg0 = _mm256_shuffle_epi32( msg[0], 27 );
    msg1 = _mm256_shuffle_epi32( msg[1], 27 );
@@ -715,66 +704,66 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
    t0 = chainv[0];
    t1 = chainv[1];

-    MULT2( chainv[0], chainv[1], MASK );
+    MULT2( chainv[0], chainv[1] );
    chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
    chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );

-    MULT2( chainv[2], chainv[3], MASK );
+    MULT2( chainv[2], chainv[3] );
    chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
    chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);

-    MULT2( chainv[4], chainv[5], MASK );
+    MULT2( chainv[4], chainv[5] );
    chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
    chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);

-    MULT2( chainv[6], chainv[7], MASK );
+    MULT2( chainv[6], chainv[7] );
    chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
    chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);

-    MULT2( chainv[8], chainv[9], MASK );
+    MULT2( chainv[8], chainv[9] );
    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
    chainv[9] = _mm256_xor_si256( chainv[9], t1 );

    t0 = chainv[8];
    t1 = chainv[9];

-    MULT2( chainv[8], chainv[9], MASK );
+    MULT2( chainv[8], chainv[9] );
    chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
    chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );

-    MULT2( chainv[6], chainv[7], MASK );
+    MULT2( chainv[6], chainv[7] );
    chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
    chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );

-    MULT2( chainv[4], chainv[5], MASK );
+    MULT2( chainv[4], chainv[5] );
    chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
    chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );

-    MULT2( chainv[2], chainv[3], MASK );
+    MULT2( chainv[2], chainv[3] );
    chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
    chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );

-    MULT2( chainv[0], chainv[1], MASK );
+    MULT2( chainv[0], chainv[1] );
    chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
    chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );

-    MULT2( msg0, msg1, MASK );
+    MULT2( msg0, msg1 );
    chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
    chainv[3] = _mm256_xor_si256( chainv[3], msg1 );

-    MULT2( msg0, msg1, MASK );
+    MULT2( msg0, msg1 );
    chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
    chainv[5] = _mm256_xor_si256( chainv[5], msg1 );

-    MULT2( msg0, msg1, MASK );
+    MULT2( msg0, msg1 );
    chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
    chainv[7] = _mm256_xor_si256( chainv[7], msg1 );

-    MULT2( msg0, msg1, MASK );
+    MULT2( msg0, msg1 );
    chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
    chainv[9] = _mm256_xor_si256( chainv[9], msg1 );

-    MULT2( msg0, msg1, MASK );
+    MULT2( msg0, msg1 );

    chainv[3] = mm256_rol_32( chainv[3], 1 );
    chainv[5] = mm256_rol_32( chainv[5], 2 );
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -24,45 +24,6 @@ typedef union {
 #endif
 } allium_16way_ctx_holder;

-static uint32_t allium_16way_midstate_vars[16*16] __attribute__ ((aligned (64)));
-static __m512i allium_16way_block0_hash[8] __attribute__ ((aligned (64)));
-static __m512i allium_16way_block_buf[16] __attribute__ ((aligned (64)));
-
-int allium_16way_prehash( struct work *work )
-{
-   uint32_t phash[8] __attribute__ ((aligned (32))) =
-   {
-      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-   };
-   uint32_t *pdata = work->data;
-
-   // Prehash first block.
-   blake256_transform_le( phash, pdata, 512, 0 );
-
-   // Interleave hash for second block prehash.
-   allium_16way_block0_hash[0] = _mm512_set1_epi32( phash[0] );
-   allium_16way_block0_hash[1] = _mm512_set1_epi32( phash[1] );
-   allium_16way_block0_hash[2] = _mm512_set1_epi32( phash[2] );
-   allium_16way_block0_hash[3] = _mm512_set1_epi32( phash[3] );
-   allium_16way_block0_hash[4] = _mm512_set1_epi32( phash[4] );
-   allium_16way_block0_hash[5] = _mm512_set1_epi32( phash[5] );
-   allium_16way_block0_hash[6] = _mm512_set1_epi32( phash[6] );
-   allium_16way_block0_hash[7] = _mm512_set1_epi32( phash[7] );
-
-   // Build vectored second block, interleave 12 of last 16 bytes of data,
-   // excluding the nonce.
-   allium_16way_block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
-   allium_16way_block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
-   allium_16way_block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
-
-   // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_16way_round0_prehash_le( allium_16way_midstate_vars,
-                         allium_16way_block0_hash, allium_16way_block_buf );
-
-   return 1;
-}
-
 static void allium_16way_hash( void *state, const void *midstate_vars, 
                               const void *midhash, const void *block )
 {
@@ -239,6 +200,11 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
   uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
   __m512i block0_hash[8] __attribute__ ((aligned (64)));
   __m512i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) = 
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -250,19 +216,31 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,

   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   pthread_rwlock_rdlock( &g_work_lock );
+   // Prehash first block.
+   blake256_transform_le( phash, pdata, 512, 0 );

-   memcpy( midstate_vars, allium_16way_midstate_vars, sizeof midstate_vars );
-   memcpy( block0_hash,   allium_16way_block0_hash,   sizeof block0_hash );
-   memcpy( block_buf,     allium_16way_block_buf,     sizeof block_buf );
+   // Interleave hash for second block prehash.
+   block0_hash[0] = _mm512_set1_epi32( phash[0] );
+   block0_hash[1] = _mm512_set1_epi32( phash[1] );
+   block0_hash[2] = _mm512_set1_epi32( phash[2] );
+   block0_hash[3] = _mm512_set1_epi32( phash[3] );
+   block0_hash[4] = _mm512_set1_epi32( phash[4] );
+   block0_hash[5] = _mm512_set1_epi32( phash[5] );
+   block0_hash[6] = _mm512_set1_epi32( phash[6] );
+   block0_hash[7] = _mm512_set1_epi32( phash[7] );

-   pthread_rwlock_unlock( &g_work_lock );
-
-   // fill in the nonces
-   block_buf[3] =
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces.
+   block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
+   block_buf[ 3] =
             _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
-   
+
+   // Partially prehash second block without touching nonces in block_buf[3].
+   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+
   do {
     allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );

@@ -293,44 +271,6 @@ typedef union {
 #endif
 } allium_8way_ctx_holder;

-static uint32_t allium_8way_midstate_vars[16*8] __attribute__ ((aligned (64)));
-static __m256i allium_8way_block0_hash[8] __attribute__ ((aligned (64)));
-static __m256i allium_8way_block_buf[16] __attribute__ ((aligned (64)));
-
-int allium_8way_prehash ( struct work *work )
-{
-   uint32_t phash[8] __attribute__ ((aligned (32))) =
-   {
-      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-   };
-   uint32_t *pdata = work->data;
-
-   // Prehash first block
-   blake256_transform_le( phash, pdata, 512, 0 );
-
-   allium_8way_block0_hash[0] = _mm256_set1_epi32( phash[0] );
-   allium_8way_block0_hash[1] = _mm256_set1_epi32( phash[1] );
-   allium_8way_block0_hash[2] = _mm256_set1_epi32( phash[2] );
-   allium_8way_block0_hash[3] = _mm256_set1_epi32( phash[3] );
-   allium_8way_block0_hash[4] = _mm256_set1_epi32( phash[4] );
-   allium_8way_block0_hash[5] = _mm256_set1_epi32( phash[5] );
-   allium_8way_block0_hash[6] = _mm256_set1_epi32( phash[6] );
-   allium_8way_block0_hash[7] = _mm256_set1_epi32( phash[7] );
-
-   // Build vectored second block, interleave 12 of the last 16 bytes,
-   // excepting the nonces.
-   allium_8way_block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
-   allium_8way_block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
-   allium_8way_block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
-
-   // Partialy prehash second block without touching nonces
-   blake256_8way_round0_prehash_le( allium_8way_midstate_vars,
-                             allium_8way_block0_hash, allium_8way_block_buf );
-
-   return 1;
-}
-
 static void allium_8way_hash( void *hash, const void *midstate_vars,
                               const void *midhash, const void *block )
 {
@@ -446,6 +386,11 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
   uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
   __m256i block0_hash[8] __attribute__ ((aligned (64)));
   __m256i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint64_t *ptarget = (uint64_t*)work->target;
   const uint32_t first_nonce = pdata[19];
@@ -455,17 +400,29 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;
   const __m256i eight = m256_const1_32( 8 );

-   pthread_rwlock_rdlock( &g_work_lock );
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0 );

-   memcpy( midstate_vars, allium_8way_midstate_vars, sizeof midstate_vars );
-   memcpy( block0_hash,   allium_8way_block0_hash,   sizeof block0_hash );
-   memcpy( block_buf,     allium_8way_block_buf,     sizeof block_buf );
+   block0_hash[0] = _mm256_set1_epi32( phash[0] );
+   block0_hash[1] = _mm256_set1_epi32( phash[1] );
+   block0_hash[2] = _mm256_set1_epi32( phash[2] );
+   block0_hash[3] = _mm256_set1_epi32( phash[3] );
+   block0_hash[4] = _mm256_set1_epi32( phash[4] );
+   block0_hash[5] = _mm256_set1_epi32( phash[5] );
+   block0_hash[6] = _mm256_set1_epi32( phash[6] );
+   block0_hash[7] = _mm256_set1_epi32( phash[7] );

-   pthread_rwlock_unlock( &g_work_lock );
-   
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces.
+   block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
   block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
                                     n+ 3, n+ 2, n+ 1, n );
-   
+
+   // Partially prehash second block without touching nonces
+   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+
   do {
     allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );

@@ -481,7 +438,6 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
     n += 8;
     block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
   } while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
-
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -131,12 +131,10 @@ bool register_lyra2z_algo( algo_gate_t* gate )
 {
 #if defined(LYRA2Z_16WAY)
  gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
-  gate->prehash    = (void*)&lyra2z_16way_prehash;
  gate->scanhash   = (void*)&scanhash_lyra2z_16way;
 //  gate->hash       = (void*)&lyra2z_16way_hash;
 #elif defined(LYRA2Z_8WAY)
  gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
-  gate->prehash    = (void*)&lyra2z_8way_prehash;
  gate->scanhash   = (void*)&scanhash_lyra2z_8way;
 //  gate->hash       = (void*)&lyra2z_8way_hash;
 #elif defined(LYRA2Z_4WAY)
@@ -177,10 +175,8 @@ bool register_lyra2h_algo( algo_gate_t* gate )
 bool register_allium_algo( algo_gate_t* gate )
 {
 #if defined (ALLIUM_16WAY)
-  gate->prehash   = (void*)&allium_16way_prehash;
  gate->scanhash  = (void*)&scanhash_allium_16way;
 #elif defined (ALLIUM_8WAY)
-  gate->prehash   = (void*)&allium_8way_prehash;
  gate->scanhash  = (void*)&scanhash_allium_8way;
 #else
  gate->miner_thread_init = (void*)&init_allium_ctx;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -5,6 +5,7 @@
 #include <stdint.h>
 #include "lyra2.h"

+
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define LYRA2REV3_16WAY 1
 #elif defined(__AVX2__)
@@ -101,7 +102,6 @@ bool init_lyra2rev2_ctx();
 //void lyra2z_16way_hash( void *state, const void *input );
 int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-int lyra2z_16way_prehash ( struct work *work );
 bool lyra2z_16way_thread_init();

 #elif defined(LYRA2Z_8WAY)
@@ -110,7 +110,6 @@ bool lyra2z_16way_thread_init();
 int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_8way_thread_init();
-int lyra2z_8way_prehash ( struct work *work );

 #elif defined(LYRA2Z_4WAY)

@@ -166,13 +165,11 @@ bool register_allium_algo( algo_gate_t* gate );

 int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-int allium_16way_prehash ( struct work *work );

 #elif defined(ALLIUM_8WAY)

 int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-int allium_8way_prehash ( struct work *work );

 #else

--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -75,7 +75,7 @@ void lyra2rev2_16way_hash( void *state, const void *input )
   keccak256_8way_close( &ctx.keccak, vhash );

   dintrlv_8x64( hash8,  hash9,  hash10,  hash11,
-                 hash12, hash13, hash14, hash5, vhash, 256 );
+                 hash12, hash13, hash14, hash15, vhash, 256 );

   cubehash_full( &ctx.cube, (byte*) hash0,  256, (const byte*) hash0,  32 );
   cubehash_full( &ctx.cube, (byte*) hash1,  256, (const byte*) hash1,  32 );
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -14,44 +14,6 @@ bool lyra2z_16way_thread_init()
 return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
 }

-static uint32_t lyra2z_16way_midstate_vars[16*16] __attribute__ ((aligned (64)));
-static __m512i lyra2z_16way_block0_hash[8] __attribute__ ((aligned (64)));
-static __m512i lyra2z_16way_block_buf[16] __attribute__ ((aligned (64)));
-
-int lyra2z_16way_prehash ( struct work *work )
-{
-   uint32_t phash[8] __attribute__ ((aligned (32))) =
-   {
-      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-   };
-   uint32_t *pdata = work->data;
-
-   // Prehash first block
-   blake256_transform_le( phash, pdata, 512, 0 );
-
-   lyra2z_16way_block0_hash[0] = _mm512_set1_epi32( phash[0] );
-   lyra2z_16way_block0_hash[1] = _mm512_set1_epi32( phash[1] );
-   lyra2z_16way_block0_hash[2] = _mm512_set1_epi32( phash[2] );
-   lyra2z_16way_block0_hash[3] = _mm512_set1_epi32( phash[3] );
-   lyra2z_16way_block0_hash[4] = _mm512_set1_epi32( phash[4] );
-   lyra2z_16way_block0_hash[5] = _mm512_set1_epi32( phash[5] );
-   lyra2z_16way_block0_hash[6] = _mm512_set1_epi32( phash[6] );
-   lyra2z_16way_block0_hash[7] = _mm512_set1_epi32( phash[7] );
-
-   // Build vectored second block, interleave 12 of last 16 bytes of data
-   // excepting the nonce.
-   lyra2z_16way_block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
-   lyra2z_16way_block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
-   lyra2z_16way_block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
-
-   // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_16way_round0_prehash_le( lyra2z_16way_midstate_vars, 
-                       lyra2z_16way_block0_hash, lyra2z_16way_block_buf );
-
-   return 1;
-}
-
 static void lyra2z_16way_hash( void *state, const void *midstate_vars,
                        const void *midhash, const void *block )
 {
@@ -129,6 +91,11 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
   uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
   __m512i block0_hash[8] __attribute__ ((aligned (64)));
   __m512i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (64))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -140,18 +107,30 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,

   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   pthread_rwlock_rdlock( &g_work_lock );
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0 );

-   memcpy( midstate_vars, lyra2z_16way_midstate_vars, sizeof midstate_vars );
-   memcpy( block0_hash,   lyra2z_16way_block0_hash,   sizeof block0_hash );
-   memcpy( block_buf,     lyra2z_16way_block_buf,     sizeof block_buf );
+   block0_hash[0] = _mm512_set1_epi32( phash[0] );
+   block0_hash[1] = _mm512_set1_epi32( phash[1] );
+   block0_hash[2] = _mm512_set1_epi32( phash[2] );
+   block0_hash[3] = _mm512_set1_epi32( phash[3] );
+   block0_hash[4] = _mm512_set1_epi32( phash[4] );
+   block0_hash[5] = _mm512_set1_epi32( phash[5] );
+   block0_hash[6] = _mm512_set1_epi32( phash[6] );
+   block0_hash[7] = _mm512_set1_epi32( phash[7] );

-   pthread_rwlock_unlock( &g_work_lock );
-   
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces.
+   block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
   block_buf[ 3] =
             _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

+   // Partially prehash second block without touching nonces in block_buf[3].
+   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+
   do {
     lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );

@@ -178,44 +157,6 @@ bool lyra2z_8way_thread_init()
 return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
 }

-static uint32_t lyra2z_8way_midstate_vars[16*8] __attribute__ ((aligned (64)));
-static __m256i lyra2z_8way_block0_hash[8] __attribute__ ((aligned (64)));
-static __m256i lyra2z_8way_block_buf[16] __attribute__ ((aligned (64)));
-
-int lyra2z_8way_prehash ( struct work *work )
-{
-   uint32_t phash[8] __attribute__ ((aligned (32))) =
-   {
-      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-   };
-   uint32_t *pdata = work->data;
-
-   // Prehash first block
-   blake256_transform_le( phash, pdata, 512, 0 );
-
-   lyra2z_8way_block0_hash[0] = _mm256_set1_epi32( phash[0] );
-   lyra2z_8way_block0_hash[1] = _mm256_set1_epi32( phash[1] );
-   lyra2z_8way_block0_hash[2] = _mm256_set1_epi32( phash[2] );
-   lyra2z_8way_block0_hash[3] = _mm256_set1_epi32( phash[3] );
-   lyra2z_8way_block0_hash[4] = _mm256_set1_epi32( phash[4] );
-   lyra2z_8way_block0_hash[5] = _mm256_set1_epi32( phash[5] );
-   lyra2z_8way_block0_hash[6] = _mm256_set1_epi32( phash[6] );
-   lyra2z_8way_block0_hash[7] = _mm256_set1_epi32( phash[7] );
-
-   // Build vectored second block, interleave last 16 bytes of data using
-   // unique nonces.
-   lyra2z_8way_block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
-   lyra2z_8way_block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
-   lyra2z_8way_block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
-
-   // Partialy prehash second block without touching nonces
-   blake256_8way_round0_prehash_le( lyra2z_8way_midstate_vars,
-                           lyra2z_8way_block0_hash, lyra2z_8way_block_buf );
-
-   return 1;
-}
-
 static void lyra2z_8way_hash( void *state, const void *midstate_vars,
                       const void *midhash, const void *block )
 {
@@ -260,6 +201,11 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
   uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
   __m256i block0_hash[8] __attribute__ ((aligned (64)));
   __m256i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint64_t *ptarget = (uint64_t*)work->target;
   const uint32_t first_nonce = pdata[19];
@@ -269,14 +215,23 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;
   const __m256i eight = m256_const1_32( 8 );

-   pthread_rwlock_rdlock( &g_work_lock );
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0 );

-   memcpy( midstate_vars, lyra2z_8way_midstate_vars, sizeof midstate_vars );
-   memcpy( block0_hash,   lyra2z_8way_block0_hash,   sizeof block0_hash );
-   memcpy( block_buf,     lyra2z_8way_block_buf,     sizeof block_buf );
+   block0_hash[0] = _mm256_set1_epi32( phash[0] );
+   block0_hash[1] = _mm256_set1_epi32( phash[1] );
+   block0_hash[2] = _mm256_set1_epi32( phash[2] );
+   block0_hash[3] = _mm256_set1_epi32( phash[3] );
+   block0_hash[4] = _mm256_set1_epi32( phash[4] );
+   block0_hash[5] = _mm256_set1_epi32( phash[5] );
+   block0_hash[6] = _mm256_set1_epi32( phash[6] );
+   block0_hash[7] = _mm256_set1_epi32( phash[7] );

-   pthread_rwlock_unlock( &g_work_lock );
-   
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces.
+   block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
   block_buf[ 3] =
            _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -4,24 +4,6 @@
 #include <string.h>
 #include <stdio.h>

-long double lbry_calc_network_diff( struct work *work )
-{
-        // sample for diff 43.281 : 1c05ea29
-        // todo: endian reversed on longpoll could be zr5 specific...
-
-   uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
-   uint32_t bits = (nbits & 0xffffff);
-   int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
-   long double d = (long double)0x0000ffff / (long double)bits;
-
-   for (int m=shift; m < 29; m++) d *= 256.0;
-   for (int m=29; m < shift; m++) d /= 256.0;
-   if (opt_debug_diff)
-      applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
-
-   return d;
-}
-
 // std_le should work but it doesn't
 void lbry_le_build_stratum_request( char *req, struct work *work,
                                      struct stratum_ctx *sctx )
@@ -41,31 +23,6 @@ void lbry_le_build_stratum_request( char *req, struct work *work,
   free(xnonce2str);
 }

-/*
-void lbry_build_block_header( struct work* g_work, uint32_t version,
-                             uint32_t *prevhash, uint32_t *merkle_root,
-                             uint32_t ntime, uint32_t nbits )
-{
-   int i;
-   memset( g_work->data, 0, sizeof(g_work->data) );
-   g_work->data[0] =  version;
-
-   if ( have_stratum )
-      for ( i = 0; i < 8; i++ )
-         g_work->data[1 + i] = le32dec( prevhash + i );
-   else
-      for (i = 0; i < 8; i++)
-         g_work->data[ 8-i ] = le32dec( prevhash + i );
-
-   for ( i = 0; i < 8; i++ )
-      g_work->data[9 + i] = be32dec( merkle_root + i );
-
-   g_work->data[ LBRY_NTIME_INDEX ] = ntime;
-   g_work->data[ LBRY_NBITS_INDEX ] = nbits;
-   g_work->data[28] = 0x80000000;
-}
-*/
-
 void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 {
   unsigned char merkle_root[64] = { 0 };
@@ -112,9 +69,7 @@ bool register_lbry_algo( algo_gate_t* gate )
  gate->hash                  = (void*)&lbry_hash;
  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
 #endif
-  gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
  gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
-//  gate->build_block_header    = (void*)&build_block_header;
  gate->build_extraheader     = (void*)&lbry_build_extraheader;
  gate->ntime_index           = LBRY_NTIME_INDEX;
  gate->nbits_index           = LBRY_NBITS_INDEX;
--- a/algo/sha/sha256dt.c
+++ b/algo/sha/sha256dt.c
@@ -0,0 +1,268 @@
+#include "algo-gate-api.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "sha-hash-4way.h"
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define SHA256DT_16WAY 1
+#elif defined(__AVX2__)
+  #define SHA256DT_8WAY 1
+#else
+  #define SHA256DT_4WAY 1
+#endif
+
+#if defined(SHA256DT_16WAY)
+
+int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scans 16 nonces per pass, one per 32-bit lane; always returns 0.
+   __m512i  vdata[32]    __attribute__ ((aligned (128)));
+   __m512i  block[16]    __attribute__ ((aligned (64)));
+   __m512i  hash32[8]    __attribute__ ((aligned (64)));
+   __m512i  initstate[8] __attribute__ ((aligned (64)));
+   __m512i  midstate1[8] __attribute__ ((aligned (64)));
+   __m512i  midstate2[8] __attribute__ ((aligned (64)));
+   __m512i  mexp_pre[16] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );  // hash word 7, indexed by lane
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t targ32_d7 = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 16;  // NOTE(review): wraps if max_nonce < 16
+   uint32_t n = first_nonce;
+   __m512i *noncev = vdata + 19;  // the nonce is data word 19
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m512i last_byte = m512_const1_32( 0x80000000 );
+   const __m512i sixteen = m512_const1_32( 16 );
+
+   for ( int i = 0; i < 19; i++ )  // words 0..18 are the same in every lane
+      vdata[i] = mm512_bcast_i32( pdata[i] );
+
+   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
+                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
+
+   vdata[16+4] = last_byte;  // word 20, directly after the nonce
+   memset_zero_512( vdata+16 + 5, 10 );
+   vdata[16+15] = mm512_bcast_i32( 0x480 );  // NOTE(review): not 80*8 = 0x280; presumably algo-specific, confirm
+   
+   block[ 8] = last_byte;  // presumably words 0..7 are written inside the loop
+   memset_zero_512( block + 9, 6 );
+   block[15] = mm512_bcast_i32( 0x300 );  // NOTE(review): not 32*8 = 0x100; presumably algo-specific, confirm
+   // Initial state: each 64-bit literal repeats one 32-bit word in both halves.
+   initstate[0] = mm512_bcast_i64( 0xdfa9bf2cdfa9bf2c );
+   initstate[1] = mm512_bcast_i64( 0xb72074d4b72074d4 );
+   initstate[2] = mm512_bcast_i64( 0x6bb011226bb01122 );
+   initstate[3] = mm512_bcast_i64( 0xd338e869d338e869 );
+   initstate[4] = mm512_bcast_i64( 0xaa3ff126aa3ff126 );
+   initstate[5] = mm512_bcast_i64( 0x475bbf30475bbf30 );
+   initstate[6] = mm512_bcast_i64( 0x8fd52e5b8fd52e5b );
+   initstate[7] = mm512_bcast_i64( 0x9f75c9ad9f75c9ad );
+
+   sha256_16way_transform_le( midstate1, vdata, initstate );  // words 0..15, no nonce involved
+   
+   // Do 3 rounds on the first 12 bytes of the next block
+   sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
+
+   do
+   {
+      sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                 mexp_pre );
+      sha256_16way_transform_le( hash32, block, initstate );  // second pass, over block
+      mm512_block_bswap_32( hash32, hash32 );    
+
+      for ( int lane = 0; lane < 16; lane++ )
+      if ( hash32_d7[ lane ] <= targ32_d7 )  // pre-check on word 7 only
+      {
+         extr_lane_16x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         {  // nothing is submitted when opt_benchmark is set
+            pdata[19] = n + lane;  // nonce that belongs to this lane
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      *noncev = _mm512_add_epi32( *noncev, sixteen );
+      n += 16;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+   pdata[19] = n;  // first nonce that was not hashed
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+
+#endif
+
+#if defined(SHA256DT_8WAY)
+
+int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scans 8 nonces per pass, one per 32-bit lane; always returns 0.
+   __m256i  vdata[32]    __attribute__ ((aligned (64)));
+   __m256i  block[16]    __attribute__ ((aligned (32)));
+   __m256i  hash32[8]    __attribute__ ((aligned (32)));
+   __m256i  initstate[8] __attribute__ ((aligned (32)));
+   __m256i  midstate1[8] __attribute__ ((aligned (32)));
+   __m256i  midstate2[8] __attribute__ ((aligned (32)));
+   __m256i  mexp_pre[16] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );  // hash word 7, indexed by lane
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t targ32_d7 = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;  // NOTE(review): wraps if max_nonce < 8
+   uint32_t n = first_nonce;
+   __m256i *noncev = vdata + 19;  // the nonce is data word 19
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m256i last_byte = m256_const1_32( 0x80000000 );
+   const __m256i eight = m256_const1_32( 8 );
+
+   for ( int i = 0; i < 19; i++ )  // words 0..18 are the same in every lane
+      vdata[i] = mm256_bcast_i32( pdata[i] );
+
+   *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
+
+   vdata[16+4] = last_byte;  // word 20, directly after the nonce
+   memset_zero_256( vdata+16 + 5, 10 );
+   vdata[16+15] = mm256_bcast_i32( 0x480 );  // NOTE(review): not 80*8 = 0x280; presumably algo-specific, confirm
+
+   block[ 8] = last_byte;  // presumably words 0..7 are written inside the loop
+   memset_zero_256( block + 9, 6 );
+   block[15] = mm256_bcast_i32( 0x300 );  // NOTE(review): not 32*8 = 0x100; presumably algo-specific, confirm
+   
+   // Initial state: each 64-bit literal repeats one 32-bit word in both halves.
+   initstate[0] = mm256_bcast_i64( 0xdfa9bf2cdfa9bf2c );
+   initstate[1] = mm256_bcast_i64( 0xb72074d4b72074d4 );
+   initstate[2] = mm256_bcast_i64( 0x6bb011226bb01122 );
+   initstate[3] = mm256_bcast_i64( 0xd338e869d338e869 );
+   initstate[4] = mm256_bcast_i64( 0xaa3ff126aa3ff126 );
+   initstate[5] = mm256_bcast_i64( 0x475bbf30475bbf30 );
+   initstate[6] = mm256_bcast_i64( 0x8fd52e5b8fd52e5b );
+   initstate[7] = mm256_bcast_i64( 0x9f75c9ad9f75c9ad );
+
+   sha256_8way_transform_le( midstate1, vdata, initstate );  // words 0..15, no nonce involved
+
+   // Do 3 rounds on the first 12 bytes of the next block
+   sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
+   
+   do
+   {
+      sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                mexp_pre );
+      sha256_8way_transform_le( hash32, block, initstate );  // second pass, over block
+      mm256_block_bswap_32( hash32, hash32 );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( hash32_d7[ lane ] <= targ32_d7 )  // pre-check on word 7 only
+      {
+         extr_lane_8x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         {  // nothing is submitted when opt_benchmark is set
+            pdata[19] = n + lane;  // nonce that belongs to this lane
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      *noncev = _mm256_add_epi32( *noncev, eight );
+      n += 8;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+   pdata[19] = n;  // first nonce that was not hashed
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#endif
+
+
+#if defined(SHA256DT_4WAY)
+
+int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scans 4 nonces per pass, one per 32-bit lane; always returns 0.
+   __m128i  vdata[32]    __attribute__ ((aligned (64)));
+   __m128i  block[16]    __attribute__ ((aligned (32)));
+   __m128i  hash32[8]    __attribute__ ((aligned (32)));
+   __m128i  initstate[8] __attribute__ ((aligned (32)));
+   __m128i  midstate[8]  __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );  // hash word 7, indexed by lane
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t targ32_d7 = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;  // NOTE(review): wraps if max_nonce < 4
+   uint32_t n = first_nonce;
+   __m128i *noncev = vdata + 19;  // the nonce is data word 19
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m128i last_byte = m128_const1_32( 0x80000000 );
+   const __m128i four = m128_const1_32( 4 );
+
+   for ( int i = 0; i < 19; i++ )  // words 0..18 are the same in every lane
+       vdata[i] = mm128_bcast_i32( pdata[i] );
+
+   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
+
+   vdata[16+4] = last_byte;  // word 20, directly after the nonce
+   memset_zero_128( vdata+16 + 5, 10 );
+   vdata[16+15] = mm128_bcast_i32( 0x480 );  // NOTE(review): not 80*8 = 0x280; presumably algo-specific, confirm
+
+   block[ 8] = last_byte;  // presumably words 0..7 are written inside the loop
+   memset_zero_128( block + 9, 6 );
+   block[15] = mm128_bcast_i32( 0x300 );  // NOTE(review): not 32*8 = 0x100; presumably algo-specific, confirm
+   
+   // Initial state: each 64-bit literal repeats one 32-bit word in both halves.
+   initstate[0] = mm128_bcast_i64( 0xdfa9bf2cdfa9bf2c );
+   initstate[1] = mm128_bcast_i64( 0xb72074d4b72074d4 );
+   initstate[2] = mm128_bcast_i64( 0x6bb011226bb01122 );
+   initstate[3] = mm128_bcast_i64( 0xd338e869d338e869 );
+   initstate[4] = mm128_bcast_i64( 0xaa3ff126aa3ff126 );
+   initstate[5] = mm128_bcast_i64( 0x475bbf30475bbf30 );
+   initstate[6] = mm128_bcast_i64( 0x8fd52e5b8fd52e5b );
+   initstate[7] = mm128_bcast_i64( 0x9f75c9ad9f75c9ad );
+
+   // Hash the first 64 bytes (words 0..15) once; the nonce is not among them.
+   sha256_4way_transform_le( midstate, vdata, initstate );
+
+   do
+   {
+      sha256_4way_transform_le( block,  vdata+16, midstate  );  // whole 2nd block, every pass
+      sha256_4way_transform_le( hash32, block,    initstate );  // second pass, over block
+      mm128_block_bswap_32( hash32, hash32 );
+
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )  // pre-check on word 7 only
+      {
+         extr_lane_4x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         {  // nothing is submitted when opt_benchmark is set
+            pdata[19] = n + lane;  // nonce that belongs to this lane
+            submit_solution( work, lane_hash, mythr );
+         }
+       }
+       *noncev = _mm_add_epi32( *noncev, four );
+       n += 4;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+   pdata[19] = n;  // first nonce that was not hashed
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#endif
+
+bool register_sha256dt_algo( algo_gate_t* gate )
+{  // Installs the widest scanhash variant selected at compile time; always returns true.
+    gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;  // same flags whichever branch is built
+#if defined(SHA256DT_16WAY)
+    gate->scanhash   = (void*)&scanhash_sha256dt_16way;
+#elif defined(SHA256DT_8WAY)
+    gate->scanhash   = (void*)&scanhash_sha256dt_8way;
+#else
+    gate->scanhash   = (void*)&scanhash_sha256dt_4way;  // SHA256DT_4WAY, the fallback
+#endif
+    return true;
+}
+
--- a/algo/sha/sha512256d-4way.c
+++ b/algo/sha/sha512256d-4way.c
@@ -0,0 +1,221 @@
+#include "algo-gate-api.h"
+#include "sha-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#define SHA512256D_8WAY 1
+#elif defined(__AVX2__)
+#define SHA512256D_4WAY 1
+#endif
+
+#if defined(SHA512256D_8WAY)
+
+static void sha512256d_8way_init( sha512_8way_context *ctx )
+{
+  // SHA-512/256 initial hash words; each one is broadcast into ctx->val[i].
+  static const uint64_t iv[8] =
+  { 0x22312194FC2BF72C, 0x9F555FA3C84C64C2, 0x2393B86B6F53B151,
+    0x963877195940EABD, 0x96283EE2A88EFFE3, 0xBE5E1E2553863992,
+    0x2B0199FC2C85B8AA, 0x0EB72DDC81C52CA2 };
+
+  for ( int i = 0; i < 8; i++ )
+    ctx->val[i] = mm512_bcast_i64( iv[i] );
+  ctx->initialized = true;
+  ctx->count = 0;
+}
+
+int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{   // Tests 8 nonces per pass with two chained SHA-512/256 hashes; returns 0.
+    uint64_t hash[8*8] __attribute__ ((aligned (128)));
+    uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+    sha512_8way_context ctx; 
+    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+    uint64_t *hash_q3 = &(hash[3*8]);   // indexed by lane, compared to targ_q3
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];   // 64-bit word 3 of target
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t last_nonce = max_nonce - 8;   // loop bound: one group of 8 early
+    uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // vector 9 of vdata holds the nonces
+    const int thr_id = mythr->id;
+    const bool bench = opt_benchmark;
+    const __m512i eight = mm512_bcast_i64( 0x0000000800000000 );   // 8 in upper 32 bits
+
+    mm512_bswap32_intrlv80_8x64( vdata, pdata );
+    *noncev = mm512_intrlv_blend_32(   // presumably merges n..n+7 into the odd slots
+                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
+    do
+    {
+       sha512256d_8way_init( &ctx );   // first pass over the 80-byte header
+       sha512_8way_update( &ctx, vdata, 80 );
+       sha512_8way_close( &ctx, hash );        
+       // Second pass: the result is hashed again with a length argument of 32.
+       sha512256d_8way_init( &ctx );
+       sha512_8way_update( &ctx, hash, 32 );
+       sha512_8way_close( &ctx, hash );
+       // Cheap 64-bit pre-check per lane before the full valid_hash() test.
+       for ( int lane = 0; lane < 8; lane++ )
+       if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
+       {
+          extr_lane_8x64( lane_hash, hash, lane, 256 );
+          if ( valid_hash( lane_hash, ptarget ) && !bench )
+          {
+             pdata[19] = bswap_32( n + lane );   // nonce byte-swapped for submission
+             submit_solution( work, lane_hash, mythr );
+          }
+       }
+       *noncev = _mm512_add_epi32( *noncev, eight );   // advance every lane by 8
+       n += 8;
+    } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+    // Record where scanning stopped and how many nonces were tried.
+    pdata[19] = n;
+    *hashes_done = n - first_nonce;
+    return 0;
+}
+
+#elif defined(SHA512256D_4WAY)
+
+static void sha512256d_4way_init( sha512_4way_context *ctx )
+{
+  // SHA-512/256 initial hash words; each one is broadcast into ctx->val[i].
+  static const uint64_t iv[8] =
+  { 0x22312194FC2BF72C, 0x9F555FA3C84C64C2, 0x2393B86B6F53B151,
+    0x963877195940EABD, 0x96283EE2A88EFFE3, 0xBE5E1E2553863992,
+    0x2B0199FC2C85B8AA, 0x0EB72DDC81C52CA2 };
+
+  for ( int i = 0; i < 8; i++ )
+    ctx->val[i] = mm256_bcast_i64( iv[i] );
+  ctx->initialized = true;
+  ctx->count = 0;
+}
+
+int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{   // Tests 4 nonces per pass with two chained SHA-512/256 hashes; returns 0.
+    uint64_t hash[8*4] __attribute__ ((aligned (64)));
+    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    sha512_4way_context ctx;
+    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+    uint64_t *hash_q3 = &(hash[3*4]);   // indexed by lane, compared to targ_q3
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];   // 64-bit word 3 of target
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t last_nonce = max_nonce - 4;   // loop bound: one group of 4 early
+    uint32_t n = first_nonce;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // vector 9 of vdata holds the nonces
+    const int thr_id = mythr->id;
+    const bool bench = opt_benchmark;
+    const __m256i four = mm256_bcast_i64( 0x0000000400000000 );   // 4 in upper 32 bits
+
+    mm256_bswap32_intrlv80_4x64( vdata, pdata );
+    *noncev = mm256_intrlv_blend_32(   // presumably merges n..n+3 into the odd slots
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
+    do
+    {
+       sha512256d_4way_init( &ctx );   // first pass over the 80-byte header
+       sha512_4way_update( &ctx, vdata, 80 );
+       sha512_4way_close( &ctx, hash );
+       // Second pass: the result is hashed again with a length argument of 32.
+       sha512256d_4way_init( &ctx );
+       sha512_4way_update( &ctx, hash, 32 );
+       sha512_4way_close( &ctx, hash );
+       // Cheap 64-bit pre-check per lane before the full valid_hash() test.
+       for ( int lane = 0; lane < 4; lane++ )
+       if ( hash_q3[ lane ] <= targ_q3 )
+       {
+          extr_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( valid_hash( lane_hash, ptarget ) && !bench )
+          {
+             pdata[19] = bswap_32( n + lane );   // nonce byte-swapped for submission
+             submit_solution( work, lane_hash, mythr );
+          }
+       }
+       *noncev = _mm256_add_epi32( *noncev, four );   // advance every lane by 4
+       n += 4;
+    } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+    // Record where scanning stopped and how many nonces were tried.
+    pdata[19] = n;
+    *hashes_done = n - first_nonce;
+    return 0;
+}
+
+#else
+
+#include "sph_sha2.h"
+
+// SHA-512/256 initial hash value (FIPS 180-4), loaded by sha512256d_init.
+static const uint64_t H512_256[8] = {
+   0x22312194FC2BF72C, 0x9F555FA3C84C64C2, 0x2393B86B6F53B151,
+   0x963877195940EABD, 0x96283EE2A88EFFE3, 0xBE5E1E2553863992,
+   0x2B0199FC2C85B8AA, 0x0EB72DDC81C52CA2,
+};
+
+// Resets ctx for a new hash: count cleared to 0, val[] loaded from H512_256.
+static void sha512256d_init( sph_sha512_context *ctx )
+{
+   ctx->count = 0;
+   memcpy( ctx->val, H512_256, sizeof H512_256 );
+}
+
+int scanhash_sha512256d( struct work *work,   uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr )
+{
+   // Scalar scan: double SHA-512/256 of the 80-byte header in work->data, one
+   // nonce per pass; *hashes_done gets the count tried. Always returns 0.
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   // 16 words, not 8: sph_sha512_close stores a full 64-byte digest here.
+   uint32_t hash64[16] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__ ((aligned (64)));
+   sph_sha512_context ctx;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   int thr_id = mythr->id;
+
+   swab32_array( endiandata, pdata, 20 );
+   do {
+      be32enc( &endiandata[19], n );
+      // First pass over the 80-byte header.
+      sha512256d_init( &ctx );
+      sph_sha512( &ctx, endiandata, 80 );
+      sph_sha512_close( &ctx, hash64 );
+      // Second pass over the first 32 bytes (the 256-bit result) of pass one.
+      sha512256d_init( &ctx );
+      sph_sha512( &ctx, hash64, 32 );
+      sph_sha512_close( &ctx, hash64 );
+
+      if ( hash64[7] <= Htarg )   // cheap top-word check before fulltest
+      if ( fulltest( hash64, ptarget ) && !opt_benchmark )
+      {
+         pdata[19] = n;
+         submit_solution( work, hash64, mythr );
+      }
+      n++;
+   } while (n < max_nonce && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce;   // n is already one past the last nonce tried
+   pdata[19] = n;
+   return 0;
+}
+
+#endif
+
+bool register_sha512256d_algo( algo_gate_t* gate )
+{  // Installs the scanhash variant selected by the SHA512256D_*WAY macros.
+   gate->optimizations = AVX2_OPT | AVX512_OPT;
+#if defined(SHA512256D_8WAY)
+   gate->scanhash = (void*)&scanhash_sha512256d_8way;
+#elif defined(SHA512256D_4WAY)
+   gate->scanhash = (void*)&scanhash_sha512256d_4way;
+#else
+   gate->scanhash = (void*)&scanhash_sha512256d;   // scalar fallback
+#endif
+   return true;   // registration always reports success
+}
+
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -7,16 +7,8 @@

 #if defined (SKEIN_8WAY)

-static skein512_8way_context skein512_8way_ctx
+static __thread skein512_8way_context skein512_8way_ctx
                                            __attribute__ ((aligned (64)));
-static uint32_t skein_8way_vdata[20*8] __attribute__ ((aligned (64)));
-
-int skein_8way_prehash( struct work *work )
-{
-    mm512_bswap32_intrlv80_8x64( skein_8way_vdata, work->data );
-    skein512_8way_prehash64( &skein512_8way_ctx, skein_8way_vdata );
-    return 1;
-}

 void skeinhash_8way( void *state, const void *input )
 {
@@ -37,27 +29,25 @@ void skeinhash_8way( void *state, const void *input )
 int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t vdata[20*8] __attribute__ ((aligned (128)));
-   uint32_t hash[8*8] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash_d7 = &(hash[7*8]);
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t targ_d7 = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 8;
-   uint32_t n = first_nonce;
-   __m512i  *noncev = (__m512i*)vdata + 9; 
-   const int thr_id = mythr->id; 
-   const bool bench = opt_benchmark;
-    
-    pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( vdata, skein_8way_vdata, sizeof vdata );
-    pthread_rwlock_unlock( &g_work_lock );
+    uint32_t vdata[20*8] __attribute__ ((aligned (128)));
+    uint32_t hash[8*8] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+    uint32_t *hash_d7 = &(hash[7*8]);
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t targ_d7 = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t last_nonce = max_nonce - 8;
+    uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9; 
+    const int thr_id = mythr->id; 
+    const bool bench = opt_benchmark;

+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
   *noncev = mm512_intrlv_blend_32(
                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
+   skein512_8way_prehash64( &skein512_8way_ctx, vdata );
   do
   {
       skeinhash_8way( hash, vdata );
@@ -84,16 +74,8 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,

 #elif defined (SKEIN_4WAY)

-static skein512_4way_context skein512_4way_ctx
+static __thread skein512_4way_context skein512_4way_ctx
                                            __attribute__ ((aligned (64)));
-static uint32_t skein_4way_vdata[20*4] __attribute__ ((aligned (64)));
-
-int skein_4way_prehash( struct work *work )
-{
-    mm256_bswap32_intrlv80_4x64( skein_4way_vdata, work->data );
-    skein512_4way_prehash64( &skein512_4way_ctx, skein_4way_vdata );
-    return 1;
-}

 void skeinhash_4way( void *state, const void *input )
 {
@@ -136,24 +118,23 @@ void skeinhash_4way( void *state, const void *input )
 int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t hash[8*4] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   uint32_t *hash_d7 = &(hash[7<<2]);
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t targ_d7 = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 4;
-   uint32_t n = first_nonce;
-   __m256i  *noncev = (__m256i*)vdata + 9; 
-   const int thr_id = mythr->id; 
-   const bool bench = opt_benchmark;
+    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    uint32_t hash[8*4] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+    uint32_t *hash_d7 = &(hash[7<<2]);
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t targ_d7 = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t last_nonce = max_nonce - 4;
+    uint32_t n = first_nonce;
+    __m256i  *noncev = (__m256i*)vdata + 9; 
+    const int thr_id = mythr->id; 
+    const bool bench = opt_benchmark;
+
+   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   skein512_4way_prehash64( &skein512_4way_ctx, vdata );

-   pthread_rwlock_rdlock( &g_work_lock );
-      memcpy( vdata, skein_4way_vdata, sizeof vdata );
-   pthread_rwlock_unlock( &g_work_lock );
-    
   *noncev = mm256_intrlv_blend_32(
                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -7,12 +7,10 @@ bool register_skein_algo( algo_gate_t* gate )
 #if defined (SKEIN_8WAY)
    gate->optimizations = AVX2_OPT | AVX512_OPT;
    gate->scanhash  = (void*)&scanhash_skein_8way;
-    gate->prehash   = (void*)&skein_8way_prehash;
    gate->hash      = (void*)&skeinhash_8way;
 #elif defined (SKEIN_4WAY)
    gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
    gate->scanhash  = (void*)&scanhash_skein_4way;
-    gate->prehash   = (void*)&skein_4way_prehash;
    gate->hash      = (void*)&skeinhash_4way;
 #else
    gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
@@ -27,12 +25,10 @@ bool register_skein2_algo( algo_gate_t* gate )
  gate->optimizations = AVX2_OPT | AVX512_OPT;
 #if defined (SKEIN_8WAY)
  gate->scanhash  = (void*)&scanhash_skein2_8way;
-//  gate->hash      = (void*)&skein2hash_8way;
-  gate->prehash   = (void*)&skein2_8way_prehash;
+  gate->hash      = (void*)&skein2hash_8way;
 #elif defined (SKEIN_4WAY)
  gate->scanhash  = (void*)&scanhash_skein2_4way;
-//  gate->hash      = (void*)&skein2hash_4way;
-  gate->prehash   = (void*)&skein2_4way_prehash;
+  gate->hash      = (void*)&skein2hash_4way;
 #else
  gate->scanhash  = (void*)&scanhash_skein2;
  gate->hash      = (void*)&skein2hash;
--- a/algo/skein/skein-gate.h
+++ b/algo/skein/skein-gate.h
@@ -14,24 +14,20 @@
 void skeinhash_8way( void *output, const void *input );
 int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int skein_8way_prehash( struct work * );

 void skein2hash_8way( void *output, const void *input );
 int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
                          uint64_t* hashes_done, struct thr_info *mythr );
-int skein2_8way_prehash( struct work * );

 #elif defined(SKEIN_4WAY)

 void skeinhash_4way( void *output, const void *input );
 int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int skein_4way_prehash( struct work * );

 void skein2hash_4way( void *output, const void *input );
 int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
                          uint64_t* hashes_done, struct thr_info *mythr );
-int skein2_4way_prehash( struct work * );

 #else

--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -5,17 +5,9 @@

 #if defined(SKEIN_8WAY)

-static skein512_8way_context skein512_8way_ctx __attribute__ ((aligned (64)));
-static uint32_t skein2_8way_vdata[20*8] __attribute__ ((aligned (64)));
+ static __thread skein512_8way_context skein512_8way_ctx
+                                             __attribute__ ((aligned (64)));

-int skein2_8way_prehash( struct work *work )
-{
-    mm512_bswap32_intrlv80_8x64( skein2_8way_vdata, work->data );
-    skein512_8way_prehash64( &skein512_8way_ctx, skein2_8way_vdata );
-    return 1;
-}
-
-/* not used
 void skein2hash_8way( void *output, const void *input )
 {
   uint64_t hash[16*8] __attribute__ ((aligned (128)));
@@ -25,7 +17,6 @@ void skein2hash_8way( void *output, const void *input )
   skein512_8way_final16( &ctx, hash, input + (64*8) );
   skein512_8way_full( &ctx, output, hash, 64 );
 }
-*/

 int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
@@ -45,14 +36,11 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
    const bool bench = opt_benchmark;
    skein512_8way_context ctx;

-    pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( vdata, skein2_8way_vdata, sizeof vdata );
-       memcpy( &ctx, &skein512_8way_ctx, sizeof ctx );
-    pthread_rwlock_unlock( &g_work_lock );
-
+    mm512_bswap32_intrlv80_8x64( vdata, pdata );
    *noncev = mm512_intrlv_blend_32(
                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
+    skein512_8way_prehash64( &ctx, vdata );
    do
    {
       skein512_8way_final16( &ctx, hash, vdata + (16*8) );
@@ -79,18 +67,10 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
 }

 #elif defined(SKEIN_4WAY)
-                                           
-static skein512_4way_context skein512_4way_ctx __attribute__ ((aligned (64)));
-static uint32_t skein2_4way_vdata[20*4] __attribute__ ((aligned (64)));
-                                           
-int skein2_4way_prehash( struct work *work )
-{
-    mm256_bswap32_intrlv80_4x64( skein2_4way_vdata, work->data );
-    skein512_4way_prehash64( &skein512_4way_ctx, skein2_4way_vdata );
-    return 1;
-}   

-/* not used
+static __thread skein512_4way_context skein512_4way_ctx
+                                           __attribute__ ((aligned (64)));
+
 void skein2hash_4way( void *output, const void *input )
 {
   skein512_4way_context ctx;
@@ -100,7 +80,6 @@ void skein2hash_4way( void *output, const void *input )
   skein512_4way_final16( &ctx, hash, input + (64*4) );
   skein512_4way_full( &ctx, output, hash, 64 );
 }
-*/

 int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
@@ -120,11 +99,8 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
    const bool bench = opt_benchmark;
    skein512_4way_context ctx;

-    pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( vdata, skein2_4way_vdata, sizeof vdata );
-       memcpy( &ctx, &skein512_4way_ctx, sizeof ctx );
-    pthread_rwlock_unlock( &g_work_lock );
-
+    mm256_bswap32_intrlv80_4x64( vdata, pdata );
+    skein512_4way_prehash64( &ctx, vdata );
    *noncev = mm256_intrlv_blend_32(
                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
    do 
--- a/algo/x16/hex.c
+++ b/algo/x16/hex.c
@@ -25,7 +25,7 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)

 static __thread x16r_context_overlay hex_ctx;

-int hex_hash( void* output, const void* input, const int thrid )
+int hex_hash( void* output, const void* input, int thrid )
 {
   uint32_t _ALIGN(128) hash[16];
   x16r_context_overlay ctx;
--- a/algo/x16/minotaur.c
+++ b/algo/x16/minotaur.c
@@ -72,7 +72,7 @@ struct TortureGarden

 // Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index
 static int get_hash( void *output, const void *input, TortureGarden *garden,
-	                  unsigned int algo, const int thr_id )
+	                  unsigned int algo, int thr_id )
 {    
 	unsigned char hash[64] __attribute__ ((aligned (64)));
   int rc = 1;
@@ -233,7 +233,7 @@ bool initialize_torture_garden()
 }

 // Produce a 32-byte hash from 80-byte input data
-int minotaur_hash( void *output, const void *input, const int thr_id )
+int minotaur_hash( void *output, const void *input, int thr_id )
 {    
    unsigned char hash[64] __attribute__ ((aligned (64)));
    int rc = 1;
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -19,7 +19,7 @@
 // Perform midstate prehash of hash functions with block size <= 72 bytes,
 // 76 bytes for hash functions that operate on 32 bit data.

-void x16r_8way_do_prehash( void *vdata, const void *pdata )
+void x16r_8way_prehash( void *vdata, void *pdata )
 {
   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));
@@ -106,18 +106,11 @@ void x16r_8way_do_prehash( void *vdata, const void *pdata )
   }
 }

-int x16r_8way_prehash( struct work *work )
-{
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   x16r_8way_do_prehash( x16r_8way_vdata, work->data );
-   return 1;
-}
-
 // Perform the full x16r hash and returns 512 bit intermediate hash.
 // Called by wrapper hash function to optionally continue hashing and
 // convert to final hash.

-int x16r_8way_hash_generic( void* output, const void* input, const int thrid )
+int x16r_8way_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*8] __attribute__ ((aligned (128)));
   uint32_t hash0[20] __attribute__ ((aligned (16)));
@@ -478,7 +471,7 @@ int x16r_8way_hash_generic( void* output, const void* input, const int thrid )

 // x16-r,-s,-rt wrapper called directly by scanhash to repackage 512 bit
 // hash to 256 bit final hash.
-int x16r_8way_hash( void* output, const void* input, const int thrid )
+int x16r_8way_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*8] __attribute__ ((aligned (128)));
   if ( !x16r_8way_hash_generic( hash, input, thrid ) )
@@ -502,6 +495,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -514,16 +508,27 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,

   if ( bench )   ptarget[7] = 0x0cff;

-   pthread_rwlock_rdlock( &g_work_lock );
-      memcpy( vdata, x16r_8way_vdata, sizeof vdata );
-   pthread_rwlock_unlock( &g_work_lock );
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );

+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+
+      if ( opt_debug && !thr_id )
+          applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
+   }
+
+   x16r_8way_prehash( vdata, pdata );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
   do
   {
-      if( algo_gate.hash( hash, vdata, thr_id ) );
+      if( x16r_8way_hash( hash, vdata, thr_id ) );
      for ( int i = 0; i < 8; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
@@ -541,7 +546,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,

 #elif defined (X16R_4WAY)

-void x16r_4way_do_prehash( void *vdata, const void *pdata )
+void x16r_4way_prehash( void *vdata, void *pdata )
 {
   uint32_t vdata2[20*4] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));
@@ -622,14 +627,7 @@ void x16r_4way_do_prehash( void *vdata, const void *pdata )
   }
 }

-int x16r_4way_prehash( struct work *work )
-{
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   x16r_4way_do_prehash( x16r_4way_vdata, work->data );
-   return 1;
-}
-
-int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
+int x16r_4way_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*4] __attribute__ ((aligned (128)));
   uint32_t hash0[20] __attribute__ ((aligned (32)));
@@ -637,14 +635,13 @@ int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
   uint32_t hash2[20] __attribute__ ((aligned (32)));
   uint32_t hash3[20] __attribute__ ((aligned (32)));
   x16r_4way_context_overlay ctx;
+   memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
   void *in2 = (void*) hash2;
   void *in3 = (void*) hash3;
   int size = 80;

-   memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
-
   dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );

   for ( int i = 0; i < 16; i++ )
@@ -908,7 +905,7 @@ int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
   return 1;
 }

-int x16r_4way_hash( void* output, const void* input, const int thrid )
+int x16r_4way_hash( void* output, const void* input, int thrid )
 {
   uint8_t hash[64*4] __attribute__ ((aligned (64)));
   if ( !x16r_4way_hash_generic( hash, input, thrid ) )
@@ -927,6 +924,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -939,15 +937,25 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,

   if ( bench )  ptarget[7] = 0x0cff;

-   pthread_rwlock_rdlock( &g_work_lock );
-      memcpy( vdata, x16r_4way_vdata, sizeof vdata );
-   pthread_rwlock_unlock( &g_work_lock );
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );

+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
+   }
+
+   x16r_4way_prehash( vdata, pdata );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
-      if ( algo_gate.hash( hash, vdata, thr_id ) );
+      if ( x16r_4way_hash( hash, vdata, thr_id ) );
      for ( int i = 0; i < 4; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -1,44 +1,26 @@
 #include "x16r-gate.h"
 #include "algo/sha/sha256d.h"

-char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = {0};
+__thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };

-void (*x16r_gate_get_hash_order) ( const struct work *, char * ) = NULL;
+void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL;

 #if defined (X16R_8WAY)

-x16r_8way_context_overlay x16r_ctx;
-uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64)));
+__thread x16r_8way_context_overlay x16r_ctx;

 #elif defined (X16R_4WAY)

-x16r_4way_context_overlay x16r_ctx;
-uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64)));
-
+__thread x16r_4way_context_overlay x16r_ctx;

 #endif

-#if defined (X16RV2_8WAY)
+__thread x16r_context_overlay x16_ctx;

-x16rv2_8way_context_overlay x16rv2_ctx;

-#elif defined (X16RV2_4WAY)
-
-x16rv2_4way_context_overlay x16rv2_ctx;
-
-#endif
-
-x16r_context_overlay x16_ctx;
-uint32_t x16r_edata[24] __attribute__ ((aligned (32)));
-
-void x16r_get_hash_order( const struct work *work, char *hash_order )
+void x16r_getAlgoString( const uint8_t* prevblock, char *output )
 {
-   char *sptr = hash_order;
-   const uint32_t *pdata = work->data;
-   uint8_t prevblock[16];
-   ((uint32_t*)prevblock)[0] = bswap_32( pdata[1] );
-   ((uint32_t*)prevblock)[1] = bswap_32( pdata[2] );
-
+   char *sptr = output;
   for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
   {
      uint8_t b = (15 - j) >> 1; // 16 first ascii hex chars (lsb in uint256)
@@ -50,51 +32,38 @@ void x16r_get_hash_order( const struct work *work, char *hash_order )
      sptr++;
   }
   *sptr = '\0';
-
-   if ( !opt_quiet )
-      applog( LOG_INFO, "Hash order %s", x16r_hash_order );
 }
-   
-void x16s_get_hash_order( const struct work *work, char *hash_order )
+
+void x16s_getAlgoString( const uint8_t* prevblock, char *output )
 {
-   const uint32_t *pdata = work->data;
-   uint8_t prevblock[16];
-   ((uint32_t*)prevblock)[0] = bswap_32( pdata[1] );
-   ((uint32_t*)prevblock)[1] = bswap_32( pdata[2] );
-   strcpy( hash_order, "0123456789ABCDEF" );
+   strcpy( output, "0123456789ABCDEF" );
   for ( int i = 0; i < 16; i++ )
   {
      uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed
      uint8_t algoDigit = (i & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
      int offset = algoDigit;
      // insert the nth character at the front
-      char oldVal = hash_order[ offset ];
+      char oldVal = output[offset];
      for( int j = offset; j-- > 0; )
-         hash_order[ j+1 ] = hash_order[ j ];
-      hash_order[ 0 ] = oldVal;
+         output[j+1] = output[j];
+      output[0] = oldVal;
   }
-
-   if ( !opt_quiet )
-      applog( LOG_INFO, "Hash order %s", x16r_hash_order );
 }

 bool register_x16r_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->prehash   = (void*)&x16r_8way_prehash;
  gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
-  gate->prehash   = (void*)&x16r_4way_prehash;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
-  gate->prehash   = (void*)&x16r_prehash;
  gate->hash      = (void*)&x16r_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
-  x16r_gate_get_hash_order = (void*)&x16r_get_hash_order;
+  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
 };
@@ -102,20 +71,17 @@ bool register_x16r_algo( algo_gate_t* gate )
 bool register_x16rv2_algo( algo_gate_t* gate )
 {
 #if defined (X16RV2_8WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->prehash   = (void*)&x16rv2_8way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rv2_8way;
  gate->hash      = (void*)&x16rv2_8way_hash;
 #elif defined (X16RV2_4WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_4way;
-  gate->prehash   = (void*)&x16rv2_4way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rv2_4way;
  gate->hash      = (void*)&x16rv2_4way_hash;
 #else
-  gate->scanhash  = (void*)&scanhash_x16r;
-  gate->prehash   = (void*)&x16rv2_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rv2;
  gate->hash      = (void*)&x16rv2_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
-  x16r_gate_get_hash_order = (void*)&x16r_get_hash_order;
+  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
 };
@@ -124,19 +90,16 @@ bool register_x16s_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->prehash   = (void*)&x16r_8way_prehash;
  gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
-  gate->prehash   = (void*)&x16r_4way_prehash;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
-  gate->prehash   = (void*)&x16r_prehash;
  gate->hash      = (void*)&x16r_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
-  x16r_gate_get_hash_order = (void*)&x16s_get_hash_order;
+  x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
 };
@@ -145,33 +108,30 @@ bool register_x16s_algo( algo_gate_t* gate )
 //
 //   X16RT

-void x16rt_get_hash_order( const struct work * work, char * hash_order )
-{   
-   uint32_t _ALIGN(64) timehash[8*8];
-   const uint32_t ntime = bswap_32( work->data[17] );
-   const int32_t masked_ntime = ntime & 0xffffff80;
-   uint8_t* data = (uint8_t*)timehash;
-   char *sptr = hash_order;

-   sha256d( (unsigned char*)timehash, (const unsigned char*)( &masked_ntime ),
-             sizeof( masked_ntime ) );
+void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
+{   // Passes the masked timestamp through sha256d with timeHash as first argument.
+    int32_t maskedTime = timeStamp & 0xffffff80;   // clears the low 7 bits
+    sha256d( (unsigned char*)timeHash, (const unsigned char*)( &maskedTime ),
+             sizeof( maskedTime ) );   // input is the 4-byte masked value
+}

-   for ( uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
-   {
+void x16rt_getAlgoString( const uint32_t *timeHash, char *output)
+{
+   char *sptr = output;
+   uint8_t* data = (uint8_t*)timeHash;
+
+   for (uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++) {
      uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
      uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;

-      if ( algoDigit >= 10 )
-         sprintf( sptr, "%c", 'A' + (algoDigit - 10) );
+      if (algoDigit >= 10)
+         sprintf(sptr, "%c", 'A' + (algoDigit - 10));
      else
-         sprintf( sptr, "%u", (uint32_t) algoDigit );
+         sprintf(sptr, "%u", (uint32_t) algoDigit);
      sptr++;
   }
   *sptr = '\0';
-
-   if ( !opt_quiet )
-      applog( LOG_INFO, "Hash order %s, ntime %08x, time hash %08x",
-                         hash_order, ntime, timehash );
 }

 void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
@@ -262,19 +222,15 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 bool register_x16rt_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->prehash   = (void*)&x16r_8way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_4way;
-  gate->prehash   = (void*)&x16r_4way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
-  gate->scanhash  = (void*)&scanhash_x16r;
-  gate->prehash   = (void*)&x16r_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  opt_target_factor = 256.0;
  return true;
@@ -283,20 +239,16 @@ bool register_x16rt_algo( algo_gate_t* gate )
 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->prehash   = (void*)&x16r_8way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_4way;
-  gate->prehash   = (void*)&x16r_4way_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
-  gate->scanhash  = (void*)&scanhash_x16r;
-  gate->prehash   = (void*)&x16r_prehash;
+  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16r_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
-  x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
  gate->build_extraheader = (void*)&veil_build_extraheader;
  opt_target_factor = 256.0;
  return true;
@@ -323,23 +275,20 @@ bool register_hex_algo( algo_gate_t* gate )
 bool register_x21s_algo( algo_gate_t* gate )
 {
 #if defined (X16R_8WAY)
-  gate->scanhash          = (void*)&scanhash_x16r_8way;
-  gate->prehash           = (void*)&x16r_8way_prehash;
+  gate->scanhash          = (void*)&scanhash_x21s_8way;
  gate->hash              = (void*)&x21s_8way_hash;
  gate->miner_thread_init = (void*)&x21s_8way_thread_init;
 #elif defined (X16R_4WAY)
-  gate->scanhash          = (void*)&scanhash_x16r_4way;
-  gate->prehash           = (void*)&x16r_4way_prehash;
+  gate->scanhash          = (void*)&scanhash_x21s_4way;
  gate->hash              = (void*)&x21s_4way_hash;
  gate->miner_thread_init = (void*)&x21s_4way_thread_init;
 #else
-  gate->scanhash          = (void*)&scanhash_x16r;
-  gate->prehash           = (void*)&x16r_prehash;
+  gate->scanhash          = (void*)&scanhash_x21s;
  gate->hash              = (void*)&x21s_hash;
  gate->miner_thread_init = (void*)&x21s_thread_init;
 #endif
  gate->optimizations  = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
-  x16r_gate_get_hash_order = (void*)&x16s_get_hash_order;
+  x16_r_s_getAlgoString   = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
 };
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -21,7 +21,6 @@
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/sha/sph_sha2.h"
-#include "algo/tiger/sph_tiger.h"

 #if defined(__AES__)
 #include "algo/echo/aes_ni/hash_api.h"
@@ -58,11 +57,13 @@

  #define X16R_8WAY   1
  #define X16RV2_8WAY 1
+  #define X16RT_8WAY  1
  #define X21S_8WAY   1

 #elif defined(__AVX2__) && defined(__AES__)

  #define X16RV2_4WAY 1
+  #define X16RT_4WAY  1
  #define X21S_4WAY   1
  #define X16R_4WAY   1

@@ -88,29 +89,23 @@ enum x16r_Algo {
        X16R_HASH_FUNC_COUNT
 };

+extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];

-//extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
-extern char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
-
-
-extern void (*x16r_gate_get_hash_order) ( const struct work *, char * );
-
-// x16r, x16rv2
-void x16r_get_hash_order( const struct work *, char * );
-// x16s, x21s
-void x16s_get_hash_order( const struct work *, char * );
-// x16rt
-void x16rt_get_hash_order( const struct work *, char * );
+extern void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
+void x16r_getAlgoString( const uint8_t *prevblock, char *output );
+void x16s_getAlgoString( const uint8_t *prevblock, char *output );
+void x16rt_getAlgoString( const uint32_t *timeHash, char *output );

+void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash );

 bool register_x16r_algo( algo_gate_t* gate );
 bool register_x16rv2_algo( algo_gate_t* gate );
 bool register_x16s_algo( algo_gate_t* gate );
 bool register_x16rt_algo( algo_gate_t* gate );
-bool register_hex_algo( algo_gate_t* gate );
-bool register_x21s_algo( algo_gate_t* gate );
+bool register_hex__algo( algo_gate_t* gate );
+bool register_x21s__algo( algo_gate_t* gate );

-// x16r, x16s, x16rt
+// x16r, x16s
 #if defined(X16R_8WAY)

 union _x16r_8way_context_overlay
@@ -141,15 +136,15 @@ union _x16r_8way_context_overlay

 typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;

-extern x16r_8way_context_overlay x16r_ctx;
-extern uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64)));
+extern __thread x16r_8way_context_overlay x16r_ctx;

-void x16r_8way_do_prehash( void *, const void * );
-int x16r_8way_prehash( struct work * );
-int x16r_8way_hash_generic( void *, const void *, const int );
-int x16r_8way_hash( void *, const void *, const int );
+void x16r_8way_prehash( void *, void * );
+int x16r_8way_hash_generic( void *, const void *, int );
+int x16r_8way_hash( void *, const void *, int );
 int scanhash_x16r_8way( struct work *, uint32_t ,
                        uint64_t *, struct thr_info * );
+extern __thread x16r_8way_context_overlay x16r_ctx;
+

 #elif defined(X16R_4WAY)

@@ -182,15 +177,14 @@ union _x16r_4way_context_overlay

 typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;

-extern x16r_4way_context_overlay x16r_ctx;
-extern uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64)));
+extern __thread x16r_4way_context_overlay x16r_ctx;

-void x16r_4way_do_prehash( void *, const void * );
-int x16r_4way_prehash( struct work * );
-int x16r_4way_hash_generic( void *, const void *, const int );
-int x16r_4way_hash( void *, const void *, const int );
+void x16r_4way_prehash( void *, void * );
+int x16r_4way_hash_generic( void *, const void *, int );
+int x16r_4way_hash( void *, const void *, int );
 int scanhash_x16r_4way( struct work *, uint32_t,
                        uint64_t *, struct thr_info * );
+extern __thread x16r_4way_context_overlay x16r_ctx;

 #endif

@@ -223,113 +217,80 @@ union _x16r_context_overlay

 typedef union _x16r_context_overlay x16r_context_overlay;

-extern x16r_context_overlay x16_ctx;
-extern uint32_t x16r_edata[24] __attribute__ ((aligned (32)));
+extern __thread x16r_context_overlay x16_ctx;

-void x16r_do_prehash( const void * );
-int x16r_prehash( const struct work * );
-int x16r_hash_generic( void *, const void *, const int );
-int x16r_hash( void *, const void *, const int );
+void x16r_prehash( void *, void * );
+int x16r_hash_generic( void *, const void *, int );
+int x16r_hash( void *, const void *, int );
 int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * );

 // x16Rv2
 #if defined(X16RV2_8WAY)

-union _x16rv2_8way_context_overlay
-{
-    blake512_8way_context   blake;
-    bmw512_8way_context     bmw;
-    skein512_8way_context   skein;
-    jh512_8way_context      jh;
-    keccak512_8way_context  keccak;
-    luffa_4way_context      luffa;
-    cubehashParam           cube;
-    simd_4way_context       simd;
-    hamsi512_8way_context   hamsi;
-    hashState_fugue         fugue;
-    shabal512_8way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_8way_context     sha512;
-    sph_tiger_context       tiger;
-#if defined(__VAES__)
-    groestl512_4way_context groestl;
-    shavite512_4way_context shavite;
-    echo_4way_context       echo;
-#else
-    hashState_groestl       groestl;
-    shavite512_context      shavite;
-    hashState_echo          echo;
-#endif
-} __attribute__ ((aligned (64)));
-
-typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
-extern x16rv2_8way_context_overlay x16rv2_ctx;
-
-int x16rv2_8way_prehash( struct work * );
-int x16rv2_8way_hash( void *state, const void *input, const int thrid );
-//int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
-//                          uint64_t *hashes_done, struct thr_info *mythr );
+int x16rv2_8way_hash( void *state, const void *input, int thrid );
+int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );

 #elif defined(X16RV2_4WAY)

-union _x16rv2_4way_context_overlay
-{
-    blake512_4way_context   blake;
-    bmw512_4way_context     bmw;
-#if defined(__VAES__)
-    groestl512_2way_context groestl;
-    shavite512_2way_context shavite;
-    echo_2way_context       echo;
+int x16rv2_4way_hash( void *state, const void *input, int thrid );
+int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
 #else
-    hashState_groestl       groestl;
-    shavite512_context      shavite;
-    hashState_echo          echo;
+
+int x16rv2_hash( void *state, const void *input, int thr_id );
+int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr );
+
 #endif
-    skein512_4way_context   skein;
-    jh512_4way_context      jh;
-    keccak512_4way_context  keccak;
-    luffa_2way_context      luffa;
-    cubehashParam           cube;
-    simd_2way_context       simd;
-    hamsi512_4way_context   hamsi;
-    hashState_fugue         fugue;
-    shabal512_4way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_4way_context     sha512;
-    sph_tiger_context       tiger;
-};

-typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
-extern x16rv2_4way_context_overlay x16rv2_ctx;
+// x16rt, veil
+#if defined(X16R_8WAY)

-int x16rv2_4way_hash( void *state, const void *input, const int thrid );
-int x16rv2_4way_prehash( struct work * );
+//void x16rt_8way_hash( void *state, const void *input );
+int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined(X16R_4WAY)
+
+//void x16rt_4way_hash( void *state, const void *input );
+int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );

 #else

-int x16rv2_hash( void *state, const void *input, const int thr_id );
-int x16rv2_prehash( const struct work * );
+//void x16rt_hash( void *state, const void *input );
+int scanhash_x16rt( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr );

 #endif

 // x21s
 #if defined(X16R_8WAY)

-int x21s_8way_hash( void *state, const void *input, const int thrid );
+int x21s_8way_hash( void *state, const void *input, int thrid );
+int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_8way_thread_init();

 #elif defined(X16R_4WAY)

-int x21s_4way_hash( void *state, const void *input, const int thrid );
+int x21s_4way_hash( void *state, const void *input, int thrid );
+int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_4way_thread_init();

 #else

-int x21s_hash( void *state, const void *input, const int thr_id );
+int x21s_hash( void *state, const void *input, int thr_id );
+int scanhash_x21s( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_thread_init();

 #endif

+//void hex_hash( void *state, const void *input );
 int scanhash_hex( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );

--- a/algo/x16/x16r.c
+++ b/algo/x16/x16r.c
@@ -10,7 +10,7 @@
 #include <stdlib.h>
 #include <string.h>

-void x16r_do_prehash( const void *edata )
+void x16r_prehash( void *edata, void *pdata )
 {
   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
@@ -48,7 +48,7 @@ void x16r_do_prehash( const void *edata )
   }
 }

-int x16r_hash_generic( void* output, const void* input, const int thrid )
+int x16r_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t _ALIGN(128) hash[16];
   x16r_context_overlay ctx;
@@ -192,15 +192,7 @@ int x16r_hash_generic( void* output, const void* input, const int thrid )
   return true;
 }

-int x16r_prehash( const struct work *work )
-{
-   mm128_bswap32_80( x16r_edata, work->data );
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   x16r_do_prehash( x16r_edata );  
-   return 1;
-}
-
-int x16r_hash( void* output, const void* input, const int thrid )
+int x16r_hash( void* output, const void* input, int thrid )
 {  
   uint8_t hash[64] __attribute__ ((aligned (64)));
   if ( !x16r_hash_generic( hash, input, thrid ) )
@@ -213,8 +205,8 @@ int x16r_hash( void* output, const void* input, const int thrid )
 int scanhash_x16r( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t _ALIGN(32) hash32[8];
-   uint32_t _ALIGN(32) edata[20];
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -224,14 +216,26 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;
   if ( bench )  ptarget[7] = 0x0cff;

-   pthread_rwlock_rdlock( &g_work_lock );
-      memcpy( edata, x16r_edata, sizeof edata );
-   pthread_rwlock_unlock( &g_work_lock );
+   mm128_bswap32_80( edata, pdata );
+
+   // Refresh the hash order only when ntime changes; the cached value and
+   // the comparison must use the same (byte swapped) representation.
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = swab32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+           applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+
+   x16r_prehash( edata, pdata );

   do
   {
      edata[19] = nonce;
-      if ( algo_gate.hash( hash32, edata, thr_id ) )
+      if ( x16r_hash( hash32, edata, thr_id ) )
      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( nonce );
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -0,0 +1,121 @@
+#include "x16r-gate.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined (X16R_8WAY)
+
+// 8-way x16rt scanhash: hashes 8 nonces per pass through x16r_8way_hash,
+// starting at work->data[19], and submits each lane that passes valid_hash.
+// Stores the next nonce in work->data[19] and the count in *hashes_done.
+int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[16*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) timeHash[8*8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   const int thr_id = mythr->id;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+
+   if ( bench )   ptarget[7] = 0x0cff;
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
+   if ( s_ntime != masked_ntime )
+   {
+      x16rt_getTimeHash( masked_ntime, &timeHash );
+      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
+      s_ntime = masked_ntime;
+      // timeHash is an array: log its first word to match the %08x format.
+      if ( !thr_id )
+          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
+                            x16r_hash_order, bswap_32( pdata[17] ), timeHash[0] );
+   }
+
+   x16r_8way_prehash( vdata, pdata );
+   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
+                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
+   do
+   {
+      if ( x16r_8way_hash( hash, vdata, thr_id ) )
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
+      {
+         pdata[19] = bswap_32( n+i );
+         submit_solution( work, hash+(i<<3), mythr );
+      }
+      *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (X16R_4WAY)
+
+// 4-way x16rt scanhash: hashes 4 nonces per pass through x16r_4way_hash,
+// starting at work->data[19], and submits each lane that passes valid_hash.
+// Stores the next nonce in work->data[19] and the count in *hashes_done.
+int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[4*16] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) timeHash[4*8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+
+   if ( bench )  ptarget[7] = 0x0cff;
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
+   if ( s_ntime != masked_ntime )
+   {
+      x16rt_getTimeHash( masked_ntime, &timeHash );
+      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
+      s_ntime = masked_ntime;
+      // timeHash is an array: log its first word to match the %08x format.
+      if ( !thr_id )
+          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
+                            x16r_hash_order, bswap_32( pdata[17] ), timeHash[0] );
+   }
+
+   x16r_4way_prehash( vdata, pdata );
+   *noncev = mm256_intrlv_blend_32(
+                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
+   do
+   {
+      if ( x16r_4way_hash( hash, vdata, thr_id ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
+      {
+         pdata[19] = bswap_32( n+i );
+         submit_solution( work, hash+(i<<3), mythr );
+      }
+      *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );
+      n += 4;
+   } while ( (  n < last_nonce ) && !(*restart) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#endif
--- a/algo/x16/x16rt.c
+++ b/algo/x16/x16rt.c
@@ -0,0 +1,58 @@
+#include "x16r-gate.h"
+
+#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
+
+// Scalar x16rt scanhash: tries nonces from work->data[19] up to max_nonce and
+// submits each hash that passes valid_hash (never in benchmark mode).
+// Stores the next nonce in work->data[19] and the count in *hashes_done.
+int scanhash_x16rt( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) edata[20];
+   uint32_t _ALIGN(64) timeHash[8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const int thr_id = mythr->id;
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+   if ( bench )  ptarget[7] = 0x0cff;
+
+   mm128_bswap32_80( edata, pdata );
+
+   // Rebuild the hash order only when ntime (low 7 bits cleared) changes.
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80;
+   if ( s_ntime != masked_ntime )
+   {
+      x16rt_getTimeHash( masked_ntime, &timeHash );
+      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
+      s_ntime = masked_ntime;
+      // timeHash is an array: log its first word to match the %08x format.
+      if ( opt_debug && !thr_id )
+          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
+                        x16r_hash_order, swab32( pdata[17] ), timeHash[0] );
+   }
+
+   x16r_prehash( edata, pdata );
+
+   do
+   {
+      edata[19] = nonce;
+      if ( x16r_hash( hash32, edata, thr_id ) )
+      if ( valid_hash( hash32, ptarget ) && !bench )
+      {
+         pdata[19] = bswap_32( nonce );
+         submit_solution( work, hash32, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !(*restart) );
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce;
+   return 0;
+}
+
+#endif  // !defined(X16R_8WAY) && !defined(X16R_4WAY)
+
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -12,73 +12,37 @@

 #if defined (X16RV2_8WAY)

-void x16rv2_8way_do_prehash( void *vdata, void *pdata )
+union _x16rv2_8way_context_overlay
 {
-   uint32_t vdata32[20*8] __attribute__ ((aligned (64)));
-   uint32_t edata[20] __attribute__ ((aligned (64)));
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;
+    cubehashParam           cube;
+    simd_4way_context       simd;
+    hamsi512_8way_context   hamsi;
+    hashState_fugue         fugue;
+    shabal512_8way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_8way_context     sha512;
+    sph_tiger_context       tiger;
+#if defined(__VAES__)
+    groestl512_4way_context groestl;
+    shavite512_4way_context shavite;
+    echo_4way_context       echo;
+#else
+    hashState_groestl       groestl;
+    shavite512_context      shavite;
+    hashState_echo          echo;
+#endif
+} __attribute__ ((aligned (64)));

-   const char elem = x16r_hash_order[0];
-   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
+static __thread x16rv2_8way_context_overlay x16rv2_ctx;

-   switch ( algo )
-   {
-      case JH:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         jh512_8way_init( &x16rv2_ctx.jh );
-         jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
-      break;
-      case KECCAK:
-      case LUFFA:
-      case SHA_512:
-         mm128_bswap32_80( edata, pdata );
-         sph_tiger_init( &x16rv2_ctx.tiger );
-         sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      case SKEIN:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         skein512_8way_init( &x16rv2_ctx.skein );
-         skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
-      break;
-      case CUBEHASH:
-         mm128_bswap32_80( edata, pdata );
-         cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
-         cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      case HAMSI:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         hamsi512_8way_init( &x16rv2_ctx.hamsi );
-         hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
-      break;
-      case SHABAL:
-         mm256_bswap32_intrlv80_8x32( vdata32, pdata );
-         shabal512_8way_init( &x16rv2_ctx.shabal );
-         shabal512_8way_update( &x16rv2_ctx.shabal, vdata32, 64 );
-         rintrlv_8x32_8x64( vdata, vdata32, 640 );
-      break;
-      case WHIRLPOOL:
-         mm128_bswap32_80( edata, pdata );
-         sph_whirlpool_init( &x16rv2_ctx.whirlpool );
-         sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      default:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-   }
-}
-
-int x16rv2_8way_prehash( struct work *work )
-{
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   x16rv2_8way_do_prehash( x16r_8way_vdata, work->data );
-   return 1;
-}
-
-int x16rv2_8way_hash( void* output, const void* input, const int thrid )
+int x16rv2_8way_hash( void* output, const void* input, int thrid )
 {
   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
   uint32_t hash0[24] __attribute__ ((aligned (32)));
@@ -593,28 +557,50 @@ int x16rv2_8way_hash( void* output, const void* input, const int thrid )
   return 1;
 }

-#elif defined (X16RV2_4WAY)
-
-// Pad the 24 bytes tiger hash to 64 bytes
-inline void padtiger512( uint32_t* hash )
+int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
 {
-  for ( int i = 6; i < 16; i++ ) hash[i] = 0;
-}
-
-void x16rv2_4way_do_prehash( void *vdata, void *pdata )
-{
-   uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash[16*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (64)));
+   uint32_t bedata1[2] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   uint32_t n = first_nonce;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   const int thr_id = mythr->id;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;

+   if ( bench ) ptarget[7] = 0x0cff;
+
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
+   const uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+
+   // Do midstate prehash on hash functions with block size <= 64 bytes.
   const char elem = x16r_hash_order[0];
   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
   switch ( algo )
   {
      case JH:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         jh512_4way_init( &x16rv2_ctx.jh );
-         jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+         jh512_8way_init( &x16rv2_ctx.jh );
+         jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
      break;
      case KECCAK:
      case LUFFA:
@@ -622,45 +608,100 @@ void x16rv2_4way_do_prehash( void *vdata, void *pdata )
         mm128_bswap32_80( edata, pdata );
         sph_tiger_init( &x16rv2_ctx.tiger );
         sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );
      break;
      case SKEIN:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         skein512_4way_prehash64( &x16r_ctx.skein, vdata );
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+         skein512_8way_init( &x16rv2_ctx.skein );
+         skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
      break;
      case CUBEHASH:
         mm128_bswap32_80( edata, pdata );
         cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
         cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );
      break;
      case HAMSI:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         hamsi512_4way_init( &x16rv2_ctx.hamsi );
-         hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+         hamsi512_8way_init( &x16rv2_ctx.hamsi );
+         hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
      break;
      case SHABAL:
-         mm128_bswap32_intrlv80_4x32( vdata32, pdata );
-         shabal512_4way_init( &x16rv2_ctx.shabal );
-         shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
-         rintrlv_4x32_4x64( vdata, vdata32, 640 );
+         mm256_bswap32_intrlv80_8x32( vdata2, pdata );
+         shabal512_8way_init( &x16rv2_ctx.shabal );
+         shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 );
+         rintrlv_8x32_8x64( vdata, vdata2, 640 );
      break;
      case WHIRLPOOL:
         mm128_bswap32_80( edata, pdata );
         sph_whirlpool_init( &x16rv2_ctx.whirlpool );
         sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );
      break;
      default:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
   }
-}   
+   
+   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
+                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
+   do
+   {
+      if ( x16rv2_8way_hash( hash, vdata, thr_id ) )
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
+      {
+         pdata[19] = bswap_32( n+i );
+         submit_solution( work, hash+(i<<3), mythr );
+      }
+      *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}

-int x16rv2_4way_prehash( struct work *work )
+#elif defined (X16RV2_4WAY)
+
+union _x16rv2_4way_context_overlay
 {
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   x16rv2_4way_do_prehash( x16r_4way_vdata, work->data );
-   return 1;
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+#if defined(__VAES__)
+    groestl512_2way_context groestl;
+    shavite512_2way_context shavite;
+    echo_2way_context       echo;
+#else
+    hashState_groestl       groestl;
+    shavite512_context      shavite;
+    hashState_echo          echo;
+#endif
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    luffa_2way_context      luffa;
+    cubehashParam           cube;
+    simd_2way_context       simd;
+    hamsi512_4way_context   hamsi;
+    hashState_fugue         fugue;
+    shabal512_4way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_4way_context     sha512;
+    sph_tiger_context       tiger;
+};
+typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
+
+static __thread x16rv2_4way_context_overlay x16rv2_ctx;
+
+// Pad the 24 bytes tiger hash to 64 bytes
+inline void padtiger512( uint32_t* hash )
+{
+  for ( int i = 6; i < 16; i++ ) hash[i] = 0;
 }

 int x16rv2_4way_hash( void* output, const void* input, int thrid )
@@ -1007,4 +1048,107 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
   return 1;
 }

+int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,          // scan nonces four at a time, submitting any valid share
+                        uint64_t *hashes_done, struct thr_info *mythr)    // stores the number of nonces scanned in *hashes_done, returns 0
+{
+   uint32_t hash[4*16] __attribute__ ((aligned (64)));      // output of x16rv2_4way_hash, read as 4 results of 8 words
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));     // interleaved header handed to x16rv2_4way_hash
+   uint32_t vdata32[20*4] __attribute__ ((aligned (64)));   // 4x32 interleave, only needed by the SHABAL prehash
+   uint32_t edata[20];                                      // byte-swapped header for the scalar prehashes
+   uint32_t bedata1[2];                                     // byte-swapped pdata[1..2], input of the hash order string
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;               // stop one 4-nonce batch short of max_nonce
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id; 
+    __m256i  *noncev = (__m256i*)vdata + 9;                 // vector of vdata that carries the four nonces
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+
+   if ( bench )  ptarget[7] = 0x0fff;
+   
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;           // ntime the cached hash order was built for
+   const uint32_t ntime = bswap_32(pdata[17]);
+   if ( s_ntime != ntime )
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+
+   // Do midstate prehash on hash functions with block size <= 64 bytes.
+   const char elem = x16r_hash_order[0];                    // only the first function of the order is prehashed
+   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';   // hex digit character -> algorithm index
+   switch ( algo )
+   {
+      case JH:
+         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+         jh512_4way_init( &x16rv2_ctx.jh );
+         jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
+      break;
+      case KECCAK:
+      case LUFFA:
+      case SHA_512:                                         // these three share a tiger prehash of the first 64 bytes
+         mm128_bswap32_80( edata, pdata );
+         sph_tiger_init( &x16rv2_ctx.tiger );
+         sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+      break;
+      case SKEIN:
+         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+         skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );   // was x16r_ctx: keep the midstate in x16rv2_ctx like every other case
+      break;
+      case CUBEHASH:
+         mm128_bswap32_80( edata, pdata );
+         cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
+         cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+      break;
+      case HAMSI:
+         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+         hamsi512_4way_init( &x16rv2_ctx.hamsi );
+         hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
+      break;
+      case SHABAL:
+         mm128_bswap32_intrlv80_4x32( vdata32, pdata );
+         shabal512_4way_init( &x16rv2_ctx.shabal );
+         shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
+         rintrlv_4x32_4x64( vdata, vdata32, 640 );          // re-interleave so vdata has the same layout as the other cases
+      break;
+      case WHIRLPOOL:
+         mm128_bswap32_80( edata, pdata );
+         sph_whirlpool_init( &x16rv2_ctx.whirlpool );
+         sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+      break;
+      default:                                              // no prehash, just build the interleaved header
+         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   }
+
+   *noncev = mm256_intrlv_blend_32(                         // insert nonces n..n+3 into the nonce vector
+                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
+
+   do
+   {
+      if ( x16rv2_4way_hash( hash, vdata, thr_id ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )   // never submit in benchmark mode
+      {
+         pdata[19] = bswap_32( n+i );
+         submit_solution( work, hash+(i<<3), mythr );
+      }
+      *noncev = _mm256_add_epi32( *noncev,                  // add 4 to the upper 32 bits of each 64-bit element
+                                  m256_const1_64( 0x0000000400000000 ) );
+      n += 4;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+   pdata[19] = n;                                           // report where scanning stopped
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
 #endif
--- a/algo/x16/x16rv2.c
+++ b/algo/x16/x16rv2.c
@@ -43,16 +43,9 @@ inline void padtiger512(uint32_t* hash) {
   for (int i = (24/4); i < (64/4); i++) hash[i] = 0;
 }

-// no prehash
-int x16rv2_prehash( const struct work *work )
+int x16rv2_hash( void* output, const void* input, int thrid )
 {
-   x16r_gate_get_hash_order( work, x16r_hash_order );
-   return 1;
-}
-
-int x16rv2_hash( void* output, const void* input, const int thrid )
-{
-   uint32_t _ALIGN(32) hash[16];
+   uint32_t _ALIGN(128) hash[16];
   x16rv2_context_overlay ctx;
   void *in = (void*) input;
   int size = 80;
@@ -177,4 +170,52 @@ int x16rv2_hash( void* output, const void* input, const int thrid )
   return 1;
 }

+int scanhash_x16rv2( struct work *work, uint32_t max_nonce,             // scan nonces one at a time, submitting any valid share
+                   uint64_t *hashes_done, struct thr_info *mythr )      // stores the number of nonces scanned in *hashes_done, returns 0
+{
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) edata[20];                          // byte-swapped copy of the 80 byte header
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const int thr_id = mythr->id;  
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+
+   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );   // swap all 20 header words, 4 at a time
+   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;           // raw pdata[17] the cached hash order was built for
+   if ( s_ntime != pdata[17] )
+   {
+      uint32_t ntime = swab32(pdata[17]);                   // swapped value is only used for the log line
+      x16_r_s_getAlgoString( (const uint8_t*) (&edata[1]), x16r_hash_order );
+      s_ntime = pdata[17];                                  // store what is compared above; the swapped value never matched
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)",
+                                 x16r_hash_order, ntime );
+   }
+
+   if ( bench )   ptarget[7] = 0x0cff;
+
+   do
+   {
+      edata[19] = nonce;
+      if ( x16rv2_hash( hash32, edata, thr_id ) )
+      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )   // never submit in benchmark mode
+      {
+         pdata[19] = bswap_32( nonce );
+         submit_solution( work, hash32, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !(*restart) );
+   pdata[19] = nonce;                                       // report where scanning stopped
+   *hashes_done = pdata[19] - first_nonce;
+   return 0;
+}
+
 #endif
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -30,7 +30,7 @@ union _x21s_8way_context_overlay

 typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;

-int x21s_8way_hash( void* output, const void* input, const int thrid )
+int x21s_8way_hash( void* output, const void* input, int thrid )
 {
   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
   uint8_t shash[64*8] __attribute__ ((aligned (64)));
@@ -129,6 +129,66 @@ int x21s_8way_hash( void* output, const void* input, const int thrid )
   return 1;
 }

+int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,          // scan nonces eight at a time, submitting any valid share
+                        uint64_t *hashes_done, struct thr_info *mythr)  // stores the number of nonces scanned in *hashes_done, returns 0
+{
+   uint32_t hash[16*8] __attribute__ ((aligned (128)));    // output of x21s_8way_hash for all 8 lanes
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));    // vectorised header filled by x16r_8way_prehash
+   uint32_t *hash7 = &hash[7<<3];                          // hash7[lane] is compared against Htarg as a quick filter
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));   // one lane extracted from hash for the full check
+   uint32_t bedata1[2] __attribute__((aligned(64)));       // byte-swapped pdata[1..2], input of the hash order string
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];                      // captured before the benchmark override below
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 16;             // loop bound; note the batch size is 8
+   const int thr_id = mythr->id;
+    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+
+   if ( bench )   ptarget[7] = 0x0cff;
+
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;          // byte-swapped ntime the cached hash order was built for
+   uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )                                 // rebuild the hash order only when ntime changes
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+
+   x16r_8way_prehash( vdata, pdata );
+   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(      // insert nonces n..n+7 into the nonce vector
+                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
+   do
+   {
+      if ( x21s_8way_hash( hash, vdata, thr_id ) )
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( hash7[lane] <= Htarg ) )              // cheap one-word test before extracting the lane
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )   // never submit in benchmark mode
+         {
+             pdata[19] = bswap_32( n + lane );
+             submit_solution( work, lane_hash, mythr );
+         }
+      }
+      *noncev = _mm512_add_epi32( *noncev,                 // add 8 to the upper 32 bits of each 64-bit element
+                                  m512_const1_64( 0x0000000800000000 ) );
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+   pdata[19] = n;                                          // report where scanning stopped
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
 bool x21s_8way_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
@@ -155,7 +215,7 @@ union _x21s_4way_context_overlay

 typedef union _x21s_4way_context_overlay x21s_4way_context_overlay;

-int x21s_4way_hash( void* output, const void* input, const int thrid )
+int x21s_4way_hash( void* output, const void* input, int thrid )
 {
   uint32_t vhash[16*4] __attribute__ ((aligned (64)));
   uint8_t  shash[64*4] __attribute__ ((aligned (64)));
@@ -231,6 +291,58 @@ int x21s_4way_hash( void* output, const void* input, const int thrid )
   return 1;
 }

+int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,          // scan nonces four at a time, submitting any valid share
+                        uint64_t *hashes_done, struct thr_info *mythr)  // stores the number of nonces scanned in *hashes_done, returns 0
+{
+   uint32_t hash[16*4] __attribute__ ((aligned (64)));     // output of x21s_4way_hash, read as 4 results of 8 words
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));    // vectorised header filled by x16r_4way_prehash
+   uint32_t bedata1[2] __attribute__((aligned(64)));       // byte-swapped pdata[1..2], input of the hash order string
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;              // stop one 4-nonce batch short of max_nonce
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id; 
+   const bool bench = opt_benchmark;
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   if ( bench )  ptarget[7] = 0x0cff;
+ 
+   bedata1[0] = bswap_32( pdata[1] );
+   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;          // byte-swapped ntime the cached hash order was built for
+   uint32_t ntime = bswap_32( pdata[17] );
+   if ( s_ntime != ntime )                                 // rebuild the hash order only when ntime changes
+   {
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+
+   x16r_4way_prehash( vdata, pdata );
+   *noncev = mm256_intrlv_blend_32(                        // insert nonces n..n+3 into the nonce vector
+                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
+   do
+   {
+      if ( x21s_4way_hash( hash, vdata, thr_id ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )   // never submit in benchmark mode
+      {
+         pdata[19] = bswap_32( n+i );
+         submit_solution( work, hash+(i<<3), mythr );
+      }
+      *noncev = _mm256_add_epi32( *noncev,                 // add 4 to the upper 32 bits of each 64-bit element
+                                  m256_const1_64( 0x0000000400000000 ) );
+      n += 4;
+   } while ( likely( (  n < last_nonce ) && !(*restart) ) );
+   pdata[19] = n;                                          // report where scanning stopped
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
 bool x21s_4way_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
--- a/algo/x16/x21s.c
+++ b/algo/x16/x21s.c
@@ -27,7 +27,7 @@ union _x21s_context_overlay
 };
 typedef union _x21s_context_overlay x21s_context_overlay;

-int x21s_hash( void* output, const void* input, const int thrid )
+int x21s_hash( void* output, const void* input, int thrid )
 {
   uint32_t _ALIGN(128) hash[16];
   x21s_context_overlay ctx;
@@ -57,6 +57,50 @@ int x21s_hash( void* output, const void* input, const int thrid )
   return 1;
 }

+int scanhash_x21s( struct work *work, uint32_t max_nonce,               // scan nonces one at a time, submitting any valid share
+                   uint64_t *hashes_done, struct thr_info *mythr )      // stores the number of nonces scanned in *hashes_done, returns 0
+{
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) edata[20];                          // byte-swapped copy of the 80 byte header
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const int thr_id = mythr->id;
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+   if ( bench )  ptarget[7] = 0x0cff;
+
+   mm128_bswap32_80( edata, pdata );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;           // raw pdata[17] the cached hash order was built for
+   if ( s_ntime != pdata[17] )
+   {
+      uint32_t ntime = swab32(pdata[17]);                   // swapped value is only used for the log line
+      x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
+      s_ntime = pdata[17];                                  // store what is compared above; the swapped value never matched
+      if ( opt_debug && !thr_id )
+          applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+   }
+
+   x16r_prehash( edata, pdata );
+
+   do
+   {
+      edata[19] = nonce;
+      if ( x21s_hash( hash32, edata, thr_id ) )
+      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )   // never submit in benchmark mode
+      {
+         pdata[19] = bswap_32( nonce );
+         submit_solution( work, hash32, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !(*restart) );
+   pdata[19] = nonce;                                       // report where scanning stopped
+   *hashes_done = pdata[19] - first_nonce;
+   return 0;
+}
+
 bool x21s_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -257,6 +257,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
   const __m512i eight = m512_const1_64( 8 );
   const bool bench = opt_benchmark;

+   // convert LE32 to LE64
   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
   edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
   edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
@@ -470,6 +471,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
   const __m256i four = m256_const1_64( 4 );
   const bool bench = opt_benchmark;

+   // convert LE32 to LE64
   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
   edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
   edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
--- a/algo/yespower/yespower-gate.c
+++ b/algo/yespower/yespower-gate.c
@@ -31,26 +31,8 @@

 yespower_params_t yespower_params;

-// master g_work 
-sha256_context yespower_sha256_prehash_ctx;
-uint32_t _ALIGN(64) yespower_endiandata[20];
-
-// local work
 __thread sha256_context sha256_prehash_ctx;

-
-int yespower_sha256_prehash( struct work *work )
-{
-   uint32_t *pdata = work->data;
-
-   for ( int k = 0; k < 19; k++ )
-      be32enc( &yespower_endiandata[k], pdata[k] );
-
-   sha256_ctx_init( &yespower_sha256_prehash_ctx );
-   sha256_update( &yespower_sha256_prehash_ctx, yespower_endiandata, 64 );
-
-   return 1;
-}
 // YESPOWER

 int yespower_hash( const char *input, char *output, uint32_t len, int thrid )
@@ -71,15 +53,14 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;

-//   pthread_rwlock_rdlock( &g_work_lock );
-
-   memcpy( endiandata, yespower_endiandata, sizeof endiandata );
-   memcpy( &sha256_prehash_ctx, &yespower_sha256_prehash_ctx, sizeof sha256_prehash_ctx );
-
-//   pthread_rwlock_unlock( &g_work_lock );
-
+   for ( int k = 0; k < 19; k++ )
+      be32enc( &endiandata[k], pdata[k] );
   endiandata[19] = n;

+   // do sha256 prehash
+   sha256_ctx_init( &sha256_prehash_ctx );
+   sha256_update( &sha256_prehash_ctx, endiandata, 64 );
+
   do {
      if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) )
      if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )
@@ -159,7 +140,6 @@ bool register_yespower_algo( algo_gate_t* gate )

  gate->optimizations = SSE2_OPT | SHA_OPT;
  gate->scanhash      = (void*)&scanhash_yespower;
-  gate->prehash       = (void*)&yespower_sha256_prehash;
  gate->hash          = (void*)&yespower_hash;
  opt_target_factor = 65536.0;
  return true;
@@ -174,7 +154,6 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
  yespower_params.perslen = 0;
  gate->optimizations = SSE2_OPT | SHA_OPT;
  gate->scanhash      = (void*)&scanhash_yespower;
-  gate->prehash       = (void*)&yespower_sha256_prehash;
  gate->hash          = (void*)&yespower_hash;
  opt_target_factor = 65536.0;
  return true;
@@ -186,7 +165,6 @@ bool register_yescrypt_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->prehash       = (void*)&yespower_sha256_prehash;
   yespower_params.version = YESPOWER_0_5;
   opt_target_factor = 65536.0;

@@ -220,7 +198,6 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->prehash       = (void*)&yespower_sha256_prehash;
   yespower_params.version = YESPOWER_0_5;
   yespower_params.N       = 2048;
   yespower_params.r       = 8;
@@ -234,7 +211,6 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->prehash       = (void*)&yespower_sha256_prehash;
   yespower_params.version = YESPOWER_0_5;
   yespower_params.N       = 4096;
   yespower_params.r       = 16;
@@ -248,7 +224,6 @@ bool register_yescryptr32_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->prehash       = (void*)&yespower_sha256_prehash;
   yespower_params.version = YESPOWER_0_5;
   yespower_params.N       = 4096;
   yespower_params.r       = 32;
--- a/algo/yespower/yespower.h
+++ b/algo/yespower/yespower.h
@@ -80,8 +80,6 @@ extern yespower_params_t yespower_params;

 extern __thread sha256_context sha256_prehash_ctx;

-int yespower_sha256_prehash( struct work *work );
-
 /**
 * yespower_init_local(local):
 * Initialize the thread-local (RAM) data structure.  Actual memory allocation
--- a/4355
+++ b/4355
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.21.3])
+AC_INIT([cpuminer-opt], [3.22.2])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -3,7 +3,7 @@
 * Copyright 2012-2014 pooler
 * Copyright 2014 Lucas Jones
 * Copyright 2014-2016 Tanguy Pruvot
- * Copyright 2016-2021 Jay D Dee
+ * Copyright 2016-2023 Jay D Dee
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
@@ -37,7 +37,7 @@
 #include <curl/curl.h>
 #include <jansson.h>
 #include <openssl/sha.h>
-#include <mm_malloc.h>
+//#include <mm_malloc.h>
 #include "sysinfos.c"
 #include "algo/sha/sha256d.h"

@@ -121,7 +121,6 @@ static uint64_t opt_affinity = 0xFFFFFFFFFFFFFFFFULL;  // default, use all cores
 int opt_priority = 0;  // deprecated
 int num_cpus = 1;
 int num_cpugroups = 1;  // For Windows
-#define max_cpus 256   // max for affinity
 char *rpc_url = NULL;
 char *rpc_userpass = NULL;
 char *rpc_user, *rpc_pass;
@@ -224,8 +223,7 @@ char*  lp_id;

 static void   workio_cmd_free(struct workio_cmd *wc);

-// array mapping thread to cpu
-static uint8_t thread_affinity_map[ max_cpus ];
+static int *thread_affinity_map;

 // display affinity mask graphically
 static void format_affinity_mask( char *mask_str, uint64_t mask )
@@ -432,20 +430,18 @@ static bool work_decode( const json_t *val, struct work *work )
    if ( unlikely( !algo_gate.work_decode( work ) ) )
        return false;

-    if ( !allow_mininginfo )
-        net_diff = algo_gate.calc_network_diff( work );
-    else
-        net_diff = hash_to_diff( work->target );
-
-    work->targetdiff = net_diff;
-    stratum_diff = last_targetdiff = work->targetdiff;
+    // many of these aren't used solo.
+    net_diff =
+    work->targetdiff = 
+    stratum_diff =
+    last_targetdiff = hash_to_diff( work->target );
    work->sharediff = 0;
    algo_gate.decode_extra_data( work, &net_blocks );

    return true;
 }

-// good alternative for wallet mining, difficulty and net hashrate
+// Only used for net_hashrate with GBT/getwork, data is from previous block.
 static const char *info_req =
 "{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n";

@@ -471,17 +467,14 @@ static bool get_mininginfo( CURL *curl, struct work *work )
   // "networkhashps": 56475980
   if ( res )
   {
-      // net_diff is a global that is set from the work hash target by
-      // both getwork and GBT. Don't overwrite it, define a local to override
-      // the global.
-      double net_diff = 0.;
+      double difficulty = 0.;
  		json_t *key = json_object_get( res, "difficulty" );
   	if ( key )
      {
 	   	if ( json_is_object( key ) )
 		   	key = json_object_get( key, "proof-of-work" );
 		   if ( json_is_real( key ) )
-			   net_diff = json_real_value( key );
+			   difficulty = json_real_value( key );
 	   }

      key = json_object_get( res, "networkhashps" );
@@ -498,12 +491,13 @@ static bool get_mininginfo( CURL *curl, struct work *work )
 		  	net_blocks = json_integer_value( key );

      if ( opt_debug )
-         applog(LOG_INFO,"Mining info: diff %.5g, net_hashrate %f, height %d",
-                              net_diff, net_hashrate, net_blocks );
-      
+         applog( LOG_INFO,"getmininginfo: difficulty %.5g, networkhashps %.5g, blocks %d", difficulty, net_hashrate, net_blocks );
+
      if ( !work->height )
      {
 	      // complete missing data from getwork
+         if ( opt_debug )
+            applog( LOG_DEBUG, "work height set by getmininginfo" );
 	      work->height = (uint32_t) net_blocks + 1;
 	      if ( work->height > g_work.height )
            restart_threads();
@@ -535,9 +529,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
   json_t *tmp, *txa;
   bool rc = false;
   int i, n;
-
-// Segwit BEGIN
   bool segwit = false;
+
   tmp = json_object_get( val, "rules" );
   if ( tmp && json_is_array( tmp ) )
   {
@@ -555,8 +548,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
         }
      }
   }
-// Segwit END
-   
+
   tmp = json_object_get( val, "mutable" );
   if ( tmp && json_is_array( tmp ) )
   {
@@ -638,7 +630,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
         goto out;
      }
   }
-   
+
   /* find count and size of transactions */
   txa = json_object_get(val, "transactions" );
   if ( !txa || !json_is_array( txa ) )
@@ -713,12 +705,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      cbtx[41] = cbtx_size - 42; /* scriptsig length */
      le32enc( (uint32_t *)( cbtx+cbtx_size ), 0xffffffff ); /* sequence */
      cbtx_size += 4;
-
-// Segwit BEGIN
-      //cbtx[cbtx_size++] = 1; /* out-counter */
-        cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
-// Segwit END
-
+      cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
      le32enc( (uint32_t *)( cbtx+cbtx_size) , (uint32_t)cbvalue ); /* value */
      le32enc( (uint32_t *)( cbtx+cbtx_size+4 ), cbvalue >> 32 );
      cbtx_size += 8;
@@ -726,7 +713,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      memcpy( cbtx+cbtx_size, pk_script, pk_script_size );
      cbtx_size += (int) pk_script_size;

-// Segwit BEGIN
       if ( segwit )
       {
          unsigned char (*wtree)[32] = calloc(tx_count + 2, 32);
@@ -761,12 +747,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
            for ( i = 0; i < n; i++ )
               sha256d( wtree[i], wtree[2*i], 64 );
         }
-         memset( wtree[1], 0, 32 );  /* witness reserved value = 0 */
+         memset( wtree[1], 0, 32 );  // witness reserved value = 0
         sha256d( cbtx+cbtx_size, wtree[0], 64 );
         cbtx_size += 32;
         free( wtree );
      }
-// Segwit END

      le32enc( (uint32_t *)( cbtx+cbtx_size ), 0 ); /* lock time */
      cbtx_size += 4;
@@ -785,10 +770,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
            xsig_len += n;
         }
         else
-         {
            applog( LOG_WARNING,
                        "Signature does not fit in coinbase, skipping" );
-         }
      }
      tmp = json_object_get( val, "coinbaseaux" );
      if ( tmp && json_is_object( tmp ) )
@@ -815,8 +798,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      if ( xsig_len )
      {
         unsigned char *ssig_end = cbtx + 42 + cbtx[41];
-         int push_len = cbtx[41] + xsig_len < 76 ? 1 :
-		               cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
+         int push_len = cbtx[41] + xsig_len < 76
+                        ? 1 : cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
         n = xsig_len + push_len;
         memmove( ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41] );
         cbtx[41] += n;
@@ -843,7 +826,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      const char *tx_hex = json_string_value( json_object_get( tmp, "data" ) );
      const int tx_size = tx_hex ? (int) ( strlen( tx_hex ) / 2 ) : 0;

-// Segwit BEGIN      
      if ( segwit )
      {
         const char *txid = json_string_value( json_object_get( tmp, "txid" ) );
@@ -856,8 +838,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      }
      else
      {
-// Segwit END
-
         unsigned char *tx = (uchar*) malloc( tx_size );
         if ( !tx_hex || !hex2bin( tx, tx_hex, tx_size ) )
         {
@@ -867,10 +847,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
         }
         sha256d( merkle_tree[1 + i], tx, tx_size );
         free( tx );
-
-// Segwit BEGIN      
      }
-// Segwit END

      if ( !submit_coinbase )
         strcat( work->txs, tx_hex );
@@ -888,6 +865,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
         sha256d( merkle_tree[i], merkle_tree[2*i], 64 );
   }

+   work->tx_count = tx_count;
+
   /* assemble block header */
   algo_gate.build_block_header( work, swab32( version ),
                                 (uint32_t*) prevhash, (uint32_t*) merkle_tree,
@@ -900,21 +879,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      goto out;
   }

-// See git issue https://github.com/JayDDee/cpuminer-opt/issues/379    
-#if defined(__AVX2__)
-   if ( opt_debug )
-   {
-      if ( (uint64_t)target % 32 )
-         applog( LOG_ERR, "Misaligned target %p", target );
-      if ( (uint64_t)(work->target) % 32 )
-         applog( LOG_ERR, "Misaligned work->target %p", work->target );
-   }   
-#endif
-
-   for ( i = 0; i < 8; i++ )
-      work->target[7 - i] = be32dec( target + i );
+   // reverse the bytes in target
+   casti_m128i( work->target, 0 ) = mm128_bswap_128( casti_m128i( target, 1 ) );
+   casti_m128i( work->target, 1 ) = mm128_bswap_128( casti_m128i( target, 0 ) );
   net_diff = work->targetdiff = hash_to_diff( work->target );
-   
+
   tmp = json_object_get( val, "workid" );
   if ( tmp )
   {
@@ -1090,12 +1059,11 @@ void report_summary_log( bool force )
   timeval_subtract( &et, &now, &start_time );
   timeval_subtract( &uptime, &total_hashes_time, &session_start );
   
-   double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6;
+   double share_time = (double)et.tv_sec + (double)et.tv_usec * 1e-6;
   double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. );
   double target_diff = exp32 * last_targetdiff;
   double shrate = safe_div( target_diff * (double)(accepts),
                             share_time, 0. );
-//   global_hashrate = ghrate;
   double sess_hrate = safe_div( exp32 * norm_diff_sum,
                                 (double)uptime.tv_sec, 0. );
   double submit_rate = safe_div( (double)submits * 60., share_time, 0. );
@@ -1116,7 +1084,7 @@ void report_summary_log( bool force )
   applog2( LOG_NOTICE, "Periodic Report     %s        %s", et_str, upt_str );
   applog2( LOG_INFO, "Share rate        %.2f/min     %.2f/min",
            submit_rate, safe_div( (double)submitted_share_count*60.,
-              ( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
+              ( (double)uptime.tv_sec + (double)uptime.tv_usec * 1e-6 ), 0. ) );
   applog2( LOG_INFO, "Hash rate       %7.2f%sh/s   %7.2f%sh/s   (%.2f%sh/s)",
            shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );

@@ -1563,7 +1531,6 @@ const char *getwork_req =

 #define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]"

-// Segwit BEGIN
 #define GBT_RULES "[\"segwit\"]"
 static const char *gbt_req =
   "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
@@ -1572,16 +1539,6 @@ const char *gbt_lp_req =
   "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
   GBT_CAPABILITIES ", \"rules\": " GBT_RULES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";

-/*
-static const char *gbt_req =
-	"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
-	GBT_CAPABILITIES "}], \"id\":0}\r\n";
-const char *gbt_lp_req =
-	"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
-	GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
-*/
-// Segwit END
-
 static bool get_upstream_work( CURL *curl, struct work *work )
 {
   json_t *val;
@@ -1656,49 +1613,49 @@ start:
         last_block_height = work->height;
         last_targetdiff = net_diff;

-         applog( LOG_BLUE, "New Block %d, Net Diff %.5g, Ntime %08x",
-                                work->height, net_diff,
+         applog( LOG_BLUE, "New Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
+                                work->height, work->tx_count, net_diff,
                                work->data[ algo_gate.ntime_index ] );
-
-         if ( !opt_quiet )
-         {
-            double miner_hr = 0.;
-            double net_hr = net_hashrate;
-            double nd = net_diff * exp32;
-            char net_hr_units[4] = {0};
-            char miner_hr_units[4] = {0};
-            char net_ttf[32];
-            char miner_ttf[32];
-
-            pthread_mutex_lock( &stats_lock );
-
-            for ( int i = 0; i < opt_n_threads; i++ )
-               miner_hr += thr_hashrates[i];
-            global_hashrate = miner_hr;
-
-            pthread_mutex_unlock( &stats_lock );
-
-            if ( net_hr > 0. )
-               sprintf_et( net_ttf, nd / net_hr );
-            else
-               sprintf( net_ttf, "NA" );
-            if ( miner_hr > 0. )
-               sprintf_et( miner_ttf, nd / miner_hr );
-            else
-               sprintf( miner_ttf, "NA" );
-
-            scale_hash_for_display ( &miner_hr, miner_hr_units );
-            scale_hash_for_display ( &net_hr, net_hr_units );
-            applog2( LOG_INFO,
-                     "Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
-                     miner_hr, miner_hr_units, miner_ttf, net_hr,
-                     net_hr_units, net_ttf );
-         }
-      }  // work->height > last_block_height
+      }
      else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
-         applog( LOG_BLUE, "New Work: Block %d, Net Diff %.5g, Ntime %08x",
-                                      work->height, net_diff,
-                                      work->data[ algo_gate.ntime_index ] );
+         applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
+                                work->height, work->tx_count, net_diff,
+                                work->data[ algo_gate.ntime_index ] );
+       
+      if ( !opt_quiet )
+      {
+         double miner_hr = 0.;
+         double net_hr = net_hashrate;
+         double nd = net_diff * exp32;
+         char net_hr_units[4] = {0};
+         char miner_hr_units[4] = {0};
+         char net_ttf[32];
+         char miner_ttf[32];
+
+         pthread_mutex_lock( &stats_lock );
+
+         for ( int i = 0; i < opt_n_threads; i++ )
+             miner_hr += thr_hashrates[i];
+         global_hashrate = miner_hr;
+
+         pthread_mutex_unlock( &stats_lock );
+
+         if ( net_hr > 0. )
+            sprintf_et( net_ttf, nd / net_hr );
+         else
+            sprintf( net_ttf, "NA" );
+         if ( miner_hr > 0. )
+            sprintf_et( miner_ttf, nd / miner_hr );
+         else
+            sprintf( miner_ttf, "NA" );
+
+         scale_hash_for_display ( &miner_hr, miner_hr_units );
+         scale_hash_for_display ( &net_hr, net_hr_units );
+         applog2( LOG_INFO,
+                  "Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
+                  miner_hr, miner_hr_units, miner_ttf, net_hr,
+                  net_hr_units, net_ttf );
+      }
   }  // rc

   return rc;
@@ -1724,20 +1681,19 @@ static void workio_cmd_free(struct workio_cmd *wc)

 static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
 {
-   struct work *ret_work;
+   struct work *work_heap;
   int failures = 0;

-   ret_work = (struct work*) _mm_malloc( sizeof(*ret_work), 32 );
-   if ( !ret_work )  return false;
-   memset( ret_work, 0, sizeof(*ret_work) );
+   work_heap = calloc( 1, sizeof(struct work) );
+   if ( !work_heap )  return false;

   /* obtain new work from bitcoin via JSON-RPC */
-   while ( !get_upstream_work( curl, ret_work ) )
+   while ( !get_upstream_work( curl, work_heap ) )
   {
      if ( unlikely( ( opt_retries >= 0 ) && ( ++failures > opt_retries ) ) )
      {
         applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" );
-         free( ret_work );
+         free( work_heap );
         return false;
      }

@@ -1748,8 +1704,8 @@ static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
   }

   /* send work to requesting thread */
-   if ( !tq_push(wc->thr->q, ret_work ) )
-      free( ret_work );
+   if ( !tq_push(wc->thr->q, work_heap ) )
+      free( work_heap );

   return true;
 }
@@ -1825,7 +1781,7 @@ static void *workio_thread(void *userdata)
 static bool get_work(struct thr_info *thr, struct work *work)
 {
 	struct workio_cmd *wc;
-	struct work *work_heap;
+   struct work *work_heap;

 	if unlikely( opt_benchmark )
   {
@@ -1850,17 +1806,16 @@ static bool get_work(struct thr_info *thr, struct work *work)
 	wc->thr = thr;
 	/* send work request to workio thread */
 	if (!tq_push(thr_info[work_thr_id].q, wc))
-        {
+   {
 		workio_cmd_free(wc);
 		return false;
 	}
 	/* wait for response, a unit of work */
 	work_heap = (struct work*) tq_pop(thr->q, NULL);
-	if (!work_heap)
-		return false;
-	/* copy returned work into storage provided by caller */
-	memcpy(work, work_heap, sizeof(*work));
-	free(work_heap);
+	if ( !work_heap ) return false;
+   /* copy returned work into storage provided by caller */
+	memcpy( work, work_heap, sizeof(*work) );
+	free( work_heap );
 	return true;
 }

@@ -1910,9 +1865,9 @@ static void update_submit_stats( struct work *work, const void *hash )
 bool submit_solution( struct work *work, const void *hash,
                      struct thr_info *thr )
 {
-   // Job went stale during hashing of a valid share.
-   if ( !opt_quiet && work_restart[ thr->id ].restart )
-      applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
+// Job went stale during hashing of a valid share.
+//   if ( !opt_quiet && work_restart[ thr->id ].restart )
+//      applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
   
   work->sharediff = hash_to_diff( hash );
   if ( likely( submit_work( thr, work ) ) )
@@ -1930,32 +1885,34 @@ bool submit_solution( struct work *work, const void *hash,
     if ( !opt_quiet )
     {
        if ( have_stratum )
+        {
           applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s",
                   submitted_share_count, work->sharediff, work->height,
                   work->job_id );
+           if ( opt_debug && opt_extranonce )
+           {
+              unsigned char *xnonce2str = abin2hex( work->xnonce2,
+                                                    work->xnonce2_len );
+              applog( LOG_INFO, "Xnonce2 %s", xnonce2str );
+              free( xnonce2str );
+           }
+        }
        else
           applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x",
                   submitted_share_count, work->sharediff, work->height,
                   work->data[ algo_gate.ntime_index ] );
-     }

-     if ( opt_debug )
-     {
-        uint32_t* h = (uint32_t*)hash;
-        uint32_t* t = (uint32_t*)work->target;
-        uint32_t* d = (uint32_t*)work->data;
+        if ( opt_debug )
+        {
+           uint32_t* h = (uint32_t*)hash;
+           uint32_t* t = (uint32_t*)work->target;
+           uint32_t* d = (uint32_t*)work->data;

-        unsigned char *xnonce2str = abin2hex( work->xnonce2,
-                                              work->xnonce2_len );
-        applog(LOG_INFO,"Thread %d, Nonce %08x, Xnonce2 %s", thr->id,
-                       work->data[ algo_gate.nonce_index ], xnonce2str );
-        free( xnonce2str );
-        applog(LOG_INFO,"Data[0:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
-        applog(LOG_INFO,"          : %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19]);
-        applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
-                                    h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]);
-        applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
-                                    t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0]);
+           applog( LOG_INFO, "Data[ 0: 9]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
+           applog( LOG_INFO, "Data[10:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19] );
+           applog( LOG_INFO, "Hash[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0] );
+           applog( LOG_INFO, "Targ[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0] );
+        }
     }
     return true;
   }
@@ -2031,33 +1988,6 @@ void set_work_data_big_endian( struct work *work )
        be32enc( work->data + i, work->data[i] );
 }

-// calculate net diff from nbits.
-double std_calc_network_diff( struct work* work )
-{
-   uint32_t nbits = work->data[ algo_gate.nbits_index ];
-   uint32_t shift = nbits & 0xff;
-   uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
-/*
-   // sample for diff 43.281 : 1c05ea29
-   // todo: endian reversed on longpoll could be zr5 specific...
-   int nbits_index = algo_gate.nbits_index;
-   uint32_t nbits = have_longpoll ? work->data[ nbits_index]
-                                  : swab32( work->data[ nbits_index ] );
-   uint32_t bits  = ( nbits & 0xffffff );
-   int16_t  shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
-*/
-
-   int m;
-   long double d = (long double)0x0000ffff / (long double)bits;
-   for ( m = shift; m < 29; m++ )
-       d *= 256.0;
-   for ( m = 29; m < shift; m++ )
-       d /= 256.0;
-   if ( opt_debug_diff )
-      applog(LOG_DEBUG, "net diff: %8f -> shift %u, bits %08x", (double)d, shift, bits);
-   return (double)d;
-}
-
 void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
                     uint32_t *end_nonce_ptr )
 {
@@ -2081,17 +2011,6 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
       ++(*nonceptr);
 }

-bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
-                           int thr_id )
-{
-   if ( have_stratum && !work->data[0] && !opt_benchmark )
-   {
-      sleep(1);
-      return false;
-   }
-   return true;
-}
-
 static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
 {
   bool new_job;
@@ -2108,7 +2027,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
   g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size );
   memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size );
   algo_gate.build_extraheader( g_work, sctx );
-   net_diff = algo_gate.calc_network_diff( g_work );
+   net_diff = nbits_to_diff( g_work->data[ algo_gate.nbits_index ] );
   algo_gate.set_work_data_endian( g_work );
   g_work->height = sctx->block_height;
   g_work->targetdiff = sctx->job.diff
@@ -2122,10 +2041,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
         t++ );

   g_work_time = time(NULL);
-
-   // Do midstate prehash
-   algo_gate.prehash( g_work );
-
   restart_threads();

   pthread_mutex_unlock( &sctx->work_lock );
@@ -2141,15 +2056,18 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
   pthread_mutex_unlock( &stats_lock );

   if ( stratum_diff != sctx->job.diff )
-      applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s",
-                        sctx->job.diff, sctx->block_height, g_work->job_id );
+      applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Tx %d, Job %s",
+                        sctx->job.diff, sctx->block_height,
+                        sctx->job.merkle_count, g_work->job_id );
   else if ( last_block_height != sctx->block_height )
-      applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s",
-                        sctx->block_height, net_diff, g_work->job_id );
+      applog( LOG_BLUE, "New Block %d, Tx %d, Netdiff %.5g, Job %s",
+                        sctx->block_height, sctx->job.merkle_count,
+                        net_diff, g_work->job_id );
   else if ( g_work->job_id && new_job )
-      applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
-                         sctx->block_height, net_diff, g_work->job_id );
-   else if ( opt_debug )
+      applog( LOG_BLUE, "New Work: Block %d, Tx %d, Netdiff %.5g, Job %s",
+                         sctx->block_height, sctx->job.merkle_count,
+                         net_diff, g_work->job_id );
+   else if ( !opt_quiet )
   {
      unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
                                             g_work->xnonce2_len );
@@ -2162,8 +2080,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
   if ( ( stratum_diff != sctx->job.diff )
   || ( last_block_height != sctx->block_height ) )
   {
-      static bool multipool = false;
-      if ( stratum.block_height < last_block_height ) multipool = true;
      if ( unlikely( !session_first_block ) )
         session_first_block = stratum.block_height;
      last_block_height = stratum.block_height;
@@ -2171,58 +2087,47 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
      last_targetdiff   = g_work->targetdiff;
      if ( lowest_share < last_targetdiff )
         lowest_share = 9e99;
+    }

-      if ( !opt_quiet )
-      {
-         applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
-                            net_diff, stratum_diff, g_work->targetdiff );
+    if ( !opt_quiet )
+    {
+       applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
+                          net_diff, stratum_diff, g_work->targetdiff );

-         if ( likely( hr > 0. ) )
-         {
-            double nd = net_diff * exp32;
-            char hr_units[4] = {0};
-            char block_ttf[32];
-            char share_ttf[32];
+       if ( likely( hr > 0. ) )
+       {
+          double nd = net_diff * exp32;
+          char hr_units[4] = {0};
+          char block_ttf[32];
+          char share_ttf[32];
+          static bool multipool = false;
+      
+          if ( stratum.block_height < last_block_height ) multipool = true;
+            
+          sprintf_et( block_ttf, nd / hr );
+          sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
+          scale_hash_for_display ( &hr, hr_units );
+          applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
+                             hr, hr_units, block_ttf, share_ttf );

-            sprintf_et( block_ttf, nd / hr );
-            sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
-            scale_hash_for_display ( &hr, hr_units );
-            applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
-                               hr, hr_units, block_ttf, share_ttf );
-
-            if ( !multipool && last_block_height > session_first_block )
-            {
-               struct timeval now, et;
-               gettimeofday( &now, NULL );
-               timeval_subtract( &et, &now, &session_start );
-               uint64_t net_ttf =
-                    ( last_block_height - session_first_block ) == 0 ? 0
-                    : et.tv_sec / ( last_block_height - session_first_block );
-               if ( net_diff > 0. && net_ttf )
-               {
-                  double net_hr = nd / net_ttf;
-                  char net_hr_units[4] = {0};
-                  scale_hash_for_display ( &net_hr, net_hr_units );
-                  applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
-                                     net_hr, net_hr_units );
-               }
-            }
-         }  // hr > 0
-      } // !quiet
-   }  // new diff/block
-
-/*   
-   if ( new_job && !( opt_quiet || stratum_errors ) )
-   {
-      int mismatch = submitted_share_count - ( accepted_share_count
-                                             + stale_share_count
-                                             + rejected_share_count );
-      if ( mismatch )
-         applog( LOG_INFO,
-                 CL_LBL "%d Submitted share pending, maybe stale" CL_N,
-                 submitted_share_count );
-   }
-*/
+          if ( !multipool && last_block_height > session_first_block )
+          {
+             struct timeval now, et;
+             gettimeofday( &now, NULL );
+             timeval_subtract( &et, &now, &session_start );
+             uint64_t net_ttf = safe_div( et.tv_sec,
+                                 last_block_height - session_first_block, 0 );
+             if ( net_diff > 0. && net_ttf )
+             {
+                double net_hr = safe_div( nd, net_ttf, 0. );
+                char net_hr_units[4] = {0};
+                scale_hash_for_display ( &net_hr, net_hr_units );
+                applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
+                                   net_hr, net_hr_units );
+             }
+          }
+       }  // hr > 0
+    } // !quiet
 }

 static void *miner_thread( void *userdata )
@@ -2344,9 +2249,6 @@ static void *miner_thread( void *userdata )
 		             goto out;
 	             }
                g_work_time = time(NULL);
-
-                // do midstate prehash
-                algo_gate.prehash( &g_work );
                restart_threads();
             }

@@ -2363,9 +2265,6 @@ static void *miner_thread( void *userdata )
       } // do_this_thread
       algo_gate.resync_threads( thr_id, &work );

-       if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
-          continue;
-
       // conditional mining
       if ( unlikely( !wanna_mine( thr_id ) ) )
       {
@@ -3745,7 +3644,6 @@ int main(int argc, char *argv[])
   if ( opt_time_limit )
      time_limit_stop = (unsigned int)time(NULL) + opt_time_limit;

-
   // need to register to get algo optimizations for cpu capabilities
   // but that causes registration logs before cpu capabilities is output.
   // Would need to split register function into 2 parts. First part sets algo
@@ -3874,24 +3772,29 @@ int main(int argc, char *argv[])
 #endif

 #if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
-      if ( !opt_quiet )
-         applog( LOG_INFO, "Found %d CPUs in %d groups", num_cpus, num_cpugroups );
+      if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
+         applog( LOG_INFO, "Found %d CPUs in %d groups",
+                           num_cpus, num_cpugroups );
 #endif
   
-   if ( opt_affinity && num_cpus > max_cpus )
+   const int map_size = opt_n_threads < num_cpus ? num_cpus : opt_n_threads;   
+   thread_affinity_map = malloc( map_size * (sizeof (int)) );
+   if ( !thread_affinity_map )
   {
-      applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
-                            max_cpus );
+      applog( LOG_ERR, "CPU Affinity disabled, memory allocation failed" );
      opt_affinity = 0ULL;
-   }
-   
+   }   
   if ( opt_affinity )
   {
-      for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
+      int active_cpus = 0; // total CPUs available using rolling affinity mask
+      for ( int thr = 0, cpu = 0; thr < map_size; thr++, cpu++ )
      {
         while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;   
         thread_affinity_map[ thr ] = cpu % num_cpus;
+         if ( cpu < num_cpus ) active_cpus++;
      }
+      if ( opt_n_threads > active_cpus )
+         applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
      if ( !opt_quiet )
      {
         char affinity_mask[64];
--- a/miner.h
+++ b/miner.h
@@ -24,6 +24,11 @@

 #endif /* _MSC_VER */

+// Prevent questions from ARM users who don't read the requirements.
+#if !defined(__x86_64__)
+#error "CPU architecture not supported. Consult the requirements for supported CPUs."
+#endif
+
 #include <stdbool.h>
 #include <inttypes.h>
 #include <sys/time.h>
@@ -91,6 +96,19 @@ enum {
   LOG_PINK  = 0x14 };
 #endif

+#define WORK_ALIGNMENT 64
+
+// Use with dynamically allocated memory to guarantee data alignment for
+// large vectors. The physical block size must be extended by alignment number
+// of bytes when allocated. free() should use the physical pointer returned by
+// malloc(), not the aligned pointer. All others should use the logical,
+// aligned, pointer returned by this function.
+static inline void *align_ptr( const void *ptr, const uint64_t alignment )
+{
+  const uint64_t mask = alignment - 1;
+  return (void*)( ( ((const uint64_t)ptr) + mask ) & (~mask) );
+}
+
 extern bool is_power_of_2( int n );

 static inline bool is_windows(void)
@@ -317,7 +335,7 @@ extern void cbin2hex(char *out, const char *in, size_t len);
 void   bin2hex( char *s, const unsigned char *p, size_t len );
 char  *abin2hex( const unsigned char *p, size_t len );
 char  *bebin2hex( const unsigned char *p, size_t len );
-bool   hex2bin( unsigned char *p, const char *hexstr, size_t len );
+bool   hex2bin( unsigned char *p, const char *hexstr, const size_t len );
 bool   jobj_binary( const json_t *obj, const char *key, void *buf,
                    size_t buflen );
 int    varint_encode( unsigned char *p, uint64_t n );
@@ -333,10 +351,7 @@ extern void memrev(unsigned char *p, size_t len);
 // number of hashes.
 //
 //     https://en.bitcoin.it/wiki/Difficulty
-//
 //     hash = diff * 2**32
-//
-// diff_to_hash = 2**32 = 0x100000000 = 4294967296 = exp32;

 #define EXP16 65536.
 #define EXP32 4294967296.
@@ -350,8 +365,9 @@ extern const long double exp160; // 2**160
 bool   fulltest( const uint32_t *hash, const uint32_t *target );
 bool   valid_hash( const void*, const void* );

-double hash_to_diff( const void* );
+extern double hash_to_diff( const void* );
 extern void diff_to_hash( uint32_t*, const double );
+extern double nbits_to_diff( uint32_t );

 double hash_target_ratio( uint32_t* hash, uint32_t* target );
 void   work_set_target_ratio( struct work* work, const void *hash );
@@ -392,20 +408,21 @@ float cpu_temp( int core );

 struct work
 {
-   uint32_t data[48] __attribute__ ((aligned (64)));
-   uint32_t target[8] __attribute__ ((aligned (32)));
+   uint32_t target[8] __attribute__ ((aligned (64)));
+	uint32_t data[48] __attribute__ ((aligned (64)));
 	double targetdiff;
 	double sharediff;
   double stratum_diff;
 	int height;
 	char *txs;
-	char *workid;
+   int tx_count;
+   char *workid;
 	char *job_id;
 	size_t xnonce2_len;
 	unsigned char *xnonce2;
   bool sapling;
   bool stale;
-} __attribute__ ((aligned (64)));
+} __attribute__ ((aligned (WORK_ALIGNMENT)));

 struct stratum_job
 {
@@ -416,7 +433,8 @@ struct stratum_job
 	unsigned char *coinbase;
 	unsigned char *xnonce2;
 	int merkle_count;
-	unsigned char **merkle;
+   int merkle_buf_size;
+   unsigned char **merkle;
 	unsigned char version[4];
 	unsigned char nbits[4];
 	unsigned char ntime[4];
@@ -540,7 +558,6 @@ enum algos {
        ALGO_BMW,        
        ALGO_BMW512,
        ALGO_C11,         
-        ALGO_DECRED,
        ALGO_DEEP,
        ALGO_DMD_GR,
        ALGO_GROESTL,     
@@ -572,9 +589,11 @@ enum algos {
        ALGO_QUBIT,       
        ALGO_SCRYPT,
        ALGO_SHA256D,
+        ALGO_SHA256DT,
        ALGO_SHA256Q,
        ALGO_SHA256T,
        ALGO_SHA3D,
+        ALGO_SHA512256D,
        ALGO_SHAVITE3,    
        ALGO_SKEIN,       
        ALGO_SKEIN2,      
@@ -634,7 +653,6 @@ static const char* const algo_names[] = {
        "bmw",
        "bmw512",
        "c11",
-        "decred",
        "deep",
        "dmd-gr",
        "groestl",
@@ -666,9 +684,11 @@ static const char* const algo_names[] = {
        "qubit",
        "scrypt",
        "sha256d",
+        "sha256dt",
        "sha256q",
        "sha256t",
        "sha3d",
+        "sha512256d",
        "shavite3",
        "skein",
        "skein2",
@@ -795,7 +815,6 @@ Options:\n\
                          bmw           BMW 256\n\
                          bmw512        BMW 512\n\
                          c11           Chaincoin\n\
-                          decred        Blake256r14dcr\n\
                          deep          Deepcoin (DCN)\n\
                          dmd-gr        Diamond\n\
                          groestl       Groestl coin\n\
@@ -829,9 +848,11 @@ Options:\n\
                          scrypt:N      scrypt(N, 1, 1)\n\
                          scryptn2      scrypt(1048576, 1,1)\n\
                          sha256d       Double SHA-256\n\
+                          sha256dt      Modified sha256d (Novo)\n\
                          sha256q       Quad SHA-256, Pyrite (PYE)\n\
                          sha256t       Triple SHA-256, Onecoin (OC)\n\
                          sha3d         Double Keccak256 (BSHA3)\n\
+                          sha512256d    Double SHA-512 (Radiant)\n\
                          shavite3      Shavite3\n\
                          skein         Skein+Sha (Skeincoin)\n\
                          skein2        Double Skein (Woodcoin)\n\
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -93,10 +93,15 @@ static inline uint32_t u32_mov128_32( const __m128i a )
  return n;
 }

-// Equivalent of set1, broadcast integer to all elements.
-#define m128_const_i128( i ) mm128_mov64_128( i )
-#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
-#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
+// Emulate broadcast & insert instructions not available in SSE2
+#define mm128_bcast_i64( i )   _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
+#define mm128_bcast_i32( i )   _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
+
+#define m128_const_i128( i )    mm128_mov64_128( i )
+
+// deprecated
+#define m128_const1_64          mm128_bcast_i64
+#define m128_const1_32          mm128_bcast_i32

 #if defined(__SSE4_1__)

@@ -104,7 +109,7 @@ static inline uint32_t u32_mov128_32( const __m128i a )
 #define m128_const_64( hi, lo ) \
   _mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 )

-#else  // No insert in SSE2
+#else 

 #define m128_const_64  _mm_set_epi64x

@@ -114,12 +119,10 @@ static inline uint32_t u32_mov128_32( const __m128i a )

 #define m128_zero      _mm_setzero_si128()
 #define m128_one_128   mm128_mov64_128( 1 )
-#define m128_one_64    _mm_shuffle_epi32( mm128_mov64_128( 1 ), 0x44 )
-#define m128_one_32    _mm_shuffle_epi32( mm128_mov32_128( 1 ), 0x00 )
-#define m128_one_16    _mm_shuffle_epi32( \
-                                 mm128_mov32_128( 0x00010001 ), 0x00 )
-#define m128_one_8     _mm_shuffle_epi32( \
-                                 mm128_mov32_128( 0x01010101 ), 0x00 )
+#define m128_one_64    mm128_bcast_i64( 1 )
+#define m128_one_32    mm128_bcast_i32( 1 )
+#define m128_one_16    mm128_bcast_i32( 0x00010001 )
+#define m128_one_8     mm128_bcast_i32( 0x01010101 )

 // ASM avoids the need to initialize return variable to avoid compiler warning.
 // Macro abstracts function parentheses to look like an identifier.
@@ -149,7 +152,7 @@ static inline __m128i mm128_neg1_fn()
 // sizing. It's unique.
 //
 // It can:
-//   - zero 32 bit elements of a 128 bit vector.
+//   - zero any number of 32 bit elements of a 128 bit vector.
 //   - extract any 32 bit element from one 128 bit vector and insert the
 //     data to any 32 bit element of another 128 bit vector, or the same vector.
 //   - do both simultaneoulsly.
@@ -162,14 +165,21 @@ static inline __m128i mm128_neg1_fn()
 //    c[5:4] destination element selector
 //    c[7:6] source element selector

-// Convert type and abbreviate name: e"x"tract "i"nsert "m"ask
+// Convert type and abbreviate name: eXtract Insert Mask = XIM
 #define mm128_xim_32( v1, v2, c ) \
   _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
                                    _mm_castsi128_ps( v2 ), c ) )

-// Some examples of simple operations:
+/* Another way to do it with individual arguments.
+#define mm128_xim_32( v1, i1, v2, i2, mask ) \
+   _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
+                                    _mm_castsi128_ps( v2 ), \
+                                    (mask) | ((i1)<<4) | ((i2)<<6) ) )
+*/

-// Insert 32 bit integer into v at element c and return modified v.
+// Examples of simple operations using xim:
+
+// Insert 32 bit integer into v at element c and return updated v.
 static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
                                       const int c )
 {   return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
@@ -178,13 +188,12 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
 static inline uint32_t mm128_extract_32( const __m128i v, const int c )
 {   return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }

-// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
+// Zero 32 bit elements when bit in mask is set.
 static inline __m128i mm128_mask_32( const __m128i v, const int m ) 
 {   return mm128_xim_32( v, v, m ); }

-// Move element i2 of v2 to element i1 of v1. For reference and convenience,
-// it's faster to precalculate the index.
-#define mm128_shuflmov_32( v1, i1, v2, i2 ) \
+// Move element i2 of v2 to element i1 of v1 and return updated v1.
+#define mm128_mov32_32( v1, i1, v2, i2 ) \
  mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )

 #endif  // SSE4_1
@@ -280,7 +289,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

 // Mask making
 // Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
-// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
+// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
 // Effectively a sign test.

 #define mm_movmask_64( v ) \
@@ -385,7 +394,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_rol_var_32( v, c ) \
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )

-/* Not used
+// Cross lane shuffles
+//
 // Limited 2 input shuffle, combines shuffle with blend. The destination low
 // half is always taken from v1, and the high half from v2.
 #define mm128_shuffle2_64( v1, v2, c ) \
@@ -395,19 +405,16 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_shuffle2_32( v1, v2, c ) \
   _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
                                     _mm_castsi128_ps( v2 ), c ) ); 
-*/

-//
 // Rotate vector elements accross all lanes

-#define mm128_swap_64( v )    _mm_shuffle_epi32( v, 0x4e )
-#define mm128_shuflr_64       mm128_swap_64
-#define mm128_shufll_64       mm128_swap_64
+#define mm128_swap_64( v )     _mm_shuffle_epi32( v, 0x4e )
+#define mm128_shuflr_64        mm128_swap_64
+#define mm128_shufll_64        mm128_swap_64

 #define mm128_shuflr_32( v )   _mm_shuffle_epi32( v, 0x39 )
 #define mm128_shufll_32( v )   _mm_shuffle_epi32( v, 0x93 )

-/* Not used
 #if defined(__SSSE3__)

 // Rotate right by c bytes, no SSE2 equivalent.
@@ -415,15 +422,12 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 { return _mm_alignr_epi8( v, v, c ); }

 #endif
-*/

-// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
-// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
-// (unlikely but faster), or when SSSE3 is not available (slower).
+// Rotate data within each 64 bit lane

 #define mm128_swap64_32( v )  _mm_shuffle_epi32( v, 0xb1 )
-#define mm128_shuflr64_32 mm128_swap64_32
-#define mm128_shufll64_32 mm128_swap64_32
+#define mm128_shuflr64_32     mm128_swap64_32
+#define mm128_shufll64_32     mm128_swap64_32

 #if defined(__SSSE3__) && !defined(__AVX512VL__)
  #define mm128_shuflr64_24( v ) \
@@ -441,6 +445,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
  #define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
 #endif

+// Rotate data within each 32 bit lane
+
 #if defined(__SSSE3__) && !defined(__AVX512VL__)
  #define mm128_swap32_16( v ) \
    _mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -448,8 +454,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 #else
  #define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
 #endif
-#define mm128_shuflr32_16 mm128_swap32_16
-#define mm128_shufll32_16 mm128_swap32_16
+#define mm128_shuflr32_16      mm128_swap32_16
+#define mm128_shufll32_16      mm128_swap32_16

 #if defined(__SSSE3__) && !defined(__AVX512VL__)
  #define mm128_shuflr32_8( v ) \
@@ -464,6 +470,10 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )

 #if defined(__SSSE3__)

+#define mm128_bswap_128( v ) \
+   _mm_shuffle_epi8( v, m128_const_64( 0x0001020304050607, \
+                                       0x08090a0b0c0d0e0f ) )
+
 #define mm128_bswap_64( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
                                       0x0001020304050607 ) )
@@ -525,6 +535,9 @@ static inline __m128i mm128_bswap_16( __m128i v )
  return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
 }

+#define mm128_bswap_128( v ) \
+   mm128_swap_64( mm128_bswap_64( v ) )
+
 static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
 {
   d[0] = mm128_bswap_64( s[0] );
@@ -558,12 +571,11 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
   v2 = _mm_xor_si128( v1, v2 ); \
   v1 = _mm_xor_si128( v1, v2 );

-// Concatenate { hi, lo }, rotate right by c elements and return low 128 bits.
-#if defined(__SSSE3__)

-// _mm_alignr_epi32 & _mm_alignr_epi64 are only available with AVX512VL but
-// are emulated here using _mm_alignr_epi8. There are no fast equivalents for
-// 256 bit vectors, though there is no for this functionality.
+// alignr instruction for 32 & 64 bit elements is only available with AVX512
+// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
+
+#if defined(__SSSE3__)

 #define mm128_alignr_64( hi, lo, c )    _mm_alignr_epi8( hi, lo, (c)*8 )
 #define mm128_alignr_32( hi, lo, c )    _mm_alignr_epi8( hi, lo, (c)*4 )
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -15,6 +15,8 @@
 //
 // "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
 // lanes and data can't cross the 128 bit lane boundary.  
+// Full width byte shuffle is available with AVX512VL using the mask version
+// with a full mask (-1). 
 // Instructions that can move data across 128 bit lane boundary incur a
 // performance penalty over those that can't.
 // Some usage of index vectors may be encoded as if full vector shuffles are
@@ -66,31 +68,33 @@ typedef union
 #define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )

 // concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
+
 #define mm256_concat_128( hi, lo ) \
   _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )

+#define mm256_bcast_m128( v ) \
+                 _mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
+#define mm256_bcast_i128( i ) mm256_bcast_m128( mm128_mov64_128( i ) )
+#define mm256_bcast_i64( i )  _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
+#define mm256_bcast_i32( i )  _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
+#define mm256_bcast_i16( i )  _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
+#define mm256_bcast_i8( i )   _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )

 // Equivalent of set, move 64 bit integer constants to respective 64 bit
 // elements.
 static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
                                     const uint64_t i1, const uint64_t i0 )
 {
-  union { __m256i m256i;
-          uint64_t u64[4]; } v;
+  union { __m256i m256i;  uint64_t u64[4]; } v;
  v.u64[0] = i0; v.u64[1] = i1; v.u64[2] = i2; v.u64[3] = i3;
  return v.m256i;
 }

-// Equivalent of set1.
-// 128 bit vector argument
-#define m256_const1_128( v ) \
-   _mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
-// 64 bit integer argument zero extended to 128 bits.
-#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) )
-#define m256_const1_64( i )  _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
-#define m256_const1_32( i )  _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
-#define m256_const1_16( i )  _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
-#define m256_const1_8 ( i )  _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
+// Deprecated
+#define m256_const1_128      mm256_bcast_m128
+#define m256_const1_i128     mm256_bcast_i128
+#define m256_const1_64       mm256_bcast_i64
+#define m256_const1_32       mm256_bcast_i32

 #define m256_const2_64( i1, i0 ) \
  m256_const1_128( m128_const_64( i1, i0 ) )
@@ -99,13 +103,13 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
 // All SIMD constant macros are actually functions containing executable
 // code and therefore can't be used as compile time initializers.

-#define m256_zero      _mm256_setzero_si256()
-#define m256_one_256   mm256_mov64_256( 1 )
-#define m256_one_128   m256_const1_i128( 1 )
-#define m256_one_64    _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) )
-#define m256_one_32    _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) )
-#define m256_one_16    _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) )
-#define m256_one_8     _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
+#define m256_zero         _mm256_setzero_si256()
+#define m256_one_256      mm256_mov64_256( 1 )
+#define m256_one_128      mm256_bcast_i128( 1 )
+#define m256_one_64       mm256_bcast_i64( 1 )
+#define m256_one_32       mm256_bcast_i32( 1 )
+#define m256_one_16       mm256_bcast_i16( 1 )
+#define m256_one_8        mm256_bcast_i8 ( 1 )

 static inline __m256i mm256_neg1_fn()
 {
@@ -116,8 +120,8 @@ static inline __m256i mm256_neg1_fn()
 #define m256_neg1  mm256_neg1_fn()

 // Consistent naming for similar operations.
-#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
-#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
+#define mm128_extr_lo128_256( v )    _mm256_castsi256_si128( v )
+#define mm128_extr_hi128_256( v )    _mm256_extracti128_si256( v, 1 )

 //
 // Memory functions
@@ -239,8 +243,8 @@ static inline __m256i mm256_not( const __m256i v )

 // Mask making
 // Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
-// Create a 64 or 32 bit integer mask from MSB of 64 or 32 bit elements.
-// Effectively a sign test: if (mask[n]) then -1 else  0.
+// Returns 4 or 8 bit integer mask from MSBit of 64 or 32 bit elements.
+// Effectively a sign test.

 #define mm256_movmask_64( v ) \
   _mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
@@ -348,23 +352,27 @@ static inline __m256i mm256_not( const __m256i v )
   _mm256_or_si256( _mm256_slli_epi16( v, c ), \
                    _mm256_srli_epi16( v, 16-(c) ) )

-// Deprecated. Obsolete sm3, the only user, is grandfathered.
+// Deprecated.
 #define mm256_rol_var_32( v, c ) \
   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                    _mm256_srli_epi32( v, 32-(c) ) )

+//
+// Cross lane shuffles
 //
 // Rotate elements accross all lanes.

 // Swap 128 bit elements in 256 bit vector.
 #define mm256_swap_128( v )     _mm256_permute4x64_epi64( v, 0x4e )
-#define mm256_shuflr_128 mm256_swap_128
-#define mm256_shufll_128 mm256_swap_128
+#define mm256_shuflr_128        mm256_swap_128
+#define mm256_shufll_128        mm256_swap_128

 // Rotate 256 bit vector by one 64 bit element
 #define mm256_shuflr_64( v )    _mm256_permute4x64_epi64( v, 0x39 )
 #define mm256_shufll_64( v )    _mm256_permute4x64_epi64( v, 0x93 )

+
+/* Not used
 // Rotate 256 bit vector by one 32 bit element.
 #if defined(__AVX512VL__)

@@ -387,11 +395,11 @@ static inline __m256i mm256_shufll_32( const __m256i v )
                                    0x0000000200000001,  0x0000000000000007 ) )

 #endif
+*/

 //
 // Rotate elements within each 128 bit lane of 256 bit vector.

-/* Not used
 // Limited 2 input shuffle
 #define mm256_shuffle2_64( v1, v2, c ) \
   _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( v1 ), \
@@ -400,7 +408,6 @@ static inline __m256i mm256_shufll_32( const __m256i v )
 #define mm256_shuffle2_32( v1, v2, c ) \
   _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
                                           _mm256_castsi256_ps( v2 ), c ) ); 
-*/

 #define mm256_swap128_64( v )  _mm256_shuffle_epi32( v, 0x4e )
 #define mm256_shuflr128_64 mm256_swap128_64
@@ -412,20 +419,17 @@ static inline __m256i mm256_shufll_32( const __m256i v )
 static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 { return _mm256_alignr_epi8( v, v, c ); }

-// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit
-// rotations for multiples of 8 bits. Uses faster ror/rol instructions when
-// AVX512 is available.
+// 64 bit lanes

-#define mm256_swap64_32( v )   _mm256_shuffle_epi32( v, 0xb1 )
-#define mm256_shuflr64_32 mm256_swap64_32
-#define mm256_shufll64_32 mm256_swap64_32
+#define mm256_swap64_32( v )      _mm256_shuffle_epi32( v, 0xb1 )
+#define mm256_shuflr64_32         mm256_swap64_32
+#define mm256_shufll64_32         mm256_swap64_32

 #if defined(__AVX512VL__)
  #define mm256_shuflr64_24( v )  _mm256_ror_epi64( v, 24 )
 #else
  #define mm256_shuflr64_24( v ) \
-    _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
-                                    0x0a09080f0e0d0c0b, 0x0201000706050403, \
+    _mm256_shuffle_epi8( v, m256_const2_64( \
                                    0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
 #endif

@@ -433,21 +437,21 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
  #define mm256_shuflr64_16( v )  _mm256_ror_epi64( v, 16 )
 #else
  #define mm256_shuflr64_16( v ) \
-    _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
-                                    0x09080f0e0d0c0b0a, 0x0100070605040302, \
+    _mm256_shuffle_epi8( v, m256_const2_64( \
                                    0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
 #endif

+// 32 bit lanes
+
 #if defined(__AVX512VL__)
  #define mm256_swap32_16( v )  _mm256_ror_epi32( v, 16 )
 #else
  #define mm256_swap32_16( v ) \
-    _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
-                                    0x0d0c0f0e09080b0a, 0x0504070601000302, \
+    _mm256_shuffle_epi8( v, m256_const2_64( \
                                    0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
 #endif
-#define mm256_shuflr32_16 mm256_swap32_16
-#define mm256_shufll32_16 mm256_swap32_16
+#define mm256_shuflr32_16       mm256_swap32_16
+#define mm256_shufll32_16       mm256_swap32_16

 #if defined(__AVX512VL__)
  #define mm256_shuflr32_8( v )  _mm256_ror_epi32( v, 8 )
@@ -458,35 +462,24 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
                                    0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
 #endif

-// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
-// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
-// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
-// needed for a shuffle that crosses 128 bit lanes. BSWAP doesn't therefore the
-// AVX2 version will work here. The bswap control vector is coded to work
-// with both versions, bit 4 is ignored in AVX2. 
-
 // Reverse byte order in elements, endian bswap.
 #define mm256_bswap_64( v ) \
   _mm256_shuffle_epi8( v, \
-         m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
-                        0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
+         m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )

 #define mm256_bswap_32( v ) \
   _mm256_shuffle_epi8( v, \
-         m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
-                        0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
+         m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )

 #define mm256_bswap_16( v ) \
   _mm256_shuffle_epi8( v, \
-         m256_const_64( 0x1e1f1c1d1a1b1819, 0x1617141512131011, \
-                        0x0e0f0c0d0a0b0809, 0x0607040502030001, ) )
+         m256_const2_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )

 // Source and destination are pointers, may point to same memory.
 // 8 byte qword * 8 qwords * 4 lanes = 256 bytes
 #define mm256_block_bswap_64( d, s ) do \
 { \
-  __m256i ctl = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
-                               0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
+  __m256i ctl = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
  casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
  casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
  casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -500,8 +493,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 // 4 byte dword * 8 dwords * 8 lanes = 256 bytes
 #define mm256_block_bswap_32( d, s ) do \
 { \
-  __m256i ctl = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
-                               0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+  __m256i ctl = m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
  casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
  casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
  casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -513,8 +505,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 } while(0)

 // swap 256 bit vectors in place.
-// Deprecated, Shabal is the only user and it should be modified to reorder
-// instructions.
+// This should be avoided, it's more efficient to switch references.
 #define mm256_swap512_256( v1, v2 ) \
   v1 = _mm256_xor_si256( v1, v2 ); \
   v2 = _mm256_xor_si256( v1, v2 ); \
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -113,7 +113,17 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
 #define mm512_concat_256( hi, lo ) \
   _mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )

-#define m512_const_128( v3, v2, v1, v0 ) \
+// Work in progress.
+// Modified naming scheme to align more with the opcode mnemonic:
+// m512_const1 becomes mm512_bcast_m[n] or mm512_bcast_i[n], short for
+// broadcast, i indicates integer arg, m is vector. Set1 intrinsics should
+// generally be used for integer data.
+// mm512_const should only be used with immediate integer arguments, use
+// _mm512_set intrinsic instead.
+// mm512_set, mm512_set[n] macros may be defined when no intrinsic exists
+// for either the arg size or arg count.
+
+#define mm512_set_128( v3, v2, v1, v0 ) \
   mm512_concat_256( mm256_concat_128( v3, v2 ), \
                     mm256_concat_128( v1, v0 ) )

@@ -133,29 +143,35 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
  return v.m512i;
 }

+// Broadcast with vector argument is generally more efficient except for
+// integer immediate constants or when data was most recently referenced as
+// integer and is still available in an integer register.
+
+/* not used
 // Equivalent of set1, broadcast lo element to all elements.
 static inline __m512i m512_const1_256( const __m256i v )
 { return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }  
+*/

-#define m512_const1_128( v ) \
-    mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
-// Integer input argument up to 64 bits
-#define m512_const1_i128( i ) \
-    mm512_perm_128( _mm512_castsi128_si512( mm128_mov64_128( i ) ), 0 )
+#define mm512_bcast_m128( v )  mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
+// Low 64 bits only, high 64 bits are zeroed.
+#define mm512_bcast_i128( i )  mm512_bcast_m128( mm128_mov64_128( i ) )
+#define mm512_bcast_i64( i )   _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
+#define mm512_bcast_i32( i )   _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
+#define mm512_bcast_i16( i )   _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
+#define mm512_bcast_i8( i )    _mm512_broadcastb_epi8( mm128_mov32_128( i ) )

-//#define m512_const1_256( v )   _mm512_broadcast_i64x4( v )
-//#define m512_const1_128( v )   _mm512_broadcast_i64x2( v )
-#define m512_const1_64( i )    _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
-#define m512_const1_32( i )    _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
-#define m512_const1_16( i )    _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
-#define m512_const1_8( i )     _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
+// const1 is deprecated, use bcast instead
+#define m512_const1_128   mm512_bcast_m128
+#define m512_const1_i128  mm512_bcast_i128
+#define m512_const1_64    mm512_bcast_i64
+#define m512_const1_32    mm512_bcast_i32

 #define m512_const2_128( v1, v0 ) \
-   m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) )
+   _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 )

 #define m512_const2_64( i1, i0 ) \
-   m512_const1_128( m128_const_64( i1, i0 ) )
-
+   mm512_bcast_m128( m128_const_64( i1, i0 ) )

 static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
                                      const uint64_t i1, const uint64_t i0 )
@@ -179,11 +195,11 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
 #define m512_zero       _mm512_setzero_si512()
 #define m512_one_512    mm512_mov64_512( 1 )
 #define m512_one_256    _mm512_inserti64x4( m512_one_512, m256_one_256, 1 )  
-#define m512_one_128    m512_const1_i128( 1 )
-#define m512_one_64     m512_const1_64( 1 )
-#define m512_one_32     m512_const1_32( 1 )
-#define m512_one_16     m512_const1_16( 1 )
-#define m512_one_8      m512_const1_8( 1 )
+#define m512_one_128    mm512_bcast_i128( (__uint128_t)1 )
+#define m512_one_64     mm512_bcast_i64( (uint64_t)1 )
+#define m512_one_32     mm512_bcast_i32( (uint32_t)1 )
+#define m512_one_16     mm512_bcast_i16( (uint16_t)1 )
+#define m512_one_8      mm512_bcast_i8(  (uint8_t)1 )

 // use asm to avoid compiler warning for unitialized local
 static inline __m512i mm512_neg1_fn()
@@ -193,8 +209,6 @@ static inline __m512i mm512_neg1_fn()
   return a;
 }
 #define m512_neg1 mm512_neg1_fn()                          // 1 clock
-//#define m512_neg1 m512_const1_64( 0xffffffffffffffff )   // 5 clocks
-//#define m512_neg1 _mm512_movm_epi64( 0xff )              // 2 clocks

 //
 // Basic operations without SIMD equivalent
@@ -343,10 +357,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 // 8 lanes of 64 bytes each
 #define mm512_block_bswap_64( d, s ) do \
 { \
-  __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
-                               0x28292a2b2c2d2e2f, 0x2021222324252627, \
-                               0x18191a1b1c1d1e1f, 0x1011121314151617, \
-                               0x08090a0b0c0d0e0f, 0x0001020304050607  ); \
+  const __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
+                                     0x28292a2b2c2d2e2f, 0x2021222324252627, \
+                                     0x18191a1b1c1d1e1f, 0x1011121314151617, \
+                                     0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
  casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
  casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
  casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -360,10 +374,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 // 16 lanes of 32 bytes each
 #define mm512_block_bswap_32( d, s ) do \
 { \
-  __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
-                               0x2c2d2e2f28292a2b, 0x2425262720212223, \
-                               0x1c1d1e1f18191a1b, 0x1415161710111213, \
-                               0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+  const __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
+                                     0x2c2d2e2f28292a2b, 0x2425262720212223, \
+                                     0x1c1d1e1f18191a1b, 0x1415161710111213, \
+                                     0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
  casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
  casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
  casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -409,7 +423,6 @@ static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
 static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 { return _mm512_alignr_epi32( v, v, n ); }

-/* Not used
 #define mm512_shuflr_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                       0x0000001F001E001D, 0x001C001B001A0019, \
@@ -437,7 +450,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
                       0x2E2D2C2B2A292827, 0x262524232221201F, \
                       0x1E1D1C1B1A191817, 0x161514131211100F, \
                       0x0E0D0C0B0A090807, 0x060504030201003F ) )
-*/

 // 256 bit lanes used only by lyra2, move these there
 // Rotate elements within 256 bit lanes of 512 bit vector.
@@ -451,7 +463,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 #define mm512_shuflr256_64( v )     _mm512_permutex_epi64( v, 0x39 )
 #define mm512_shufll256_64( v )     _mm512_permutex_epi64( v, 0x93 )

-/* Not used
+/*  Not used
 // Rotate 256 bit lanes by one 32 bit element
 #define mm512_shuflr256_32( v ) \
   _mm512_permutexvar_epi32( m512_const_64( \
@@ -498,7 +510,18 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 //
 // Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
 
-/* Not used
+#define mm512_swap128_64( v )   _mm512_shuffle_epi32( v, 0x4e )
+#define mm512_shuflr128_64      mm512_swap128_64
+#define mm512_shufll128_64      mm512_swap128_64
+
+// Rotate 128 bit lanes by one 32 bit element
+#define mm512_shuflr128_32( v )    _mm512_shuffle_epi32( v, 0x39 )
+#define mm512_shufll128_32( v )    _mm512_shuffle_epi32( v, 0x93 )
+
+// Rotate 128 bit lanes right by c bytes, versatile and just as fast
+static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
+{  return _mm512_alignr_epi8( v, v, c ); }
+
 // Limited 2 input, 1 output shuffle, combines shuffle with blend.
 // Like most shuffles it's limited to 128 bit lanes and like some shuffles
 // destination elements must come from a specific source arg. 
@@ -509,32 +532,12 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 #define mm512_shuffle2_32( v1, v2, c ) \
   _mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
                                           _mm512_castsi512_ps( v2 ), c ) ); 
-*/

-// These hard coded shuffles exist for consistency with AVX2 & SSE2 where
-// efficient generic versions don't exist.
-// Swap 64 bits in each 128 bit lane
-#define mm512_swap128_64( v )   _mm512_shuffle_epi32( v, 0x4e )
-#define mm512_shuflr128_64  mm512_swap128_64
-#define mm512_shufll128_64  mm512_swap128_64
-
-// Rotate 128 bit lanes by one 32 bit element
-#define mm512_shuflr128_32( v )    _mm512_shuffle_epi32( v, 0x39 )
-#define mm512_shufll128_32( v )    _mm512_shuffle_epi32( v, 0x93 )
-
-/* Not used
-// Rotate right 128 bit lanes by c bytes, efficient generic version of above.
-static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
-{  return _mm512_alignr_epi8( v, v, c ); }
-*/
-
-// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
-// can be done with ror & rol. Defined only for convenience and consistency
-// with AVX2 & SSE2 macros.
+// 64 bit lanes

 #define mm512_swap64_32( v )    _mm512_shuffle_epi32( v, 0xb1 )
-#define mm512_shuflr64_32 mm512_swap64_32
-#define mm512_shufll64_32 mm512_swap64_32
+#define mm512_shuflr64_32       mm512_swap64_32
+#define mm512_shufll64_32       mm512_swap64_32

 #define mm512_shuflr64_24( v )  _mm512_ror_epi64( v, 24 )
 #define mm512_shufll64_24( v )  _mm512_rol_epi64( v, 24 )
@@ -545,12 +548,14 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 #define mm512_shuflr64_8(  v )  _mm512_ror_epi64( v,  8 )
 #define mm512_shufll64_8(  v )  _mm512_rol_epi64( v,  8 )

-#define mm512_swap32_16(   v )  _mm512_ror_epi32( v, 16 )
-#define mm512_shuflr32_16 mm512_swap32_16
-#define mm512_shufll32_16 mm512_swap32_16
+// 32 bit lanes

-#define mm512_shuflr32_8(  v )  _mm512_ror_epi32( v,  8 )
-#define mm512_shufll32_8(  v )  _mm512_rol_epi32( v,  8 )
+#define mm512_swap32_16( v )    _mm512_ror_epi32( v, 16 )
+#define mm512_shuflr32_16       mm512_swap32_16
+#define mm512_shufll32_16       mm512_swap32_16
+
+#define mm512_shuflr32_8( v )   _mm512_ror_epi32( v,  8 )
+#define mm512_shufll32_8( v )   _mm512_rol_epi32( v,  8 )

 #endif // AVX512
 #endif // SIMD_512_H__
--- a/simd-utils/simd-int.h
+++ b/simd-utils/simd-int.h
@@ -55,6 +55,13 @@
 typedef          __int128  int128_t;
 typedef unsigned __int128 uint128_t;

+typedef union
+{
+   uint128_t u128;
+   uint64_t  u64[2];
+   uint32_t  u32[4];
+} __attribute__ ((aligned (16))) u128_ovly;
+
 // Extracting the low bits is a trivial cast.
 // These specialized functions are optimized while providing a
 // consistent interface.
--- a/util.c
+++ b/util.c
@@ -44,28 +44,22 @@
 #include <libgen.h>
 #endif

-//#include "miner.h"
 #include "elist.h"
 #include "algo-gate-api.h"
 #include "algo/sha/sha256d.h"

-//extern pthread_mutex_t stats_lock;
-
-struct data_buffer {
-	void		*buf;
-	size_t		len;
-};
-
-struct upload_buffer {
-	const void	*buf;
-	size_t		len;
-	size_t		pos;
-};
-
 struct header_info {
 	char		*lp_path;
 	char		*reason;
 	char		*stratum_url;
+   size_t	content_length;
+};
+
+struct data_buffer {
+	void			*buf;
+	size_t			len;
+	size_t			allocated;
+	struct header_info	*headers;
 };

 struct tq_ent {
@@ -127,7 +121,6 @@ void applog2( int prio, const char *fmt, ... )
      int len;
 //    struct tm tm;
 //    time_t now = time(NULL);
-
 //    localtime_r(&now, &tm);

      switch ( prio )
@@ -395,67 +388,53 @@ static void databuf_free(struct data_buffer *db)
 static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb,
 			  void *user_data)
 {
-	struct data_buffer *db = (struct data_buffer *) user_data;
+	struct data_buffer *db = user_data;
 	size_t len = size * nmemb;
-	size_t oldlen, newlen;
+	size_t newalloc, reqalloc;
 	void *newmem;
 	static const unsigned char zero = 0;
+	static const size_t max_realloc_increase = 8 * 1024 * 1024;
+	static const size_t initial_alloc = 16 * 1024;

-	oldlen = db->len;
-	newlen = oldlen + len;
+	/* minimum required allocation size */
+	reqalloc = db->len + len + 1;

-	newmem = realloc(db->buf, newlen + 1);
-	if (!newmem)
-		return 0;
+	if (reqalloc > db->allocated) {
+		if (db->len > 0) {
+			newalloc = db->allocated * 2;
+		} else {
+			if (db->headers->content_length > 0)
+				newalloc = db->headers->content_length + 1;
+			else
+				newalloc = initial_alloc;
+		}

-	db->buf = newmem;
-	db->len = newlen;
-	memcpy((uchar*) db->buf + oldlen, ptr, len);
-	memcpy((uchar*) db->buf + newlen, &zero, 1);	/* null terminate */
+		if (db->headers->content_length == 0) {
+			/* limit the maximum buffer increase */
+			if (newalloc - db->allocated > max_realloc_increase)
+				newalloc = db->allocated + max_realloc_increase;
+		}
+
+		/* ensure we have a big enough allocation */
+		if (reqalloc > newalloc)
+			newalloc = reqalloc;
+
+		newmem = realloc(db->buf, newalloc);
+		if (!newmem)
+			return 0;
+
+		db->buf = newmem;
+		db->allocated = newalloc;
+	}
+
+	memcpy(db->buf + db->len, ptr, len); /* append new data */
+	memcpy(db->buf + db->len + len, &zero, 1); /* null terminate */
+
+	db->len += len;

 	return len;
 }

-static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb,
-			     void *user_data)
-{
-	struct upload_buffer *ub = (struct upload_buffer *) user_data;
-	size_t len = size * nmemb;
-
-	if (len > ub->len - ub->pos)
-		len = ub->len - ub->pos;
-
-	if (len) {
-		memcpy(ptr, ((uchar*)ub->buf) + ub->pos, len);
-		ub->pos += len;
-	}
-
-	return len;
-}
-
-#if LIBCURL_VERSION_NUM >= 0x071200
-static int seek_data_cb(void *user_data, curl_off_t offset, int origin)
-{
-	struct upload_buffer *ub = (struct upload_buffer *) user_data;
-	
-	switch (origin) {
-	case SEEK_SET:
-		ub->pos = (size_t) offset;
-		break;
-	case SEEK_CUR:
-		ub->pos += (size_t) offset;
-		break;
-	case SEEK_END:
-		ub->pos = ub->len + (size_t) offset;
-		break;
-	default:
-		return 1; /* CURL_SEEKFUNC_FAIL */
-	}
-
-	return 0; /* CURL_SEEKFUNC_OK */
-}
-#endif
-
 static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
 {
 	struct header_info *hi = (struct header_info *) user_data;
@@ -505,6 +484,9 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
 		val = NULL;
 	}

+	if (!strcasecmp("Content-Length", key))
+		hi->content_length = strtoul(val, NULL, 10);
+
 out:
 	free(key);
 	free(val);
@@ -564,48 +546,38 @@ json_t *json_rpc_call(CURL *curl, const char *url,
 	int rc;
 	long http_rc;
 	struct data_buffer all_data = {0};
-	struct upload_buffer upload_data;
 	char *json_buf;
 	json_error_t err;
 	struct curl_slist *headers = NULL;
-	char len_hdr[64];
 	char curl_err_str[CURL_ERROR_SIZE] = { 0 };
 	long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30;
 	struct header_info hi = {0};

+   all_data.headers = &hi;
 	/* it is assumed that 'curl' is freshly [re]initialized at this pt */

-	if (opt_protocol)
-		curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
+	if (opt_protocol)  curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
 	curl_easy_setopt(curl, CURLOPT_URL, url);
-	if (opt_cert)
-		curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
-//
-        curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
-
+	if (opt_cert)      curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
+   curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
 	curl_easy_setopt(curl, CURLOPT_ENCODING, "");
 	curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
 	curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
 	curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
 	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb);
 	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
-	curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb);
-	curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data);
-#if LIBCURL_VERSION_NUM >= 0x071200
-	curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb);
-	curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data);
-#endif
-	curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
-	if (opt_redirect)
-		curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
+   curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
+	if (opt_redirect)  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
 	curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
 	curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb);
 	curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi);
-	if (opt_proxy) {
+	if (opt_proxy)
+   {
 		curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy);
 		curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type);
 	}
-	if (userpass) {
+	if (userpass)
+   {
 		curl_easy_setopt(curl, CURLOPT_USERPWD, userpass);
 		curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
 	}
@@ -613,23 +585,16 @@ json_t *json_rpc_call(CURL *curl, const char *url,
 	if (flags & JSON_RPC_LONGPOLL)
 		curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
 #endif
-	curl_easy_setopt(curl, CURLOPT_POST, 1);
+   curl_easy_setopt(curl, CURLOPT_POSTFIELDS, rpc_req);

 	if (opt_protocol)
 		applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req);

-	upload_data.buf = rpc_req;
-	upload_data.len = strlen(rpc_req);
-	upload_data.pos = 0;
-	sprintf(len_hdr, "Content-Length: %lu",
-		(unsigned long) upload_data.len);
-
 	headers = curl_slist_append(headers, "Content-Type: application/json");
-	headers = curl_slist_append(headers, len_hdr);
 	headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
 	headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll reject-reason");
-	//headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/
-	//headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/
+	//headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr
+	//headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr

 	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);

@@ -786,18 +751,26 @@ err_out:
 	return cfg;
 }

-// Segwit BEGIN
 void memrev(unsigned char *p, size_t len)
 {
-   unsigned char c, *q;
-   for (q = p + len - 1; p < q; p++, q--) {
-      c = *p;
-      *p = *q;
-      *q = c;
+   if ( len == 32 )
+   {
+      __m128i *pv = (__m128i*)p;    // p may be unaligned: use loadu/storeu
+      __m128i t = mm128_bswap_128( _mm_loadu_si128( pv ) );
+      _mm_storeu_si128( pv, mm128_bswap_128( _mm_loadu_si128( pv+1 ) ) );
+      _mm_storeu_si128( pv+1, t );
+   }
+   else
+   {
+      unsigned char c, *q;
+      for (q = p + len - 1; p < q; p++, q--) 
+      {
+         c = *p;
+         *p = *q;
+         *q = c;
+      }
    }
 }
-// Segwit END
-

 void cbin2hex(char *out, const char *in, size_t len)
 {
@@ -832,32 +805,42 @@ char *bebin2hex(const unsigned char *p, size_t len)
   return s;
 }

-bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
+bool hex2bin( unsigned char *p, const char *hexstr, const size_t len )
 {
-	char hex_byte[3];
-	char *ep;
+	if( hexstr == NULL )	return false;

-	hex_byte[2] = '\0';
-
-	while (*hexstr && len) {
-		if (!hexstr[1]) {
-			applog(LOG_ERR, "hex2bin str truncated");
-			return false;
-		}
-		hex_byte[0] = hexstr[0];
-		hex_byte[1] = hexstr[1];
-		*p = (unsigned char) strtol(hex_byte, &ep, 16);
-		if (*ep) {
-			applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte);
-			return false;
-		}
-		p++;
-		hexstr += 2;
-		len--;
+	size_t hexstr_len = strlen( hexstr );
+	if( ( hexstr_len % 2 ) != 0 )
+   {
+		applog( LOG_ERR, "hex2bin string truncated" );
+		return false;
+	}
+	size_t bin_len = hexstr_len / 2;
+	if ( bin_len > len )
+   {
+		applog( LOG_ERR, "hex2bin buffer too small" );
+		return false;
 	}

-	return(!len) ? true : false;
-/*	return (len == 0 && *hexstr == 0) ? true : false; */
+	memset( p, 0, len );
+	size_t i = 0;
+	while ( i < hexstr_len )
+   {
+		char c = hexstr[i];
+		unsigned char nibble;
+		if      ( c >= '0' && c <= '9' )	 nibble = (c - '0');
+		else if ( c >= 'A' && c <= 'F' )	 nibble = ( 10 + (c - 'A') );
+		else if ( c >= 'a' && c <= 'f' )	 nibble = ( 10 + (c - 'a') );
+		else
+      {
+			applog( LOG_ERR, "hex2bin invalid hex" );
+			return false;
+		}
+		p[(i / 2)] |= (nibble << ( (1 - (i % 2) ) * 4) );
+		i++;
+	}
+
+	return true;
 }

 int varint_encode(unsigned char *p, uint64_t n)
@@ -1339,6 +1322,43 @@ inline bool valid_hash( const void *hash, const void *target )

 #endif 

+// Convert a compact nbits value to a difficulty:
+//    diff = (2**16 - 1) * 256**(29 - shift) / bits
+// shift is the low byte of nbits as passed in, bits is the low 24 bits of
+// the byte-swapped value. The result is logged when opt_debug is set.
+inline double nbits_to_diff( uint32_t nbits )
+{
+   long double diff;
+   uint32_t shift = nbits & 0xff;
+   uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
+   int shift_off = (int)shift - 29;
+
+   // Both 0xffff << 8*13 and bits << 8*12 fit in 120 bits, so uint128 byte
+   // shifts cover 16 <= shift <= 41. Anything else takes the slow path.
+   if ( shift_off >= -13 && shift_off <= 12 ) 
+   {  // fast
+      if ( shift_off == 0 )
+         diff = (long double)0xffff / (long double)bits;
+      else if ( shift_off < 0 )   // shift < 29
+         diff = (long double)( (uint128_t)0xffff << ( (-shift_off) *8 ) ) 
+              / (long double)bits;
+      else // ( shift_off > 0 )   // shift > 29
+         diff =   (long double)0xffff
+                / (long double)( (uint128_t)bits << ( shift_off*8 ) );  
+   }
+   else
+   {  // slow: seed with the unscaled quotient, a zero seed would stay zero.
+      diff = (long double)0xffff / (long double)bits;
+      for ( int m = shift; m < 29; m++ )           diff *= 256.0;
+      for ( int m = 29; m < (int)shift; m++ )      diff /= 256.0;
+   }
+   if ( opt_debug )
+      applog( LOG_INFO, "nbits %08x: shift %u(%d), bits %06x, diff %8g",
+                         nbits, shift, shift_off, bits, (double)diff );
+
+   return (double)diff;
+}
+
 #ifdef WIN32
 #define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK)
 #else
@@ -1507,7 +1527,8 @@ out:
 	return sret;
 }

-#if LIBCURL_VERSION_NUM >= 0x071101
+#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
+//#if LIBCURL_VERSION_NUM >= 0x071101
 static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose,
 	struct curl_sockaddr *addr)
 {
@@ -1575,7 +1596,8 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
 #if LIBCURL_VERSION_NUM >= 0x070f06
 	curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
 #endif
-#if LIBCURL_VERSION_NUM >= 0x071101
+#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
+//#if LIBCURL_VERSION_NUM >= 0x071101
 	curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb);
 	curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock);
 #endif
@@ -1589,7 +1611,10 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
 		return false;
 	}

-#if LIBCURL_VERSION_NUM < 0x071101
+#if LIBCURL_VERSION_NUM >= 0x072d00
+	curl_easy_getinfo(curl, CURLINFO_ACTIVESOCKET, &sctx->sock);
+#elif LIBCURL_VERSION_NUM < 0x071101   
+//#if LIBCURL_VERSION_NUM < 0x071101
 	/* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */
 	curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock);
 #endif
@@ -1885,7 +1910,8 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)

 	// find 0xffff tag
 	p = (uint8_t*) sctx->job.coinbase + 32;
-	m = p + 128;
+   m = p + sctx->job.coinbase_size - 32 - 2;
+//   m = p + 128;
 	while (*p != 0xff && p < m) p++;
 	while (*p == 0xff && p < m) p++;
 	if (*(p-1) == 0xff && *(p-2) == 0xff) {
@@ -1992,23 +2018,41 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
      }
   }

-   if ( merkle_count )
-      merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
-	for ( i = 0; i < merkle_count; i++ )
-   {
-		const char *s = json_string_value( json_array_get( merkle_arr, i ) );
-		if ( !s || strlen(s) != 64 )
-      {
-			while ( i-- ) free( merkle[i] );
-			free( merkle );
-			applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
-			goto out;
-		}
-		merkle[i] = (uchar*) malloc( 32 );
-		hex2bin( merkle[i], s, 32 );
-	}
+   pthread_mutex_lock( &sctx->work_lock );

-	pthread_mutex_lock( &sctx->work_lock );
+   if ( merkle_count )
+   {
+      if ( merkle_count > sctx->job.merkle_buf_size )
+      {
+         for ( i = 0; i < sctx->job.merkle_count; i++ )
+            free( sctx->job.merkle[i] );
+         free( sctx->job.merkle );
+
+         merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
+         for ( i = 0; i < merkle_count; i++ )
+            merkle[i] = (uchar*) malloc( 32 );
+         sctx->job.merkle_buf_size = merkle_count;
+         sctx->job.merkle = merkle;
+      }
+
+      for ( i = 0; i < merkle_count; i++ )
+      {
+         const char *s = json_string_value( json_array_get( merkle_arr, i ) );
+         if ( !s || strlen(s) != 64 )
+         {
+            for ( int j = sctx->job.merkle_buf_size; j > 0; j-- )
+               free( sctx->job.merkle[i] );
+            free( sctx->job.merkle );
+            sctx->job.merkle_count =
+            sctx->job.merkle_buf_size = 0;
+            pthread_mutex_unlock( &sctx->work_lock );
+            applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
+            goto out;
+         }
+         hex2bin( sctx->job.merkle[i], s, 32 );
+      }   
+   }
+   sctx->job.merkle_count = merkle_count;         

 	coinb1_size = strlen( coinb1 ) / 2;
 	coinb2_size = strlen( coinb2 ) / 2;
@@ -2041,18 +2085,9 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
   }

 	sctx->block_height = getblocheight( sctx );
-
-	for ( i = 0; i < sctx->job.merkle_count; i++ )
-		free( sctx->job.merkle[i] );
-
-	free( sctx->job.merkle );
-	sctx->job.merkle = merkle;
-	sctx->job.merkle_count = merkle_count;
-
 	hex2bin( sctx->job.nbits, nbits, 4 );
 	hex2bin( sctx->job.ntime, stime, 4 );
 	sctx->job.clean = clean;
-
 	sctx->job.diff = sctx->next_diff;

 	pthread_mutex_unlock( &sctx->work_lock );
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -17,8 +17,9 @@ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
 # used by GCC
 export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
 # Support for Windows 7 CPU groups, AES sometimes not included in -march
-# Disabled due to CPU group incompatibilities between Intel and AMD CPU. 
-export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
+# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
+#export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
+export DEFAULT_CFLAGS="-maes -O3 -Wall"
 export DEFAULT_CFLAGS_OLD="-O3 -Wall"

 # make link to local gmp header file.
@@ -46,7 +47,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
 ./clean-all.sh || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-march=icelake-client $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
@@ -54,7 +55,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
 # AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake 
 make clean || echo clean
 rm -f config.status
-CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512.exe
@@ -62,7 +63,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
 # AVX2 SHA VAES: Intel Alderlake, AMD Zen3
 make clean || echo done
 rm -f config.status
-CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
+CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
@@ -70,7 +71,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
 # AVX2 AES SHA: AMD Zen1
 make clean || echo clean
 rm -f config.status
-CFLAGS="-march=znver1 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2-sha.exe
@@ -78,7 +79,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe
 # AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake
 make clean || echo clean
 rm -f config.status
-CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2.exe
@@ -128,7 +129,7 @@ make clean || echo clean
 # Native with CPU groups ennabled
 make clean || echo clean
 rm -f config.status
-CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
Author	SHA1	Message	Date
Jay D Dee	de564ccbde	v3.22.2	2023-04-06 13:38:37 -04:00
Jay D Dee	fcd7727b0d	v3.22.1	2023-03-24 18:29:42 -04:00
Jay D Dee	3dd6787531	v3.22.0	2023-03-21 17:12:51 -04:00
Jay D Dee	cae1ce2ab7	v3.21.5	2023-03-15 12:27:04 -04:00
Jay D Dee	7a91c41d74	v3.21.4	2023-03-13 14:54:38 -04:00
Jay D Dee	c6bc9d67fb	v3.21.3 Unreleased	2023-03-13 03:20:13 -04:00