v3.22.0

v3.21.5
v3.21.4
2025-09-17 23:44:27 +00:00 · 2023-03-21 17:12:51 -04:00 · 2023-03-15 12:27:04 -04:00 · 2023-03-13 14:54:38 -04:00 · 2023-03-13 03:20:13 -04:00 · 2023-03-11 14:54:49 -05:00
51 changed files with 3909 additions and 4439 deletions
--- a/2
+++ b/2
@@ -37,7 +37,7 @@ SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
 openssl 1.1.0e or higher.

 znver1 and znver2 should be recognized on most recent version of GCC and
-znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
+znver3 is available with GCC 11. GCC 11 also includes rocketlake support.
 In the meantime here are some suggestions to compile with new CPUs:

 "-march=native" is usually the best choice, used by build.sh.
--- a/Makefile.am
+++ b/Makefile.am
@@ -55,9 +55,6 @@ cpuminer_SOURCES = \
  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
  algo/blake/blakecoin-4way.c \
-  algo/blake/decred-gate.c \
-  algo/blake/decred.c \
-  algo/blake/decred-4way.c \
  algo/blake/pentablake-gate.c \
  algo/blake/pentablake-4way.c \
  algo/blake/pentablake.c \
--- a/60
+++ b/60
@@ -65,10 +65,68 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v3.22.0
+
+Stratum: faster netdiff calculation.
+Merged a few updates from Pooler/cpuminer:
+   Use CURLOPT_POSTFIELDS in json_rpc_call,
+   Use CURLINFO_ACTIVESOCKET when supported,
+   JSONRPC speedup,
+   Speed up hex2bin function.  
+Small log improvements, notably more frequent hash rate reports.
+Removed decred algo.
+
+v3.21.5
+
+All issues with v3.21.3 & v3.21.4 should be resolved.
+Changes since v3.21.2:
+#392 #379 #389 Fixed misaligned address segfault solo mining.
+#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
+#392 Fixed conditional mining.
+#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
+     Windows binaries no longer support CPU groups,
+     Windows binaries support CPUs with up to 64 threads.
+Small optimizations to serialized vectoring.
+
+v3.21.4 CANCELLED
+
+Reapply selected changes from v3.21.3.
+#392 #379 #389 Fixed misaligned address segfault solo mining.
+#392 Fixed conditional mining.
+#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
+     Windows binaries no longer support CPU groups,
+     Windows binaries support CPUs with up to 64 threads.
+
+v3.21.3.1 UNRELEASED
+
+Revert to 3.21.2
+
+v3.21.3 CANCELLED
+
+#392 #379 #389 Fixed misaligned address segfault solo mining.
+#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
+#392 Fixed conditional mining.
+#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
+     Windows binaries no longer support CPU groups,
+     Windows binaries support CPUs with up to 64 threads.
+Midstate prehash is now centralized, done only once instead of by every thread
+for selected algos.
+Small optimizations to serialized vectoring.
+
+v3.21.2 
+
+Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
+Fixed a couple of compiler warnings with gcc-12.
+
+v3.21.1
+
+Fixed a segfault in some obsolete algos.
+Small optimizations to Hamsi & Shabal AVX2 & AVX512.
+
 v3.21.0

 Added minotaurx algo for stratum only.
-Blake256 & sha256 prehash optimised to ignore zero-padded data for AVX2 & AVX512.
+Blake256 & sha256 prehash optimized to ignore zero-padded data for AVX2 & AVX512.
 Other small improvements.

 v3.20.3
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.16.1 -*- Autoconf -*-
+# generated automatically by aclocal 1.16.5 -*- Autoconf -*-

-# Copyright (C) 1996-2018 Free Software Foundation, Inc.
+# Copyright (C) 1996-2021 Free Software Foundation, Inc.

 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -14,13 +14,13 @@
 m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
 m4_ifndef([AC_AUTOCONF_VERSION],
  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
-m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
-[m4_warning([this file was generated for autoconf 2.69.
+m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.71],,
+[m4_warning([this file was generated for autoconf 2.71.
 You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
 To do so, use the procedure documented by the package, typically 'autoreconf'.])])

-# Copyright (C) 2002-2018 Free Software Foundation, Inc.
+# Copyright (C) 2002-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
 [am__api_version='1.16'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.16.1], [],
+m4_if([$1], [1.16.5], [],
      [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])

@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.16.1])dnl
+[AM_AUTOMAKE_VERSION([1.16.5])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])

 # Figure out how to run the assembler.                      -*- Autoconf -*-

-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl

 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-

-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`

 # AM_CONDITIONAL                                            -*- Autoconf -*-

-# Copyright (C) 1997-2018 Free Software Foundation, Inc.
+# Copyright (C) 1997-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
 Usually this means the macro was only invoked conditionally.]])
 fi])])

-# Copyright (C) 1999-2018 Free Software Foundation, Inc.
+# Copyright (C) 1999-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -352,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl

 # Generate code to set up dependency tracking.              -*- Autoconf -*-

-# Copyright (C) 1999-2018 Free Software Foundation, Inc.
+# Copyright (C) 1999-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -391,7 +391,9 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
  done
  if test $am_rc -ne 0; then
    AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
-    for automatic dependency tracking.  Try re-running configure with the
+    for automatic dependency tracking.  If GNU make was not used, consider
+    re-running the configure script with MAKE="gmake" (or whatever is
+    necessary).  You can also try re-running configure with the
    '--disable-dependency-tracking' option to at least be able to build
    the package (albeit without support for automatic dependency tracking).])
  fi
@@ -418,7 +420,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],

 # Do all the work for Automake.                             -*- Autoconf -*-

-# Copyright (C) 1996-2018 Free Software Foundation, Inc.
+# Copyright (C) 1996-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -446,6 +448,10 @@ m4_defn([AC_PROG_CC])
 # release and drop the old call support.
 AC_DEFUN([AM_INIT_AUTOMAKE],
 [AC_PREREQ([2.65])dnl
+m4_ifdef([_$0_ALREADY_INIT],
+  [m4_fatal([$0 expanded multiple times
+]m4_defn([_$0_ALREADY_INIT]))],
+  [m4_define([_$0_ALREADY_INIT], m4_expansion_stack)])dnl
 dnl Autoconf wants to disallow AM_ names.  We explicitly allow
 dnl the ones we care about.
 m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
@@ -482,7 +488,7 @@ m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
 [_AM_SET_OPTIONS([$1])dnl
 dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
 m4_if(
-  m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]),
+  m4_ifset([AC_PACKAGE_NAME], [ok]):m4_ifset([AC_PACKAGE_VERSION], [ok]),
  [ok:ok],,
  [m4_fatal([AC_INIT should be called with package and version arguments])])dnl
 AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
@@ -534,6 +540,20 @@ AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
 		  [m4_define([AC_PROG_OBJCXX],
 			     m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
 ])
+# Variables for tags utilities; see am/tags.am
+if test -z "$CTAGS"; then
+  CTAGS=ctags
+fi
+AC_SUBST([CTAGS])
+if test -z "$ETAGS"; then
+  ETAGS=etags
+fi
+AC_SUBST([ETAGS])
+if test -z "$CSCOPE"; then
+  CSCOPE=cscope
+fi
+AC_SUBST([CSCOPE])
+
 AC_REQUIRE([AM_SILENT_RULES])dnl
 dnl The testsuite driver may need to know about EXEEXT, so add the
 dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen.  This
@@ -615,7 +635,7 @@ for _am_header in $config_headers :; do
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])

-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -636,7 +656,7 @@ if test x"${install_sh+set}" != xset; then
 fi
 AC_SUBST([install_sh])])

-# Copyright (C) 2003-2018 Free Software Foundation, Inc.
+# Copyright (C) 2003-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -658,7 +678,7 @@ AC_SUBST([am__leading_dot])])
 # Add --enable-maintainer-mode option to configure.         -*- Autoconf -*-
 # From Jim Meyering

-# Copyright (C) 1996-2018 Free Software Foundation, Inc.
+# Copyright (C) 1996-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -693,7 +713,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])

 # Check to see how 'make' treats includes.	            -*- Autoconf -*-

-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -736,7 +756,7 @@ AC_SUBST([am__quote])])

 # Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-

-# Copyright (C) 1997-2018 Free Software Foundation, Inc.
+# Copyright (C) 1997-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -757,12 +777,7 @@ AC_DEFUN([AM_MISSING_HAS_RUN],
 [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
 AC_REQUIRE_AUX_FILE([missing])dnl
 if test x"${MISSING+set}" != xset; then
-  case $am_aux_dir in
-  *\ * | *\	*)
-    MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
-  *)
-    MISSING="\${SHELL} $am_aux_dir/missing" ;;
-  esac
+  MISSING="\${SHELL} '$am_aux_dir/missing'"
 fi
 # Use eval to expand $SHELL
 if eval "$MISSING --is-lightweight"; then
@@ -775,7 +790,7 @@ fi

 # Helper functions for option handling.                     -*- Autoconf -*-

-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -804,7 +819,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
 AC_DEFUN([_AM_IF_OPTION],
 [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])

-# Copyright (C) 1999-2018 Free Software Foundation, Inc.
+# Copyright (C) 1999-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -851,7 +866,7 @@ AC_LANG_POP([C])])
 # For backward compatibility.
 AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])

-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -870,7 +885,7 @@ AC_DEFUN([AM_RUN_LOG],

 # Check to make sure that the build environment is sane.    -*- Autoconf -*-

-# Copyright (C) 1996-2018 Free Software Foundation, Inc.
+# Copyright (C) 1996-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -951,7 +966,7 @@ AC_CONFIG_COMMANDS_PRE(
 rm -f conftest.file
 ])

-# Copyright (C) 2009-2018 Free Software Foundation, Inc.
+# Copyright (C) 2009-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1011,7 +1026,7 @@ AC_SUBST([AM_BACKSLASH])dnl
 _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
 ])

-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1039,7 +1054,7 @@ fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])

-# Copyright (C) 2006-2018 Free Software Foundation, Inc.
+# Copyright (C) 2006-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1058,7 +1073,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])

 # Check how to create a tarball.                            -*- Autoconf -*-

-# Copyright (C) 2004-2018 Free Software Foundation, Inc.
+# Copyright (C) 2004-2021 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -263,8 +263,6 @@ void init_algo_gate( algo_gate_t* gate )
   gate->build_block_header      = (void*)&std_build_block_header;
   gate->build_extraheader       = (void*)&std_build_extraheader;
   gate->set_work_data_endian    = (void*)&do_nothing;
-   gate->calc_network_diff       = (void*)&std_calc_network_diff;
-   gate->ready_to_mine           = (void*)&std_ready_to_mine;
   gate->resync_threads          = (void*)&do_nothing;
   gate->do_this_thread          = (void*)&return_true;
   gate->longpoll_rpc_call       = (void*)&std_longpoll_rpc_call;
@@ -308,7 +306,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_BLAKECOIN:    rc = register_blakecoin_algo     ( gate ); break;
    case ALGO_BMW512:       rc = register_bmw512_algo        ( gate ); break;
    case ALGO_C11:          rc = register_c11_algo           ( gate ); break;
-    case ALGO_DECRED:       rc = register_decred_algo        ( gate ); break;
    case ALGO_DEEP:         rc = register_deep_algo          ( gate ); break;
    case ALGO_DMD_GR:       rc = register_dmd_gr_algo        ( gate ); break;
    case ALGO_GROESTL:      rc = register_groestl_algo       ( gate ); break;
@@ -427,7 +424,6 @@ const char* const algo_alias_map[][2] =
  { "blake256r8",        "blakecoin"      },
  { "blake256r8vnl",     "vanilla"        },
  { "blake256r14",       "blake"          },
-  { "blake256r14dcr",    "decred"         },
  { "diamond",           "dmd-gr"         },
  { "espers",            "hmq1725"        },
  { "flax",              "c11"            },
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -144,7 +144,7 @@ void ( *gen_merkle_root )       ( char*, struct stratum_ctx* );
 void ( *build_extraheader )     ( struct work*, struct stratum_ctx* );

 void ( *build_block_header )    ( struct work*, uint32_t, uint32_t*,
-	                                uint32_t*, uint32_t, uint32_t,
+	                                uint32_t*,   uint32_t, uint32_t,
                                   unsigned char* );

 // Build mining.submit message
@@ -155,19 +155,13 @@ char* ( *malloc_txs_request )   ( struct work* );
 // Big endian or little endian
 void ( *set_work_data_endian )  ( struct work* );

-double ( *calc_network_diff )   ( struct work* );
-
-// Wait for first work
-bool ( *ready_to_mine )         ( struct work*, struct stratum_ctx*, int );
-
 // Diverge mining threads
 bool ( *do_this_thread )        ( int );

 // After do_this_thread
 void ( *resync_threads )        ( int, struct work* );

-// No longer needed
-json_t* (*longpoll_rpc_call)      ( CURL*, int*, char* );
+json_t* ( *longpoll_rpc_call )  ( CURL*, int*, char* );

 set_t optimizations;
 int  ( *get_work_data_size )     ();
@@ -286,8 +280,6 @@ char* std_malloc_txs_request( struct work *work );
 // Default is do_nothing, little endian is assumed
 void set_work_data_big_endian( struct work *work );

-double std_calc_network_diff( struct work *work );
-
 void std_build_block_header( struct work* g_work, uint32_t version,
 	                          uint32_t *prevhash,  uint32_t *merkle_root,
   	                       uint32_t ntime,      uint32_t nbits,
@@ -297,9 +289,6 @@ void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );

 json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );

-bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
-                        int thr_id );
-
 int std_get_work_data_size();

 // Gate admin functions
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -1,74 +0,0 @@
-#include "decred-gate.h"
-#include "blake-hash-4way.h"
-#include <string.h>
-#include <stdint.h>
-#include <memory.h>
-#include <unistd.h>
-
-#if defined (DECRED_4WAY)
-
-static __thread blake256_4way_context blake_mid;
-
-void decred_hash_4way( void *state, const void *input )
-{
-     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-//     uint32_t hash0[8] __attribute__ ((aligned (32)));
-//     uint32_t hash1[8] __attribute__ ((aligned (32)));
-//     uint32_t hash2[8] __attribute__ ((aligned (32)));
-//     uint32_t hash3[8] __attribute__ ((aligned (32)));
-     const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
-     int tail_len = 180 - DECRED_MIDSTATE_LEN; 
-     blake256_4way_context ctx __attribute__ ((aligned (64)));
-
-     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
-     blake256_4way_update( &ctx, tail, tail_len );
-     blake256_4way_close( &ctx, vhash );
-     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
-}
-
-int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t vdata[48*4] __attribute__ ((aligned (64)));
-   uint32_t hash[8*4] __attribute__ ((aligned (32)));
-   uint32_t _ALIGN(64) edata[48];
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
-   uint32_t n = first_nonce;
-   const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-   // copy to buffer guaranteed to be aligned.
-   memcpy( edata, pdata, 180 );
-
-   // use the old way until  new way updated for size.
-   mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
-
-   blake256_4way_init( &blake_mid );
-   blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
-
-   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
-   do {
-      * noncep    = n;
-      *(noncep+1) = n+1;
-      *(noncep+2) = n+2;
-      *(noncep+3) = n+3;
-
-      decred_hash_4way( hash, vdata );
-
-      for ( int i = 0; i < 4; i++ )
-      if (  (hash+(i<<3))[7] <= HTarget )
-      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-      {
-          pdata[DECRED_NONCE_INDEX] = n+i;
-          submit_solution( work, hash+(i<<3), mythr );
-      }
-      n += 4;
-  } while ( (n < max_nonce) && !work_restart[thr_id].restart );
-
-  *hashes_done = n - first_nonce + 1;
-  return 0;
-}
-
-#endif
--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -1,171 +0,0 @@
-#include "decred-gate.h"
-#include <unistd.h>
-#include <memory.h>
-#include <string.h>
-
-uint32_t *decred_get_nonceptr( uint32_t *work_data )
-{
-   return &work_data[ DECRED_NONCE_INDEX ];
-}
-
-long double decred_calc_network_diff( struct work* work )
-{
-   // sample for diff 43.281 : 1c05ea29
-   // todo: endian reversed on longpoll could be zr5 specific...
-   uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
-   uint32_t bits = ( nbits & 0xffffff );
-   int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
-   int m;
-   long double d = (long double)0x0000ffff / (long double)bits;
-
-   for ( m = shift; m < 29; m++ )
-       d *= 256.0;
-   for ( m = 29; m < shift; m++ )
-       d /= 256.0;
-   if ( shift == 28 )
-       d *= 256.0; // testnet
-   if ( opt_debug_diff )
-       applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", (double)d,
-                           shift, bits );
-   return net_diff;
-}
-
-void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
-{
-   // some random extradata to make the work unique
-   work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
-   work->height = work->data[32];
-   if (!have_longpoll && work->height > *net_blocks + 1)
-   {
-      char netinfo[64] = { 0 };
-      if ( net_diff > 0. )
-      {
-         if (net_diff != work->targetdiff)
-            sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
-                   work->targetdiff);
-         else
-             sprintf(netinfo, ", diff %.3f", net_diff);
-       }
-       applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
-                       netinfo);
-       *net_blocks = work->height - 1;
-   }
-}
-
-void decred_be_build_stratum_request( char *req, struct work *work,
-                                      struct stratum_ctx *sctx )
-{
-   unsigned char *xnonce2str;
-   uint32_t ntime, nonce;
-   char ntimestr[9], noncestr[9];
-
-   be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
-   be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
-   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
-   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
-   xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
-                                     sctx->xnonce1_size );
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
-   free(xnonce2str);
-}
-
-#if !defined(min)
-#define min(a,b) (a>b ? (b) :(a))
-#endif
-
-void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
-{
-   uchar merkle_root[64] = { 0 };
-   uint32_t extraheader[32] = { 0 };
-   int headersize = 0;
-   uint32_t* extradata = (uint32_t*) sctx->xnonce1;
-   int i;
-
-   // getwork over stratum, getwork merkle + header passed in coinb1
-   memcpy(merkle_root, sctx->job.coinbase, 32);
-   headersize = min((int)sctx->job.coinbase_size - 32,
-                  sizeof(extraheader) );
-   memcpy( extraheader, &sctx->job.coinbase[32], headersize );
-
-   // Assemble block header 
-   memset( g_work->data, 0, sizeof(g_work->data) );
-   g_work->data[0] = le32dec( sctx->job.version );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[1 + i] = swab32(
-                              le32dec( (uint32_t *) sctx->job.prevhash + i ) );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
-
-//   for ( i = 0; i < 8; i++ ) // prevhash
-//      g_work->data[1 + i] = swab32( g_work->data[1 + i] );
-//   for ( i = 0; i < 8; i++ ) // merkle
-//      g_work->data[9 + i] = swab32( g_work->data[9 + i] );
-
-   for ( i = 0; i < headersize/4; i++ ) // header
-      g_work->data[17 + i] = extraheader[i];
-   // extradata
-
-   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
-      g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
-   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
-      g_work->data[i] = 0;
-   g_work->data[37] = (rand()*4) << 8;
-   // block header suffix from coinb2 (stake version)
-   memcpy( &g_work->data[44],
-           &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
-   sctx->block_height = g_work->data[32];
-   //applog_hex(work->data, 180);
-   //applog_hex(&work->data[36], 36);
-}
-
-#undef min
-
-bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
-                           int thr_id )
-{
-   if ( have_stratum && strcmp(stratum->job.job_id, work->job_id)  )
-      // need to regen g_work..
-      return false;
-   if ( have_stratum && !work->data[0] && !opt_benchmark )
-   {
-      sleep(1);
-      return false;
-   }
-   // extradata: prevent duplicates
-   work->data[ DECRED_XNONCE_INDEX     ] += 1;
-   work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
-   return true;
-}
-
-int decred_get_work_data_size() { return DECRED_DATA_SIZE; }
-
-bool register_decred_algo( algo_gate_t* gate )
-{
-#if defined(DECRED_4WAY)
-  four_way_not_tested();
-  gate->scanhash  = (void*)&scanhash_decred_4way;
-  gate->hash      = (void*)&decred_hash_4way;
-#else
-  gate->scanhash  = (void*)&scanhash_decred;
-  gate->hash      = (void*)&decred_hash;
-#endif
-  gate->optimizations = AVX2_OPT;
-//  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
-  gate->decode_extra_data     = (void*)&decred_decode_extradata;
-  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
-  gate->work_decode           = (void*)&std_be_work_decode;
-  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
-  gate->build_extraheader     = (void*)&decred_build_extraheader;
-  gate->ready_to_mine         = (void*)&decred_ready_to_mine;
-  gate->nbits_index           = DECRED_NBITS_INDEX;
-  gate->ntime_index           = DECRED_NTIME_INDEX;
-  gate->nonce_index           = DECRED_NONCE_INDEX;
-  gate->get_work_data_size    = (void*)&decred_get_work_data_size;
-  gate->work_cmp_size         = DECRED_WORK_COMPARE_SIZE;
-  allow_mininginfo            = false;
-  have_gbt                    = false;
-  return true;
-}
-
--- a/algo/blake/decred-gate.h
+++ b/algo/blake/decred-gate.h
@@ -1,36 +0,0 @@
-#ifndef __DECRED_GATE_H__
-#define __DECRED_GATE_H__
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-#define DECRED_NBITS_INDEX 29
-#define DECRED_NTIME_INDEX 34
-#define DECRED_NONCE_INDEX 35
-#define DECRED_XNONCE_INDEX 36
-#define DECRED_DATA_SIZE 192
-#define DECRED_WORK_COMPARE_SIZE 140
-#define DECRED_MIDSTATE_LEN 128
-
-#if defined (__AVX2__) 
-//void blakehash_84way(void *state, const void *input);
-//int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
-//                         uint64_t *hashes_done );
-#endif
-
-#if defined(__SSE4_2__)
-  #define DECRED_4WAY
-#endif
-
-#if defined (DECRED_4WAY)
-void decred_hash_4way(void *state, const void *input);
-int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done, struct thr_info *mythr );
-#endif
-
-void decred_hash( void *state, const void *input );
-int scanhash_decred( struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
-
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -1,282 +0,0 @@
-#include "decred-gate.h"
-
-#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
-
-#include "sph_blake.h"
-
-#include <string.h>
-#include <stdint.h>
-#include <memory.h>
-#include <unistd.h>
-
-/*
-#ifndef min
-#define min(a,b) (a>b ? b : a)
-#endif
-#ifndef max 
-#define max(a,b) (a<b ? b : a)
-#endif
-*/
-/*
-#define DECRED_NBITS_INDEX 29
-#define DECRED_NTIME_INDEX 34
-#define DECRED_NONCE_INDEX 35
-#define DECRED_XNONCE_INDEX 36
-#define DECRED_DATA_SIZE 192
-#define DECRED_WORK_COMPARE_SIZE 140
-*/
-static __thread sph_blake256_context blake_mid;
-static __thread bool ctx_midstate_done = false;
-
-void decred_hash(void *state, const void *input)
-{
-//        #define MIDSTATE_LEN 128
-        sph_blake256_context ctx __attribute__ ((aligned (64)));
-
-        uint8_t *ending = (uint8_t*) input;
-        ending += DECRED_MIDSTATE_LEN;
-
-        if (!ctx_midstate_done) {
-                sph_blake256_init(&blake_mid);
-                sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
-                ctx_midstate_done = true;
-        }
-        memcpy(&ctx, &blake_mid, sizeof(blake_mid));
-
-        sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
-        sph_blake256_close(&ctx, state);
-}
-
-void decred_hash_simple(void *state, const void *input)
-{
-        sph_blake256_context ctx;
-        sph_blake256_init(&ctx);
-        sph_blake256(&ctx, input, 180);
-        sph_blake256_close(&ctx, state);
-}
-
-int scanhash_decred( struct work *work, uint32_t max_nonce,
-               uint64_t *hashes_done, struct thr_info *mythr )
-{
-        uint32_t _ALIGN(64) endiandata[48];
-        uint32_t _ALIGN(64) hash32[8];
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-//        #define DCR_NONCE_OFT32 35
-
-        const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
-        const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
-
-        uint32_t n = first_nonce;
-
-        ctx_midstate_done = false;
-
-#if 1
-        memcpy(endiandata, pdata, 180);
-#else
-        for (int k=0; k < (180/4); k++)
-                be32enc(&endiandata[k], pdata[k]);
-#endif
-
-        do {
-                //be32enc(&endiandata[DCR_NONCE_OFT32], n);
-                endiandata[DECRED_NONCE_INDEX] = n;
-                decred_hash(hash32, endiandata);
-
-                if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
-                {
-                   pdata[DECRED_NONCE_INDEX] = n;
-                   submit_solution( work, hash32, mythr );
-                }
-
-                n++;
-
-        } while (n < max_nonce && !work_restart[thr_id].restart);
-
-        *hashes_done = n - first_nonce + 1;
-        pdata[DECRED_NONCE_INDEX] = n;
-        return 0;
-}
-
-/*
-uint32_t *decred_get_nonceptr( uint32_t *work_data )
-{
-   return &work_data[ DECRED_NONCE_INDEX ];
-}
-
-double decred_calc_network_diff( struct work* work )
-{
-   // sample for diff 43.281 : 1c05ea29
-   // todo: endian reversed on longpoll could be zr5 specific...
-   uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
-   uint32_t bits = ( nbits & 0xffffff );
-   int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
-   int m;
-   double d = (double)0x0000ffff / (double)bits;
-
-   for ( m = shift; m < 29; m++ )
-       d *= 256.0;
-   for ( m = 29; m < shift; m++ )
-       d /= 256.0;
-   if ( shift == 28 )
-       d *= 256.0; // testnet
-   if ( opt_debug_diff )
-       applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
-                           shift, bits );
-   return net_diff;
-}
-
-void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
-{
-   // some random extradata to make the work unique
-   work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
-   work->height = work->data[32];
-   if (!have_longpoll && work->height > *net_blocks + 1)
-   {
-      char netinfo[64] = { 0 };
-      if (net_diff > 0.)
-      {
-         if (net_diff != work->targetdiff)
-	    sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
-                   work->targetdiff);
-	 else
-	     sprintf(netinfo, ", diff %.3f", net_diff);
-       }
-       applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
-                       netinfo);
-       *net_blocks = work->height - 1;
-   }
-}
-
-void decred_be_build_stratum_request( char *req, struct work *work,
-                                      struct stratum_ctx *sctx )
-{
-   unsigned char *xnonce2str;
-   uint32_t ntime, nonce;
-   char ntimestr[9], noncestr[9];
-
-   be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
-   be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
-   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
-   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
-   xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
-                                     sctx->xnonce1_size );
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
-   free(xnonce2str);
-}
-*/
-/*
-// data shared between gen_merkle_root and build_extraheader.
-__thread uint32_t decred_extraheader[32] = { 0 };
-__thread int decred_headersize = 0;
-
-void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
-{
-   // getwork over stratum, getwork merkle + header passed in coinb1
-   memcpy(merkle_root, sctx->job.coinbase, 32);
-   decred_headersize = min((int)sctx->job.coinbase_size - 32, 
-                  sizeof(decred_extraheader) );
-   memcpy( decred_extraheader, &sctx->job.coinbase[32], decred_headersize);
-}
-*/
-
-/*
-#define min(a,b) (a>b ? (b) :(a))
-
-void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
-{
-   uchar merkle_root[64] = { 0 };
-   uint32_t extraheader[32] = { 0 };
-   int headersize = 0;
-   uint32_t* extradata = (uint32_t*) sctx->xnonce1;
-   size_t t;
-   int i;
-
-   // getwork over stratum, getwork merkle + header passed in coinb1
-   memcpy(merkle_root, sctx->job.coinbase, 32);
-   headersize = min((int)sctx->job.coinbase_size - 32,
-                  sizeof(extraheader) );
-   memcpy( extraheader, &sctx->job.coinbase[32], headersize );
-
-   // Increment extranonce2 
-   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
-
-   // Assemble block header 
-   memset( g_work->data, 0, sizeof(g_work->data) );
-   g_work->data[0] = le32dec( sctx->job.version );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[1 + i] = swab32(
-                              le32dec( (uint32_t *) sctx->job.prevhash + i ) );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
-
-//   for ( i = 0; i < 8; i++ ) // prevhash
-//      g_work->data[1 + i] = swab32( g_work->data[1 + i] );
-//   for ( i = 0; i < 8; i++ ) // merkle
-//      g_work->data[9 + i] = swab32( g_work->data[9 + i] );
-
-   for ( i = 0; i < headersize/4; i++ ) // header
-      g_work->data[17 + i] = extraheader[i];
-   // extradata
-
-   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
-      g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
-   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
-      g_work->data[i] = 0;
-   g_work->data[37] = (rand()*4) << 8;
-   // block header suffix from coinb2 (stake version)
-   memcpy( &g_work->data[44],
-           &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
-   sctx->bloc_height = g_work->data[32];
-   //applog_hex(work->data, 180);
-   //applog_hex(&work->data[36], 36);
-}
-
-#undef min
-
-bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
-                           int thr_id )
-{
-   if ( have_stratum && strcmp(stratum->job.job_id, work->job_id)  )
-      // need to regen g_work..
-      return false;
-   if ( have_stratum && !work->data[0] && !opt_benchmark )
-   {
-      sleep(1);
-      return false;
-   }      
-   // extradata: prevent duplicates
-   work->data[ DECRED_XNONCE_INDEX     ] += 1;
-   work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
-   return true;
-}
-
-
-bool register_decred_algo( algo_gate_t* gate )
-{
-  gate->optimizations         = SSE2_OPT;
-  gate->scanhash              = (void*)&scanhash_decred;
-  gate->hash                  = (void*)&decred_hash;
-  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
-  gate->decode_extra_data     = (void*)&decred_decode_extradata;
-  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
-  gate->work_decode           = (void*)&std_be_work_decode;
-  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
-  gate->build_extraheader     = (void*)&decred_build_extraheader;
-  gate->ready_to_mine         = (void*)&decred_ready_to_mine;
-  gate->nbits_index           = DECRED_NBITS_INDEX;
-  gate->ntime_index           = DECRED_NTIME_INDEX;
-  gate->nonce_index           = DECRED_NONCE_INDEX;
-  gate->work_data_size        = DECRED_DATA_SIZE;
-  gate->work_cmp_size         = DECRED_WORK_COMPARE_SIZE; 
-  allow_mininginfo            = false;
-  have_gbt                    = false;
-  return true;
-}
-*/
-
-#endif
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -1,6 +1,6 @@
 #include "pentablake-gate.h"

-#if defined (__AVX2__)
+#if defined(PENTABLAKE_4WAY)

 #include <stdlib.h>
 #include <stdint.h>
--- a/algo/blake/pentablake-gate.h
+++ b/algo/blake/pentablake-gate.h
@@ -4,9 +4,10 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
-  #define PENTABLAKE_4WAY
-#endif
+// 4way is broken
+//#if defined(__AVX2__)
+//  #define PENTABLAKE_4WAY
+//#endif

 #if defined(PENTABLAKE_4WAY)
 void pentablakehash_4way( void *state, const void *input );
--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -103,16 +103,16 @@
   const uint8_t *sigmaR = sigma[R]; \
   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
-   V2 = mm128_alignr_64( V[3], V[2] ); \
-   V3 = mm128_alignr_64( V[2], V[3] ); \
-   V6 = mm128_alignr_64( V[6], V[7] ); \
-   V7 = mm128_alignr_64( V[7], V[6] ); \
+   V2 = mm128_alignr_64( V[3], V[2], 1 ); \
+   V3 = mm128_alignr_64( V[2], V[3], 1 ); \
+   V6 = mm128_alignr_64( V[6], V[7], 1 ); \
+   V7 = mm128_alignr_64( V[7], V[6], 1 ); \
   BLAKE2B_G( V[0], V2, V[5], V6,  8,  9, 10, 11 ); \
   BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
-   V[2] = mm128_alignr_64( V2, V3 ); \
-   V[3] = mm128_alignr_64( V3, V2 ); \
-   V[6] = mm128_alignr_64( V7, V6 ); \
-   V[7] = mm128_alignr_64( V6, V7 ); \
+   V[2] = mm128_alignr_64( V2, V3, 1 ); \
+   V[3] = mm128_alignr_64( V3, V2, 1 ); \
+   V[6] = mm128_alignr_64( V7, V6, 1 ); \
+   V[7] = mm128_alignr_64( V6, V7, 1 ); \
 }

 #else
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -24,9 +24,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )

  ctx->hashlen = hashlen;

-  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return FAIL_GR;
-
  for ( i = 0; i < SIZE512; i++ )
  {
     ctx->chaining[i] = _mm_setzero_si128();
@@ -46,9 +43,6 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
 {
  int i;

-  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return FAIL_GR;
-
  for ( i = 0; i < SIZE512; i++ )
  {
     ctx->chaining[i] = _mm_setzero_si128();
--- a/algo/groestl/aes_ni/hash-groestl256.c
+++ b/algo/groestl/aes_ni/hash-groestl256.c
@@ -22,9 +22,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )

  ctx->hashlen = hashlen;

-  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return FAIL_GR;
-
  for ( i = 0; i < SIZE256; i++ )
  {
     ctx->chaining[i] = _mm_setzero_si128();
@@ -43,9 +40,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
 {
  int i;

-  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return FAIL_GR;
-
  for ( i = 0; i < SIZE256; i++ )
  {
     ctx->chaining[i] = _mm_setzero_si128();
@@ -54,8 +48,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)

  ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );

-//  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-//  INIT256(ctx->chaining);
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

--- a/algo/groestl/groestl256-hash-4way.c
+++ b/algo/groestl/groestl256-hash-4way.c
@@ -26,9 +26,6 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )

  ctx->hashlen = hashlen;

-  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return 1;
-
  for ( i = 0; i < SIZE256; i++ )
  {
     ctx->chaining[i] = m512_zero;
@@ -54,8 +51,8 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
   __m512i* in = (__m512i*)input;
   int i;

-  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return 1;
+//  if (ctx->chaining == NULL || ctx->buffer == NULL)
+//    return 1;

  for ( i = 0; i < SIZE256; i++ )
  {
@@ -179,8 +176,8 @@ int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )

  ctx->hashlen = hashlen;

-  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return 1;
+//  if (ctx->chaining == NULL || ctx->buffer == NULL)
+//    return 1;

  for ( i = 0; i < SIZE256; i++ )
  {
@@ -207,9 +204,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
   __m256i* in = (__m256i*)input;
   int i;

-   if (ctx->chaining == NULL || ctx->buffer == NULL)
-     return 1;
-
   for ( i = 0; i < SIZE256; i++ )
   {
     ctx->chaining[i] = m256_zero;
--- a/algo/groestl/groestl512-hash-4way.c
+++ b/algo/groestl/groestl512-hash-4way.c
@@ -21,9 +21,6 @@

 int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
 {
-  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return 1;
-
  memset_zero_512( ctx->chaining, SIZE512 );
  memset_zero_512( ctx->buffer, SIZE512 );

@@ -142,9 +139,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,

 int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
 {
-  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return 1;
-
  memset_zero_256( ctx->chaining, SIZE512 );
  memset_zero_256( ctx->buffer, SIZE512 );

--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -73,11 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
      be32enc(&endiandata[19], nonce);
      myriad_hash(hash, endiandata);

-      if (hash[7] <= Htarg && fulltest(hash, ptarget))
+      if (hash[7] <= Htarg )
+      if ( fulltest(hash, ptarget) && !opt_benchmark )
      {
         pdata[19] = nonce;
-         *hashes_done = pdata[19] - first_nonce;
-         return 1;
+         submit_solution( work, hash, mythr );
      }
      nonce++;

--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -585,9 +585,8 @@ do { \
  t = _mm512_xor_si512( t, c ); \
  d = mm512_xoror( a, b, t ); \
  t = mm512_xorand( t, a, b ); \
-  b = mm512_xor3( b, d, t ); \
  a = c; \
-  c = b; \
+  c = mm512_xor3( b, d, t ); \
  b = d; \
  d = mm512_not( t ); \
 } while (0)
@@ -635,7 +634,7 @@ do { \

 #define ROUND_BIG8( alpha ) \
 do { \
-   __m512i t0, t1, t2, t3; \
+   __m512i t0, t1, t2, t3, t4, t5; \
   s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
   s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
   s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
@@ -662,43 +661,35 @@ do { \
  s5 = mm512_swap64_32( s5 ); \
  sD = mm512_swap64_32( sD ); \
  sE = mm512_swap64_32( sE ); \
-  t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
-  t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
-  L8( s0, t1, s9, t3 ); \
-  s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \
-  s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \
-  sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \
-  sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \
+  t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
+  L8( s0, t0, s9, t1 ); \
 \
  s6 = mm512_swap64_32( s6 ); \
  sF = mm512_swap64_32( sF ); \
-  t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
+  t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
-  L8( s1, t1, sA, t3 ); \
-  s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
-  s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \
-  sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \
-  sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
+  L8( s1, t2, sA, t3 ); \
+  s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
+  sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
 \
  s7 = mm512_swap64_32( s7 ); \
  sC = mm512_swap64_32( sC ); \
-  t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
-  t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
-  L8( s2, t1, sB, t3 ); \
-  s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \
-  s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \
-  sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \
-  sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \
+  t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
+  t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
+  L8( s2, t4, sB, t5 ); \
+  s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
+  sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
  s6 = mm512_swap64_32( s6 ); \
  sF = mm512_swap64_32( sF ); \
 \
-  t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
+  t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
-  L8( s3, t1, s8, t3 ); \
-  s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \
-  s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \
-  sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \
-  sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \
+  L8( s3, t2, s8, t3 ); \
+  s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
+  s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
+  sC = _mm512_mask_blend_epi32( 0x5555, t5, t3 ); \
+  sD = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 ); \
  s7 = mm512_swap64_32( s7 ); \
  sC = mm512_swap64_32( sC ); \
 \
@@ -924,10 +915,9 @@ do { \
  d = _mm256_xor_si256( d, a ); \
  a = _mm256_and_si256( a, b ); \
  t = _mm256_xor_si256( t, a ); \
-  b = _mm256_xor_si256( b, d ); \
-  b = _mm256_xor_si256( b, t ); \
  a = c; \
-  c = b; \
+  c = _mm256_xor_si256( b, d ); \
+  c = _mm256_xor_si256( c, t ); \
  b = d; \
  d = mm256_not( t ); \
 } while (0)
@@ -977,7 +967,7 @@ do { \

 #define ROUND_BIG( alpha ) \
 do { \
-   __m256i t0, t1, t2, t3; \
+   __m256i t0, t1, t2, t3, t4, t5; \
   s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
   s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
   s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
@@ -1004,43 +994,35 @@ do { \
  s5 = mm256_swap64_32( s5 ); \
  sD = mm256_swap64_32( sD ); \
  sE = mm256_swap64_32( sE ); \
-  t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \
-  t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \
-  L( s0, t1, s9, t3 ); \
-  s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \
-  s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \
-  sD = _mm256_blend_epi32( sD, t3, 0x55 ); \
-  sE = _mm256_blend_epi32( sE, t3, 0xaa ); \
+  t0 = _mm256_blend_epi32( s4, s5, 0xaa ); \
+  t1 = _mm256_blend_epi32( sD, sE, 0xaa ); \
+  L( s0, t0, s9, t1 ); \
 \
  s6 = mm256_swap64_32( s6 ); \
  sF = mm256_swap64_32( sF ); \
-  t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \
+  t2 = _mm256_blend_epi32( s5, s6, 0xaa ); \
  t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
-  L( s1, t1, sA, t3 ); \
-  s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
-  s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \
-  sE = _mm256_blend_epi32( sE, t3, 0x55 ); \
-  sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
+  L( s1, t2, sA, t3 ); \
+  s5 = _mm256_blend_epi32( t0, t2, 0x55 ); \
+  sE = _mm256_blend_epi32( t1, t3, 0x55 ); \
 \
  s7 = mm256_swap64_32( s7 ); \
  sC = mm256_swap64_32( sC ); \
-  t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \
-  t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \
-  L( s2, t1, sB, t3 ); \
-  s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \
-  s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \
-  sF = _mm256_blend_epi32( sF, t3, 0x55 ); \
-  sC = _mm256_blend_epi32( sC, t3, 0xaa ); \
+  t4 = _mm256_blend_epi32( s6, s7, 0xaa ); \
+  t5 = _mm256_blend_epi32( sF, sC, 0xaa ); \
+  L( s2, t4, sB, t5 ); \
+  s6 = _mm256_blend_epi32( t2, t4, 0x55 ); \
+  sF = _mm256_blend_epi32( t3, t5, 0x55 ); \
  s6 = mm256_swap64_32( s6 ); \
  sF = mm256_swap64_32( sF ); \
 \
-  t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \
+  t2 = _mm256_blend_epi32( s7, s4, 0xaa ); \
  t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
-  L( s3, t1, s8, t3 ); \
-  s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \
-  s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \
-  sC = _mm256_blend_epi32( sC, t3, 0x55 ); \
-  sD = _mm256_blend_epi32( sD, t3, 0xaa ); \
+  L( s3, t2, s8, t3 ); \
+  s7 = _mm256_blend_epi32( t4, t2, 0x55 ); \
+  s4 = _mm256_blend_epi32( t0, t2, 0xaa ); \
+  sC = _mm256_blend_epi32( t5, t3, 0x55 ); \
+  sD = _mm256_blend_epi32( t1, t3, 0xaa ); \
  s7 = mm256_swap64_32( s7 ); \
  sC = mm256_swap64_32( sC ); \
 \
--- a/algo/haval/haval-hash-4way.c
+++ b/algo/haval/haval-hash-4way.c
@@ -141,6 +141,13 @@ do { \
                       _mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
 } while (0)

+#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
+do { \
+   __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
+   x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
+                                      mm128_ror_32( x7, 11 ) ), w ); \
+} while (0)
+
 /*
 * PASSy(n, in) computes pass number "y", for a total of "n", using the
 * one-argument macro "in" to access input words. Current state is assumed
@@ -152,22 +159,22 @@ do { \
 #define PASS1(n, in)   do { \
 		unsigned pass_count; \
 		for (pass_count = 0; pass_count < 32; pass_count += 8) { \
-			STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
-				in(pass_count + 0), SPH_C32(0x00000000)); \
-			STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
-				in(pass_count + 1), SPH_C32(0x00000000)); \
-			STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
-				in(pass_count + 2), SPH_C32(0x00000000)); \
-			STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
-				in(pass_count + 3), SPH_C32(0x00000000)); \
-			STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
-				in(pass_count + 4), SPH_C32(0x00000000)); \
-			STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
-				in(pass_count + 5), SPH_C32(0x00000000)); \
-			STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
-				in(pass_count + 6), SPH_C32(0x00000000)); \
-			STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
-				in(pass_count + 7), SPH_C32(0x00000000)); \
+			STEP1(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
+				in(pass_count + 0) ); \
+			STEP1(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
+				in(pass_count + 1) ); \
+			STEP1(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
+				in(pass_count + 2) ); \
+			STEP1(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
+				in(pass_count + 3) ); \
+			STEP1(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
+				in(pass_count + 4) ); \
+			STEP1(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
+				in(pass_count + 5) ); \
+			STEP1(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
+				in(pass_count + 6) ); \
+			STEP1(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
+				in(pass_count + 7) ); \
   		} \
 	} while (0)

@@ -605,25 +612,32 @@ do { \
                       _mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
 } while (0)

+#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
+do { \
+   __m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
+   x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
+                                      mm256_ror_32( x7, 11 ) ), w ); \
+} while (0)
+   
 #define PASS1_8W(n, in)   do { \
      unsigned pass_count; \
      for (pass_count = 0; pass_count < 32; pass_count += 8) { \
-         STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
-            in(pass_count + 0), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
-            in(pass_count + 1), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
-            in(pass_count + 2), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
-            in(pass_count + 3), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
-            in(pass_count + 4), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
-            in(pass_count + 5), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
-            in(pass_count + 6), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
-            in(pass_count + 7), SPH_C32(0x00000000)); \
+         STEP1_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
+            in(pass_count + 0) ); \
+         STEP1_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
+            in(pass_count + 1) ); \
+         STEP1_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
+            in(pass_count + 2) ); \
+         STEP1_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
+            in(pass_count + 3) ); \
+         STEP1_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
+            in(pass_count + 4) ); \
+         STEP1_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
+            in(pass_count + 5) ); \
+         STEP1_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
+            in(pass_count + 6) ); \
+         STEP1_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
+            in(pass_count + 7) ); \
         } \
   } while (0)

--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -72,11 +72,11 @@ static const uint64_t RC[] = {
 // Targetted macros, keccak-macros.h is included for each target.

 #define DECL64(x)          __m512i x
-#define XOR(d, a, b)     (d = _mm512_xor_si512(a,b))
-#define XOR64 XOR
+#define XOR(d, a, b)       (d = _mm512_xor_si512(a,b))
+#define XOR64              XOR
 #define AND64(d, a, b)     (d = _mm512_and_si512(a,b))
 #define OR64(d, a, b)      (d = _mm512_or_si512(a,b))
-#define NOT64(d, s)        (d = _mm512_xor_si512(s,m512_neg1))
+#define NOT64(d, s)        (d = mm512_not( s ) )
 #define ROL64(d, v, n)     (d = mm512_rol_64(v, n))
 #define XOROR(d, a, b, c)  (d = mm512_xoror(a, b, c))
 #define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
@@ -257,14 +257,14 @@ keccak512_8way_close(void *cc, void *dst)
        kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
 } while (0)

-#define DECL64(x)        __m256i x
-#define XOR(d, a, b)    (d = _mm256_xor_si256(a,b))
-#define XOR64 XOR
-#define AND64(d, a, b)   (d = _mm256_and_si256(a,b))
-#define OR64(d, a, b)    (d = _mm256_or_si256(a,b))
-#define NOT64(d, s)      (d = _mm256_xor_si256(s,m256_neg1))
-#define ROL64(d, v, n)   (d = mm256_rol_64(v, n))
-#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
+#define DECL64(x)          __m256i x
+#define XOR(d, a, b)       (d = _mm256_xor_si256(a,b))
+#define XOR64              XOR
+#define AND64(d, a, b)     (d = _mm256_and_si256(a,b))
+#define OR64(d, a, b)      (d = _mm256_or_si256(a,b))
+#define NOT64(d, s)        (d = mm256_not( s ) )
+#define ROL64(d, v, n)     (d = mm256_rol_64(v, n))
+#define XOROR(d, a, b, c)  (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
 #define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
 #define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))

--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -19,26 +19,34 @@
 */

 #include <string.h>
-#include <emmintrin.h>
 #include "simd-utils.h"
 #include "luffa_for_sse2.h"

-#if defined(__SSE4_1__)
+#if defined(__AVX512VL__)
+
+#define MULT2( a0, a1 ) \
+{ \
+  __m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
+  a0 = _mm_alignr_epi32( a1, b, 1 ); \
+  a1 = _mm_alignr_epi32( b, a1, 1 ); \
+}
+
+#elif defined(__SSE4_1__)

 #define MULT2( a0, a1 ) do \
 { \
-  __m128i b =  _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
-  a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
-  a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) );  \
+  __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
+  a0 = _mm_alignr_epi8( a1, b, 4 ); \
+  a1 = _mm_alignr_epi8( b, a1, 4 ); \
 } while(0)

 #else

 #define MULT2( a0, a1 ) do \
 { \
-  __m128i b =  _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
+  __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
  a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
-  a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) );  \
+  a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
 } while(0)

 #endif
--- a/algo/lyra2/lyra2z330.c
+++ b/algo/lyra2/lyra2z330.c
@@ -3,7 +3,7 @@
 #include "lyra2.h"
 #include "simd-utils.h"

-__thread uint64_t* lyra2z330_wholeMatrix;
+static __thread uint64_t* lyra2z330_wholeMatrix;

 void lyra2z330_hash(void *state, const void *input, uint32_t height)
 {
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -146,14 +146,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );

 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
+{ \
+   __m128i t; \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_vrol256_64( s6, s7 ); \
-   mm128_vror256_64( s2, s3 ); \
+   t = mm128_alignr_64( s7, s6, 1 ); \
+   s6 = mm128_alignr_64( s6, s7, 1 ); \
+   s7 = t; \
+   t = mm128_alignr_64( s2, s3, 1 ); \
+   s2 =  mm128_alignr_64( s3, s2, 1 ); \
+   s3 = t; \
   G_2X64( s0, s2, s5, s6 ); \
   G_2X64( s1, s3, s4, s7 ); \
-   mm128_vror256_64( s6, s7 ); \
-   mm128_vrol256_64( s2, s3 );
+   t = mm128_alignr_64( s6, s7, 1 ); \
+   s6 = mm128_alignr_64( s7, s6, 1 ); \
+   s7 = t; \
+   t = mm128_alignr_64( s3, s2, 1 ); \
+   s2 =  mm128_alignr_64( s2, s3, 1 ); \
+   s3 = t; \
+} 

 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -4,24 +4,6 @@
 #include <string.h>
 #include <stdio.h>

-long double lbry_calc_network_diff( struct work *work )
-{
-        // sample for diff 43.281 : 1c05ea29
-        // todo: endian reversed on longpoll could be zr5 specific...
-
-   uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
-   uint32_t bits = (nbits & 0xffffff);
-   int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
-   long double d = (long double)0x0000ffff / (long double)bits;
-
-   for (int m=shift; m < 29; m++) d *= 256.0;
-   for (int m=29; m < shift; m++) d /= 256.0;
-   if (opt_debug_diff)
-      applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
-
-   return d;
-}
-
 // std_le should work but it doesn't
 void lbry_le_build_stratum_request( char *req, struct work *work,
                                      struct stratum_ctx *sctx )
@@ -41,31 +23,6 @@ void lbry_le_build_stratum_request( char *req, struct work *work,
   free(xnonce2str);
 }

-/*
-void lbry_build_block_header( struct work* g_work, uint32_t version,
-                             uint32_t *prevhash, uint32_t *merkle_root,
-                             uint32_t ntime, uint32_t nbits )
-{
-   int i;
-   memset( g_work->data, 0, sizeof(g_work->data) );
-   g_work->data[0] =  version;
-
-   if ( have_stratum )
-      for ( i = 0; i < 8; i++ )
-         g_work->data[1 + i] = le32dec( prevhash + i );
-   else
-      for (i = 0; i < 8; i++)
-         g_work->data[ 8-i ] = le32dec( prevhash + i );
-
-   for ( i = 0; i < 8; i++ )
-      g_work->data[9 + i] = be32dec( merkle_root + i );
-
-   g_work->data[ LBRY_NTIME_INDEX ] = ntime;
-   g_work->data[ LBRY_NBITS_INDEX ] = nbits;
-   g_work->data[28] = 0x80000000;
-}
-*/
-
 void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 {
   unsigned char merkle_root[64] = { 0 };
@@ -112,9 +69,7 @@ bool register_lbry_algo( algo_gate_t* gate )
  gate->hash                  = (void*)&lbry_hash;
  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
 #endif
-  gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
  gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
-//  gate->build_block_header    = (void*)&build_block_header;
  gate->build_extraheader     = (void*)&lbry_build_extraheader;
  gate->ntime_index           = LBRY_NTIME_INDEX;
  gate->nbits_index           = LBRY_NBITS_INDEX;
--- a/algo/scrypt/scrypt-core-4way.c
+++ b/algo/scrypt/scrypt-core-4way.c
@@ -830,7 +830,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N )
   }
 }

-// Working, not up to date, needs stream optimization.
+// Working, not up to date, needs stream, shuffle optimizations.
 // 4x32 interleaving
 static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
 {
@@ -937,46 +937,28 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
 // 4x memory usage
 // Working
 // 4x128 interleaving
-static void salsa_shuffle_4way_simd128( __m512i *X )
+static inline void salsa_shuffle_4way_simd128( __m512i *X )
 {
-   __m512i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
-
-   Y0 = _mm512_mask_blend_epi32( 0x1111, X[1], X[0] );
-   Z0 = _mm512_mask_blend_epi32( 0x4444, X[3], X[2] );
-
-   Y1 = _mm512_mask_blend_epi32( 0x1111, X[2], X[1] );
-   Z1 = _mm512_mask_blend_epi32( 0x4444, X[0], X[3] );
-
-   Y2 = _mm512_mask_blend_epi32( 0x1111, X[3], X[2] );
-   Z2 = _mm512_mask_blend_epi32( 0x4444, X[1], X[0] );
-
-   Y3 = _mm512_mask_blend_epi32( 0x1111, X[0], X[3] );
-   Z3 = _mm512_mask_blend_epi32( 0x4444, X[2], X[1] );
-
-   X[0] = _mm512_mask_blend_epi32( 0x3333, Z0, Y0 );
-   X[1] = _mm512_mask_blend_epi32( 0x3333, Z1, Y1 );
-   X[2] = _mm512_mask_blend_epi32( 0x3333, Z2, Y2 );
-   X[3] = _mm512_mask_blend_epi32( 0x3333, Z3, Y3 );
+  __m512i t0 = _mm512_mask_blend_epi32( 0xaaaa, X[0], X[1] );
+  __m512i t1 = _mm512_mask_blend_epi32( 0x5555, X[0], X[1] );
+  __m512i t2 = _mm512_mask_blend_epi32( 0xaaaa, X[2], X[3] );
+  __m512i t3 = _mm512_mask_blend_epi32( 0x5555, X[2], X[3] );
+  X[0] = _mm512_mask_blend_epi32( 0xcccc, t0, t2 );
+  X[1] = _mm512_mask_blend_epi32( 0x6666, t1, t3 );
+  X[2] = _mm512_mask_blend_epi32( 0x3333, t0, t2 );
+  X[3] = _mm512_mask_blend_epi32( 0x9999, t1, t3 );
 }

-static void salsa_unshuffle_4way_simd128( __m512i *X )
+static inline void salsa_unshuffle_4way_simd128( __m512i *X )
 {
-   __m512i Y0, Y1, Y2, Y3;
-
-   Y0 = _mm512_mask_blend_epi32( 0x8888, X[0], X[1] );
-   Y1 = _mm512_mask_blend_epi32( 0x1111, X[0], X[1] );
-   Y2 = _mm512_mask_blend_epi32( 0x2222, X[0], X[1] );
-   Y3 = _mm512_mask_blend_epi32( 0x4444, X[0], X[1] );
-
-   Y0 = _mm512_mask_blend_epi32( 0x4444, Y0, X[2] );
-   Y1 = _mm512_mask_blend_epi32( 0x8888, Y1, X[2] );
-   Y2 = _mm512_mask_blend_epi32( 0x1111, Y2, X[2] );
-   Y3 = _mm512_mask_blend_epi32( 0x2222, Y3, X[2] );
-
-   X[0] = _mm512_mask_blend_epi32( 0x2222, Y0, X[3] );
-   X[1] = _mm512_mask_blend_epi32( 0x4444, Y1, X[3] );
-   X[2] = _mm512_mask_blend_epi32( 0x8888, Y2, X[3] );
-   X[3] = _mm512_mask_blend_epi32( 0x1111, Y3, X[3] );
+  __m512i t0 = _mm512_mask_blend_epi32( 0xcccc, X[0], X[2] );
+  __m512i t1 = _mm512_mask_blend_epi32( 0x3333, X[0], X[2] );
+  __m512i t2 = _mm512_mask_blend_epi32( 0x6666, X[1], X[3] );
+  __m512i t3 = _mm512_mask_blend_epi32( 0x9999, X[1], X[3] );
+  X[0] = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 );
+  X[1] = _mm512_mask_blend_epi32( 0x5555, t0, t2 );
+  X[2] = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 );
+  X[3] = _mm512_mask_blend_epi32( 0x5555, t1, t3 );
 }

 static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C)
@@ -1147,46 +1129,28 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N )
 // { l1xb, l1xa, l1c9, l1x8,   l0xb, l0xa, l0x9, l0x8 }   b[1]  B[23:16]
 // { l1xf, l1xe, l1xd, l1xc,   l0xf, l0xe, l0xd, l0xc }   b[0]  B[31:24]

-static void salsa_shuffle_2way_simd128( __m256i *X )
+static inline void salsa_shuffle_2way_simd128( __m256i *X )
 {
-   __m256i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
-
-   Y0 = _mm256_blend_epi32( X[1], X[0], 0x11 );
-   Z0 = _mm256_blend_epi32( X[3], X[2], 0x44 );
-
-   Y1 = _mm256_blend_epi32( X[2], X[1], 0x11 );
-   Z1 = _mm256_blend_epi32( X[0], X[3], 0x44 );
-
-   Y2 = _mm256_blend_epi32( X[3], X[2], 0x11 );
-   Z2 = _mm256_blend_epi32( X[1], X[0], 0x44 );
-
-   Y3 = _mm256_blend_epi32( X[0], X[3], 0x11 );
-   Z3 = _mm256_blend_epi32( X[2], X[1], 0x44 );
-
-   X[0] = _mm256_blend_epi32( Z0, Y0, 0x33 );
-   X[1] = _mm256_blend_epi32( Z1, Y1, 0x33 );
-   X[2] = _mm256_blend_epi32( Z2, Y2, 0x33 );
-   X[3] = _mm256_blend_epi32( Z3, Y3, 0x33 );
+  __m256i t0 = _mm256_blend_epi32( X[0], X[1], 0xaa );
+  __m256i t1 = _mm256_blend_epi32( X[0], X[1], 0x55 );
+  __m256i t2 = _mm256_blend_epi32( X[2], X[3], 0xaa );
+  __m256i t3 = _mm256_blend_epi32( X[2], X[3], 0x55 );
+  X[0] = _mm256_blend_epi32( t0, t2, 0xcc );
+  X[1] = _mm256_blend_epi32( t1, t3, 0x66 );
+  X[2] = _mm256_blend_epi32( t0, t2, 0x33 );
+  X[3] = _mm256_blend_epi32( t1, t3, 0x99 );
 }

-static void salsa_unshuffle_2way_simd128( __m256i *X )
+static inline void salsa_unshuffle_2way_simd128( __m256i *X )
 {
-   __m256i Y0, Y1, Y2, Y3;
-
-   Y0 = _mm256_blend_epi32( X[0], X[1], 0x88 );
-   Y1 = _mm256_blend_epi32( X[0], X[1], 0x11 );
-   Y2 = _mm256_blend_epi32( X[0], X[1], 0x22 );
-   Y3 = _mm256_blend_epi32( X[0], X[1], 0x44 );
-
-   Y0 = _mm256_blend_epi32( Y0, X[2], 0x44 );
-   Y1 = _mm256_blend_epi32( Y1, X[2], 0x88 );
-   Y2 = _mm256_blend_epi32( Y2, X[2], 0x11 );
-   Y3 = _mm256_blend_epi32( Y3, X[2], 0x22 );
-
-   X[0] = _mm256_blend_epi32( Y0, X[3], 0x22 );
-   X[1] = _mm256_blend_epi32( Y1, X[3], 0x44 );
-   X[2] = _mm256_blend_epi32( Y2, X[3], 0x88 );
-   X[3] = _mm256_blend_epi32( Y3, X[3], 0x11 );
+  __m256i t0 = _mm256_blend_epi32( X[0], X[2], 0xcc );
+  __m256i t1 = _mm256_blend_epi32( X[0], X[2], 0x33 );
+  __m256i t2 = _mm256_blend_epi32( X[1], X[3], 0x66 );
+  __m256i t3 = _mm256_blend_epi32( X[1], X[3], 0x99 );
+  X[0] = _mm256_blend_epi32( t0, t2, 0xaa );
+  X[1] = _mm256_blend_epi32( t0, t2, 0x55 );
+  X[2] = _mm256_blend_epi32( t1, t3, 0xaa );
+  X[3] = _mm256_blend_epi32( t1, t3, 0x55 );
 }

 static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C)
@@ -2163,7 +2127,7 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
   X2 = _mm_blend_epi32( B[1], B[0], 0x4 );
   Y3 = _mm_blend_epi32( B[0], B[3], 0x1 );
   X3 = _mm_blend_epi32( B[2], B[1], 0x4 );
-   X0 = _mm_blend_epi32( X0, Y0, 0x3);
+   X0 = _mm_blend_epi32( X0, Y0, 0x3 );
   X1 = _mm_blend_epi32( X1, Y1, 0x3 );
   X2 = _mm_blend_epi32( X2, Y2, 0x3 );
   X3 = _mm_blend_epi32( X3, Y3, 0x3 );
@@ -2311,91 +2275,34 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
 // Double buffered, 2x memory usage
 // No interleaving

-static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
+static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
 {
   __m128i *XA = (__m128i*)xa;
   __m128i *XB = (__m128i*)xb;
-   __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;

 #if defined(__SSE4_1__)

-//   __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
-   __m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3;
-
-#if defined(__AVX2__)
-
-   YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
-   YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
-   ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
-   ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
-
-   YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
-   YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
-   ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
-   ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
-
-   YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
-   YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
-   ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
-   ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
-
-   YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
-   YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
-   ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
-   ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
-
-   XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
-   XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
-
-   XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
-   XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
-
-   XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
-   XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
-
-   XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
-   XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
-
-#else
-
-//  SSE4.1
-
-   YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
-   YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
-   ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
-   ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
-
-   YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
-   YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
-   ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
-   ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
-
-   YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
-   YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
-   ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
-   ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
-
-   YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
-   YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
-   ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
-   ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
-
-   XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
-   XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
-
-   XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
-   XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
-
-   XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
-   XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
-
-   XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
-   XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
-
-#endif  // AVX2 else SSE4_1
+  __m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
+  __m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
+  __m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
+  __m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
+  XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
+  XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
+  XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
+  XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
+  t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
+  t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
+  t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
+  t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
+  XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
+  XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
+  XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
+  XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );

 #else   // SSE2
-  
+
+   __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
+   
   YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
   YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
   YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
@@ -2417,7 +2324,7 @@ static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
 #endif
 }

-static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
+static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
 {

   __m128i *XA = (__m128i*)xa;
@@ -2425,67 +2332,22 @@ static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )

 #if defined(__SSE4_1__)

-   __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
-
-#if defined(__AVX2__)
-
-   YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
-   YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
-   YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
-   YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
-   YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
-   YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
-   YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
-   YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
-
-   YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
-   YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
-   YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
-   YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
-   YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
-   YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
-   YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
-   YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
-
-   XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
-   XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
-   XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
-   XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
-   XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
-   XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
-   XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
-   XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
-
-#else   // SSE4_1
-
-   YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
-   YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
-   YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
-   YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
-   YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
-   YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
-   YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
-   YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
-
-   YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
-   YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
-   YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
-   YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
-   YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
-   YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
-   YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
-   YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
-
-   XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
-   XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
-   XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
-   XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
-   XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
-   XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
-   XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
-   XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
-
-#endif  // AVX2 else SSE4_1
+  __m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
+  __m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
+  __m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
+  __m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
+  XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
+  XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
+  XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
+  XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
+  t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
+  t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
+  t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
+  t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
+  XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
+  XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
+  XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
+  XB[3] = _mm_blend_epi16( t1, t3, 0x33 );

 #else  // SSE2

@@ -2690,116 +2552,44 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
 }


-static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
+static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
                                        uint32_t *xc )
 {
   __m128i *XA = (__m128i*)xa;
   __m128i *XB = (__m128i*)xb;
   __m128i *XC = (__m128i*)xc;
-   __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;

 #if defined(__SSE4_1__)

-   __m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3, ZC0, ZC1, ZC2, ZC3;
-
-#if defined(__AVX2__)
-
-   YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
-   YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
-   YC0 = _mm_blend_epi32( XC[1], XC[0], 0x1 );
-   ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
-   ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
-   ZC0 = _mm_blend_epi32( XC[3], XC[2], 0x4 );
-
-   YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
-   YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
-   YC1 = _mm_blend_epi32( XC[2], XC[1], 0x1 );
-   ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
-   ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
-   ZC1 = _mm_blend_epi32( XC[0], XC[3], 0x4 );
-
-   YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
-   YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
-   YC2 = _mm_blend_epi32( XC[3], XC[2], 0x1 );
-   ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
-   ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
-   ZC2 = _mm_blend_epi32( XC[1], XC[0], 0x4 );
-
-   YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
-   YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
-   YC3 = _mm_blend_epi32( XC[0], XC[3], 0x1 );
-   ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
-   ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
-   ZC3 = _mm_blend_epi32( XC[2], XC[1], 0x4 );
-
-   XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
-   XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
-   XC[0] = _mm_blend_epi32( ZC0, YC0, 0x3 );
-
-   XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
-   XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
-   XC[1] = _mm_blend_epi32( ZC1, YC1, 0x3 );
-
-   XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
-   XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
-   XC[2] = _mm_blend_epi32( ZC2, YC2, 0x3 );
-
-   XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
-   XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
-   XC[3] = _mm_blend_epi32( ZC3, YC3, 0x3 );
-
-#else   
-
-//  SSE4.1
-
-   YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
-   YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
-   YC0 = _mm_blend_epi16( XC[1], XC[0], 0x03 );
-   ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
-   ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
-   ZC0 = _mm_blend_epi16( XC[3], XC[2], 0x30 );
-
-   YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
-   YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
-   YC1 = _mm_blend_epi16( XC[2], XC[1], 0x03 );
-   ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
-   ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
-   ZC1 = _mm_blend_epi16( XC[0], XC[3], 0x30 );
-
-   YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
-   YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
-   YC2 = _mm_blend_epi16( XC[3], XC[2], 0x03 );
-   ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
-   ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
-   ZC2 = _mm_blend_epi16( XC[1], XC[0], 0x30 );
-
-   YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
-   YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
-   YC3 = _mm_blend_epi16( XC[0], XC[3], 0x03 );
-   ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
-   ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
-   ZC3 = _mm_blend_epi16( XC[2], XC[1], 0x30 );
-
-   XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
-   XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
-   XC[0] = _mm_blend_epi16( ZC0, YC0, 0x0f );
-
-   XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
-   XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
-   XC[1] = _mm_blend_epi16( ZC1, YC1, 0x0f );
-
-   XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
-   XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
-   XC[2] = _mm_blend_epi16( ZC2, YC2, 0x0f );
-
-   XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
-   XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
-   XC[3] = _mm_blend_epi16( ZC3, YC3, 0x0f );
-
-#endif  // AVX2 else SSE4_1
+  __m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
+  __m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
+  __m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
+  __m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
+  XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
+  XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
+  XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
+  XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
+  t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
+  t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
+  t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
+  t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
+  XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
+  XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
+  XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
+  XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
+  t0 = _mm_blend_epi16( XC[0], XC[1], 0xcc );
+  t1 = _mm_blend_epi16( XC[0], XC[1], 0x33 );
+  t2 = _mm_blend_epi16( XC[2], XC[3], 0xcc );
+  t3 = _mm_blend_epi16( XC[2], XC[3], 0x33 );
+  XC[0] = _mm_blend_epi16( t0, t2, 0xf0 );
+  XC[1] = _mm_blend_epi16( t1, t3, 0x3c );
+  XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
+  XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );

 #else   // SSE2

+   __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
+
   YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
   YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
   YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] );
@@ -2829,7 +2619,7 @@ static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
 #endif
 }

-static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
+static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
                                          uint32_t* xc )
 {
   __m128i *XA = (__m128i*)xa;
@@ -2838,91 +2628,30 @@ static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,

 #if defined(__SSE4_1__)

-   __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
-
-#if defined(__AVX2__)
-
-   YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
-   YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
-   YC0 = _mm_blend_epi32( XC[0], XC[1], 0x8 );
-   YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
-   YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
-   YC1 = _mm_blend_epi32( XC[0], XC[1], 0x1 );
-   YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
-   YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
-   YC2 = _mm_blend_epi32( XC[0], XC[1], 0x2 );
-   YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
-   YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
-   YC3 = _mm_blend_epi32( XC[0], XC[1], 0x4 );
-
-   YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
-   YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
-   YC0 = _mm_blend_epi32( YC0, XC[2], 0x4 );
-   YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
-   YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
-   YC1 = _mm_blend_epi32( YC1, XC[2], 0x8 );
-   YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
-   YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
-   YC2 = _mm_blend_epi32( YC2, XC[2], 0x1 );
-   YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
-   YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
-   YC3 = _mm_blend_epi32( YC3, XC[2], 0x2 );
-
-   XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
-   XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
-   XC[0] = _mm_blend_epi32( YC0, XC[3], 0x2 );
-   XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
-   XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
-   XC[1] = _mm_blend_epi32( YC1, XC[3], 0x4 );
-   XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
-   XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
-   XC[2] = _mm_blend_epi32( YC2, XC[3], 0x8 );
-   XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
-   XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
-   XC[3] = _mm_blend_epi32( YC3, XC[3], 0x1 );
-
-#else   // SSE4_1
-
-   YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
-   YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
-   YC0 = _mm_blend_epi16( XC[0], XC[1], 0xc0 );
-   YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
-   YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
-   YC1 = _mm_blend_epi16( XC[0], XC[1], 0x03 );
-   YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
-   YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
-   YC2 = _mm_blend_epi16( XC[0], XC[1], 0x0c );
-   YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
-   YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
-   YC3 = _mm_blend_epi16( XC[0], XC[1], 0x30 );
-
-   YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
-   YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
-   YC0 = _mm_blend_epi16( YC0, XC[2], 0x30 );
-   YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
-   YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
-   YC1 = _mm_blend_epi16( YC1, XC[2], 0xc0 );
-   YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
-   YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
-   YC2 = _mm_blend_epi16( YC2, XC[2], 0x03 );
-   YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
-   YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
-   YC3 = _mm_blend_epi16( YC3, XC[2], 0x0c );
-
-   XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
-   XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
-   XC[0] = _mm_blend_epi16( YC0, XC[3], 0x0c );
-   XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
-   XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
-   XC[1] = _mm_blend_epi16( YC1, XC[3], 0x30 );
-   XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
-   XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
-   XC[2] = _mm_blend_epi16( YC2, XC[3], 0xc0 );
-   XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
-   XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
-   XC[3] = _mm_blend_epi16( YC3, XC[3], 0x03 );
-
-#endif  // AVX2 else SSE4_1
+  __m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
+  __m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
+  __m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
+  __m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
+  XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
+  XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
+  XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
+  XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
+  t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
+  t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
+  t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
+  t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
+  XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
+  XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
+  XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
+  XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
+  t0 = _mm_blend_epi16( XC[0], XC[2], 0xf0 );
+  t1 = _mm_blend_epi16( XC[0], XC[2], 0x0f );
+  t2 = _mm_blend_epi16( XC[1], XC[3], 0x3c );
+  t3 = _mm_blend_epi16( XC[1], XC[3], 0xc3 );
+  XC[0] = _mm_blend_epi16( t0, t2, 0xcc );
+  XC[1] = _mm_blend_epi16( t0, t2, 0x33 );
+  XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
+  XC[3] = _mm_blend_epi16( t1, t3, 0x33 );

 #else  // SSE2

--- a/algo/sha/md-helper-4way.c
+++ b/algo/sha/md-helper-4way.c
@@ -1,270 +0,0 @@
-/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
-/*
- * This file contains some functions which implement the external data
- * handling and padding for Merkle-Damgard hash functions which follow
- * the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
- *
- * API: this file is meant to be included, not compiled as a stand-alone
- * file. Some macros must be defined:
- *   RFUN   name for the round function
- *   HASH   "short name" for the hash function
- *   BE32   defined for big-endian, 32-bit based (e.g. SHA-1)
- *   LE32   defined for little-endian, 32-bit based (e.g. MD5)
- *   BE64   defined for big-endian, 64-bit based (e.g. SHA-512)
- *   LE64   defined for little-endian, 64-bit based (no example yet)
- *   PW01   if defined, append 0x01 instead of 0x80 (for Tiger)
- *   BLEN   if defined, length of a message block (in bytes)
- *   PLW1   if defined, length is defined on one 64-bit word only (for Tiger)
- *   PLW4   if defined, length is defined on four 64-bit words (for WHIRLPOOL)
- *   SVAL   if defined, reference to the context state information
- *
- * BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
- * this is used for instance for Tiger, which works on 64-bit words but
- * uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
- * ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
- * set, then only one word (64 bits) will be used to encode the input
- * message length (in bits), otherwise two words will be used (as in
- * SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
- * not PLW1), four 64-bit words will be used to encode the message length
- * (in bits). Note that regardless of those settings, only 64-bit message
- * lengths are supported (in bits): messages longer than 2 Exabytes will be
- * improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
- * 2 millions Terabytes, which is huge).
- *
- * If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
- * function. This is used for Tiger2, which is identical to Tiger except
- * when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
- * of the 0x01 from original Tiger).
- *
- * The RFUN function is invoked with two arguments, the first pointing to
- * aligned data (as a "const void *"), the second being state information
- * from the context structure. By default, this state information is the
- * "val" field from the context, and this field is assumed to be an array
- * of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
- * from the context structure. The "val" field can have any type, except
- * for the output encoding which assumes that it is an array of "sph_u32"
- * values. By defining NO_OUTPUT, this last step is deactivated; the
- * includer code is then responsible for writing out the hash result. When
- * NO_OUTPUT is defined, the third parameter to the "close()" function is
- * ignored.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- * 
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifdef _MSC_VER
-#pragma warning (disable: 4146)
-#endif
-
-#undef SPH_XCAT
-#define SPH_XCAT(a, b)     SPH_XCAT_(a, b)
-#undef SPH_XCAT_
-#define SPH_XCAT_(a, b)    a ## b
-
-#undef SPH_BLEN
-#undef SPH_WLEN
-#if defined BE64 || defined LE64
-#define SPH_BLEN    128U
-#define SPH_WLEN      8U
-#else
-#define SPH_BLEN     64U
-#define SPH_WLEN      4U
-#endif
-
-#ifdef BLEN
-#undef SPH_BLEN
-#define SPH_BLEN    BLEN
-#endif
-
-#undef SPH_MAXPAD
-#if defined PLW1
-#define SPH_MAXPAD   (SPH_BLEN - SPH_WLEN)
-#elif defined PLW4
-#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 2))
-#else
-#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 1))
-#endif
-
-#undef SPH_VAL
-#undef SPH_NO_OUTPUT
-#ifdef SVAL
-#define SPH_VAL         SVAL
-#define SPH_NO_OUTPUT   1
-#else
-#define SPH_VAL   sc->val
-#endif
-
-#ifndef CLOSE_ONLY
-
-#ifdef SPH_UPTR
-static void
-SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
-#else
-void
-HASH ( void *cc, const void *data, size_t len )
-#endif
-{
-   SPH_XCAT( HASH, _context ) *sc;
-   __m256i *vdata = (__m256i*)data;
-   size_t ptr;
-
-   sc = cc;
-   ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
-   while ( len > 0 )
-   {
-      size_t clen;
-      clen = SPH_BLEN - ptr;
-      if ( clen > len )
-         clen = len;
-      memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
-      vdata = vdata + (clen>>3);
-      ptr += clen;
-      len -= clen;
-      if ( ptr == SPH_BLEN )
-      {
-         RFUN( sc->buf, SPH_VAL );
-         ptr = 0;
-      }
-         sc->count += clen;
-   }
-}
-
-#ifdef SPH_UPTR
-void
-HASH (void *cc, const void *data, size_t len)
-{
-   SPH_XCAT(HASH, _context) *sc;
-   __m256i *vdata = (__m256i*)data;
-   unsigned ptr;
-
-   if ( len < (2 * SPH_BLEN) )
-   {
-      SPH_XCAT(HASH, _short)(cc, data, len);
-      return;
-   }
-   sc = cc;
-   ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
-   if ( ptr > 0 )
-   {
-      unsigned t;
-      t = SPH_BLEN - ptr;
-      SPH_XCAT( HASH, _short )( cc, data, t );
-      vdata = vdata + (t>>3);
-      len -= t;
-   }
-   SPH_XCAT( HASH, _short )( cc, data, len );
-}
-#endif
-
-#endif
-
-/*
- * Perform padding and produce result. The context is NOT reinitialized
- * by this function.
- */
-static void
-SPH_XCAT( HASH, _addbits_and_close )(void *cc, 	unsigned ub, unsigned n,
-          void *dst, unsigned rnum )
-{
-    SPH_XCAT(HASH, _context) *sc;
-    unsigned ptr, u;
-    sc = cc;
-    ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
-
-#ifdef PW01
-    sc->buf[ptr>>3] = m256_const1_64( 0x100 >> 8 );
-#else
-    sc->buf[ptr>>3] = m256_const1_64( 0x80 );
-#endif
-    ptr += 8;
-
-    if ( ptr > SPH_MAXPAD )
-    {
-         memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
-         RFUN( sc->buf, SPH_VAL );
-         memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
-    }
-    else
-    {
-         memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
-    }
-#if defined BE64
-#if defined PLW1
-    sc->buf[ SPH_MAXPAD>>3 ] =
-                 mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
-#elif defined PLW4
-    memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
-    sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
-                mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
-    sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
-                mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
-#else
-    sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
-               mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
-    sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
-               mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
-#endif  // PLW
-#else  // LE64
-#if defined PLW1
-    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
-#elif defined PLW4
-    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
-    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
-                       _mm256_set1_epi64x( c->count >> 61 );
-    memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
-                       2 * SPH_WLEN );
-#else
-    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
-    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
-                          _mm256_set1_epi64x( sc->count >> 61 );
-#endif // PLW
-
-#endif // LE64
-
-    RFUN( sc->buf, SPH_VAL );
-
-#ifdef SPH_NO_OUTPUT
-    (void)dst;
-    (void)rnum;
-    (void)u;
-#else
-    for ( u = 0; u < rnum; u ++ )
-    {
-#if defined BE64
-       ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
-#else  // LE64
-       ((__m256i*)dst)[u] = sc->val[u];
-#endif
-    }
-#endif
-}
-
-static void
-SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
-{
-   SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
-}
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -33,6 +33,7 @@
 #include <stddef.h>
 #include <string.h>

+// 4way is only used with AVX2, 8way only with AVX512, 16way is not needed.
 #ifdef __SSE4_1__

 #include "shabal-hash-4way.h"
@@ -44,21 +45,6 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif

-/*
- * Part of this code was automatically generated (the part between
- * the "BEGIN" and "END" markers).
- */
-
-#define sM    16
-
-#define C32   SPH_C32
-#define T32   SPH_T32
-
-#define O1   13
-#define O2    9
-#define O3    6
-
-
 #if defined(__AVX2__)

 #define DECL_STATE8   \
@@ -310,72 +296,71 @@ do { \
    mm256_swap512_256( BF, CF ); \
 } while (0)

-#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
+#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 do { \
-   xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256(  \
-            _mm256_andnot_si256( xb3, xb2 ), \
-            _mm256_mullo_epi32( mm256_xor3( xa0, xc, \
-               _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
-                                   FIVE ) ), THREE ) ) ); \
+   xa0 = mm256_xor3( xm, xb1, mm256_xorandnot(  \
+           _mm256_mullo_epi32( mm256_xor3( xa0, xc, \
+              _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
+           xb3, xb2 ) ); \
   xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
 } while (0)

 #define PERM_STEP_0_8   do { \
-      PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \
-      PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \
-      PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \
-      PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \
-      PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \
-      PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \
-      PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \
-      PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \
-      PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \
-      PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \
-      PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \
-      PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \
-      PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \
-      PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \
-      PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \
-      PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \
-   } while (0)
+      PERM_ELT8( A0, AB, B0, BD, B9, B6, C8, M0 ); \
+      PERM_ELT8( A1, A0, B1, BE, BA, B7, C7, M1 ); \
+      PERM_ELT8( A2, A1, B2, BF, BB, B8, C6, M2 ); \
+      PERM_ELT8( A3, A2, B3, B0, BC, B9, C5, M3 ); \
+      PERM_ELT8( A4, A3, B4, B1, BD, BA, C4, M4 ); \
+      PERM_ELT8( A5, A4, B5, B2, BE, BB, C3, M5 ); \
+      PERM_ELT8( A6, A5, B6, B3, BF, BC, C2, M6 ); \
+      PERM_ELT8( A7, A6, B7, B4, B0, BD, C1, M7 ); \
+      PERM_ELT8( A8, A7, B8, B5, B1, BE, C0, M8 ); \
+      PERM_ELT8( A9, A8, B9, B6, B2, BF, CF, M9 ); \
+      PERM_ELT8( AA, A9, BA, B7, B3, B0, CE, MA ); \
+      PERM_ELT8( AB, AA, BB, B8, B4, B1, CD, MB ); \
+      PERM_ELT8( A0, AB, BC, B9, B5, B2, CC, MC ); \
+      PERM_ELT8( A1, A0, BD, BA, B6, B3, CB, MD ); \
+      PERM_ELT8( A2, A1, BE, BB, B7, B4, CA, ME ); \
+      PERM_ELT8( A3, A2, BF, BC, B8, B5, C9, MF ); \
+} while (0)

 #define PERM_STEP_1_8   do { \
-      PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \
-      PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \
-      PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \
-      PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \
-      PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \
-      PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \
-      PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \
-      PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \
-      PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \
-      PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \
-      PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \
-      PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \
-      PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \
-      PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \
-      PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \
-      PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \
-   } while (0)
+      PERM_ELT8( A4, A3, B0, BD, B9, B6, C8, M0 ); \
+      PERM_ELT8( A5, A4, B1, BE, BA, B7, C7, M1 ); \
+      PERM_ELT8( A6, A5, B2, BF, BB, B8, C6, M2 ); \
+      PERM_ELT8( A7, A6, B3, B0, BC, B9, C5, M3 ); \
+      PERM_ELT8( A8, A7, B4, B1, BD, BA, C4, M4 ); \
+      PERM_ELT8( A9, A8, B5, B2, BE, BB, C3, M5 ); \
+      PERM_ELT8( AA, A9, B6, B3, BF, BC, C2, M6 ); \
+      PERM_ELT8( AB, AA, B7, B4, B0, BD, C1, M7 ); \
+      PERM_ELT8( A0, AB, B8, B5, B1, BE, C0, M8 ); \
+      PERM_ELT8( A1, A0, B9, B6, B2, BF, CF, M9 ); \
+      PERM_ELT8( A2, A1, BA, B7, B3, B0, CE, MA ); \
+      PERM_ELT8( A3, A2, BB, B8, B4, B1, CD, MB ); \
+      PERM_ELT8( A4, A3, BC, B9, B5, B2, CC, MC ); \
+      PERM_ELT8( A5, A4, BD, BA, B6, B3, CB, MD ); \
+      PERM_ELT8( A6, A5, BE, BB, B7, B4, CA, ME ); \
+      PERM_ELT8( A7, A6, BF, BC, B8, B5, C9, MF ); \
+} while (0)

 #define PERM_STEP_2_8   do { \
-      PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \
-      PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \
-      PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \
-      PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \
-      PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \
-      PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \
-      PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \
-      PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \
-      PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \
-      PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \
-      PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \
-      PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \
-      PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \
-      PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \
-      PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \
-      PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \
-   } while (0)
+      PERM_ELT8( A8, A7, B0, BD, B9, B6, C8, M0 ); \
+      PERM_ELT8( A9, A8, B1, BE, BA, B7, C7, M1 ); \
+      PERM_ELT8( AA, A9, B2, BF, BB, B8, C6, M2 ); \
+      PERM_ELT8( AB, AA, B3, B0, BC, B9, C5, M3 ); \
+      PERM_ELT8( A0, AB, B4, B1, BD, BA, C4, M4 ); \
+      PERM_ELT8( A1, A0, B5, B2, BE, BB, C3, M5 ); \
+      PERM_ELT8( A2, A1, B6, B3, BF, BC, C2, M6 ); \
+      PERM_ELT8( A3, A2, B7, B4, B0, BD, C1, M7 ); \
+      PERM_ELT8( A4, A3, B8, B5, B1, BE, C0, M8 ); \
+      PERM_ELT8( A5, A4, B9, B6, B2, BF, CF, M9 ); \
+      PERM_ELT8( A6, A5, BA, B7, B3, B0, CE, MA ); \
+      PERM_ELT8( A7, A6, BB, B8, B4, B1, CD, MB ); \
+      PERM_ELT8( A8, A7, BC, B9, B5, B2, CC, MC ); \
+      PERM_ELT8( A9, A8, BD, BA, B6, B3, CB, MD ); \
+      PERM_ELT8( AA, A9, BE, BB, B7, B4, CA, ME ); \
+      PERM_ELT8( AB, AA, BF, BC, B8, B5, C9, MF ); \
+} while (0)

 #define APPLY_P8 \
 do { \
@@ -437,8 +422,8 @@ do { \
 } while (0)

 #define INCR_W8   do { \
-      if ((Wlow = T32(Wlow + 1)) == 0) \
-         Whigh = T32(Whigh + 1); \
+      if ( ( Wlow = Wlow + 1 ) == 0 ) \
+         Whigh = Whigh + 1; \
   } while (0)

 static void
@@ -650,15 +635,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
   shabal_8way_close(cc, ub, n, dst, 16);
 }

-
 #endif  // AVX2

-/*
- * We copy the state into local variables, so that the compiler knows
- * that it can optimize them at will.
- */
-
-
 #define DECL_STATE   \
 	__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
 	        A8, A9, AA, AB; \
@@ -888,15 +866,6 @@ do { \
   A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
 } while (0)

-
-/*
-#define SWAP(v1, v2)   do { \
-		sph_u32 tmp = (v1); \
-		(v1) = (v2); \
-		(v2) = tmp; \
-	} while (0)
-*/
-
 #define SWAP_BC \
 do { \
    mm128_swap256_128( B0, C0 ); \
@@ -917,18 +886,6 @@ do { \
    mm128_swap256_128( BF, CF ); \
 } while (0)

-/*
-#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
-do { \
-  __m128i t1 = _mm_mullo_epi32(  mm_rol_32( xa1, 15 ),\
-                                   _mm_set1_epi32(5UL) ) \
-  __m128i t2 = _mm_xor_si128( xa0, xc ); \
-  xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
-  xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
-              _mm_xor_si128( t2, \
-                      _mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
-*/
-
 #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
 do { \
   xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128(  \
@@ -1056,8 +1013,8 @@ do { \
 } while (0)

 #define INCR_W   do { \
-		if ((Wlow = T32(Wlow + 1)) == 0) \
-			Whigh = T32(Whigh + 1); \
+		if ( ( Wlow = Wlow + 1 ) == 0 ) \
+			Whigh = Whigh + 1; \
 	} while (0)

 /*
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -75,7 +75,6 @@ void shabal512_8way_close( void *cc, void *dst );
 void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst );

-
 #endif

 typedef struct {
@@ -97,7 +96,6 @@ void shabal256_4way_addbits_and_close(	void *cc, unsigned ub, unsigned n,

 void shabal512_4way_init( void *cc );
 void shabal512_4way_update( void *cc, const void *data, size_t len );
-//#define shabal512_4way shabal512_4way_update
 void shabal512_4way_close( void *cc, void *dst );
 void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst );
--- a/algo/skein/skein-hash-4way.c
+++ b/algo/skein/skein-hash-4way.c
@@ -1106,8 +1106,7 @@ skein256_4way_close(void *cc, void *dst)
 }


-
-// Do not use with 128 bit data
+// Broken for 80 & 128 bytes, use prehash or full
 void
 skein512_4way_update(void *cc, const void *data, size_t len)
 {
--- a/algo/skein/skein.c
+++ b/algo/skein/skein.c
@@ -31,18 +31,19 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t n = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;

   swab32_array( endiandata, pdata, 20 );

 	do {
 		be32enc(&endiandata[19], n); 
 		skeinhash(hash64, endiandata);
-		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
-			*hashes_done = n - first_nonce + 1;
-			pdata[19] = n;
-			return true;
-		}
+      if (hash64[7] <= Htarg )
+      if ( fulltest(hash64, ptarget) && !opt_benchmark )
+      {
+         pdata[19] = n;
+         submit_solution( work, hash64, mythr );
+      }
 		n++;

 	} while (n < max_nonce && !work_restart[thr_id].restart);
--- a/algo/skein/skein2.c
+++ b/algo/skein/skein2.c
@@ -34,31 +34,31 @@ void skein2hash(void *output, const void *input)
 	sph_skein512_close(&ctx_skein, hash);

 	memcpy(output, hash, 32);
-
 }

 int scanhash_skein2( struct work *work,	uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	uint32_t hash64[8] __attribute__ ((aligned (64)));
 	uint32_t endiandata[20] __attribute__ ((aligned (64)));
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t n = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id; 

-        swab32_array( endiandata, pdata, 20 );
+   swab32_array( endiandata, pdata, 20 );

 	do {
 		be32enc(&endiandata[19], n);
 		skein2hash(hash64, endiandata);
-		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
-			*hashes_done = n - first_nonce + 1;
-			pdata[19] = n;
-			return true;
-		}
+      if (hash64[7] <= Htarg )
+      if ( fulltest(hash64, ptarget) && !opt_benchmark )
+      {
+         pdata[19] = n;
+         submit_solution( work, hash64, mythr );
+      }
 		n++;

 	} while (n < max_nonce && !work_restart[thr_id].restart);
--- a/algo/x11/timetravel-4way.c
+++ b/algo/x11/timetravel-4way.c
@@ -112,8 +112,9 @@ void timetravel_4way_hash(void *output, const void *input)
              intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 3:
-           skein512_4way_update( &ctx.skein, vhashA, dataLen );
-           skein512_4way_close( &ctx.skein, vhashB );
+           skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen );
+//           skein512_4way_update( &ctx.skein, vhashA, dataLen );
+//           skein512_4way_close( &ctx.skein, vhashB );
           if ( i == 7 )
              dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
        break;
--- a/algo/x11/timetravel10-4way.c
+++ b/algo/x11/timetravel10-4way.c
@@ -118,8 +118,9 @@ void timetravel10_4way_hash(void *output, const void *input)
              intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 3:
-           skein512_4way_update( &ctx.skein, vhashA, dataLen );
-           skein512_4way_close( &ctx.skein, vhashB );
+           skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen );
+//           skein512_4way_update( &ctx.skein, vhashA, dataLen );
+//           skein512_4way_close( &ctx.skein, vhashB );
           if ( i == 9 )
              dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
        break;
--- a/algo/x14/polytimos-4way.c
+++ b/algo/x14/polytimos-4way.c
@@ -33,9 +33,10 @@ void polytimos_4way_hash( void *output, const void *input )
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     poly_4way_context_overlay ctx;

-     skein512_4way_init( &ctx.skein );
-     skein512_4way_update( &ctx.skein, input, 80 );
-     skein512_4way_close( &ctx.skein, vhash );
+     skein512_4way_full( &ctx.skein, vhash, input, 80 );
+//     skein512_4way_init( &ctx.skein );
+//     skein512_4way_update( &ctx.skein, input, 80 );
+//     skein512_4way_close( &ctx.skein, vhash );

     // Need to convert from 64 bit interleaved to 32 bit interleaved.
     uint32_t vhash32[16*4];
--- a/algo/x14/veltor-4way.c
+++ b/algo/x14/veltor-4way.c
@@ -38,8 +38,10 @@ void veltor_4way_hash( void *output, const void *input )
     veltor_4way_ctx_holder ctx __attribute__ ((aligned (64)));
     memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) );

-     skein512_4way_update( &ctx.skein, input, 80 );
-     skein512_4way_close( &ctx.skein, vhash );
+//     skein512_4way_update( &ctx.skein, input, 80 );
+//     skein512_4way_close( &ctx.skein, vhash );
+
+     skein512_4way_full( &ctx.skein, vhash, input, 80 );
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     sph_shavite512( &ctx.shavite, hash0, 64 );
@@ -105,7 +107,7 @@ int scanhash_veltor_4way( struct work *work, uint32_t max_nonce,
         pdata[19] = n;

         for ( int i = 0; i < 4; i++ )
-         if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+         if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) && ! opt_benchmark )
         {
            pdata[19] = n+i;
            submit_solution( work, hash+(i<<3), mythr );
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -198,7 +198,7 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
   {
       char* data;
       data = (char*)malloc( 2 + strlen( denom10_str ) * 4 + 16 * 4
-                             + strlen( merkleroot_str ) * 3 );
+                             + strlen( merkleroot_str ) * 3 + 1 );
       // Build the block header veildatahash in hex
       sprintf( data, "%s%s%s%s%s%s%s%s%s%s%s%s",
                       merkleroot_str, witmerkleroot_str, "04",
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -257,6 +257,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
   const __m512i eight = m512_const1_64( 8 );
   const bool bench = opt_benchmark;

+   // convert LE32 to LE64
   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
   edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
   edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
@@ -470,6 +471,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
   const __m256i four = m256_const1_64( 4 );
   const bool bench = opt_benchmark;

+   // convert LE32 to LE64
   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
   edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
   edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
--- a/algo/yespower/yespower-opt.c
+++ b/algo/yespower/yespower-opt.c
@@ -71,6 +71,11 @@
 */
 #undef USE_SSE4_FOR_32BIT

+// AVX512 is slow. There isn't enough AVX512 code to make up
+// for the reduced clock. AVX512VL, used for rotate & ternary logic on smaller
+// vectors, is exempt.
+//#define YESPOWER_USE_AVX512 1
+
 #ifdef __SSE2__
 /*
 * GCC before 4.9 would by default unnecessarily use store/load (without
@@ -124,18 +129,96 @@
 #endif

 typedef union {
-	uint32_t w[16];
-	uint64_t d[8];
+	uint32_t d[16];
+	uint64_t q[8];
 #ifdef __SSE2__
-	__m128i q[4];
+	__m128i m128[4];
+#endif
+#if defined(__AVX2__)
+   __m256i m256[2];
+#endif
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+   __m512i m512;
 #endif
 } salsa20_blk_t;

+#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+// Slow
+
+static const __m512i simd_shuffle_index = 
+   { 0x0000000500000000, 0x0000000f0000000a,
+     0x0000000900000004, 0x000000030000000e,
+     0x0000000d00000008, 0x0000000700000002,
+     0x000000010000000c, 0x0000000b00000006 };
+static const __m512i simd_unshuffle_index =
+   { 0x0000000d00000000, 0x000000070000000a,
+     0x0000000100000004, 0x0000000b0000000e,
+     0x0000000500000008, 0x0000000f00000002,
+     0x000000090000000c, 0x0000000300000006 };
+
+#elif defined(__AVX2__)
+
+#if defined(__AVX512VL__)
+// alternative when not using 512 bit vectors
+
+static const __m256i simd_shuffle_index =
+   { 0x0000000500000000, 0x0000000f0000000a,
+     0x0000000900000004, 0x000000030000000e };
+static const __m256i simd_unshuffle_index =
+   { 0x0000000d00000000, 0x000000070000000a,
+     0x0000000100000004, 0x0000000b0000000e };
+
+#else
+
+static const __m256i simd_shuffle_index =
+   { 0x0000000500000000, 0x0000000700000002,
+     0x0000000100000004, 0x0000000300000006 };
+// same index for unshuffle
+
+#endif
+
+#endif
+
 static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin,
    salsa20_blk_t *Bout)
 {
+#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  
+  Bout->m512 = _mm512_permutexvar_epi32( simd_shuffle_index, Bin->m512 );
+
+#elif defined(__AVX2__)
+
+#if defined(__AVX512VL__)
+
+  Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_shuffle_index,
+                                             Bin->m256[1] );
+  Bout->m256[1] = _mm256_permutex2var_epi32( Bin->m256[1], simd_shuffle_index,
+                                             Bin->m256[0] );
+  
+#else
+
+  __m256i t0 = _mm256_permutevar8x32_epi32( Bin->m256[0], simd_shuffle_index );
+  __m256i t1 = _mm256_permutevar8x32_epi32( Bin->m256[1], simd_shuffle_index );
+  Bout->m256[0] = _mm256_blend_epi32( t1, t0, 0x93 );
+  Bout->m256[1] = _mm256_blend_epi32( t1, t0, 0x6c );
+  
+#endif
+  
+#elif defined(__SSE4_1__)
+
+  __m128i t0 = _mm_blend_epi16( Bin->m128[0], Bin->m128[1], 0xcc );
+  __m128i t1 = _mm_blend_epi16( Bin->m128[0], Bin->m128[1], 0x33 );
+  __m128i t2 = _mm_blend_epi16( Bin->m128[2], Bin->m128[3], 0xcc );
+  __m128i t3 = _mm_blend_epi16( Bin->m128[2], Bin->m128[3], 0x33 );
+  Bout->m128[0] = _mm_blend_epi16( t0, t2, 0xf0 );
+  Bout->m128[1] = _mm_blend_epi16( t1, t3, 0x3c );
+  Bout->m128[2] = _mm_blend_epi16( t0, t2, 0x0f );
+  Bout->m128[3] = _mm_blend_epi16( t1, t3, 0xc3 );
+
+#else
+
 #define COMBINE(out, in1, in2) \
-	Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32);
+	Bout->q[out] = Bin->d[in1 * 2] | ((uint64_t)Bin->d[in2 * 2 + 1] << 32);
 	COMBINE(0, 0, 2)
 	COMBINE(1, 5, 7)
 	COMBINE(2, 2, 4)
@@ -145,14 +228,51 @@ static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin,
 	COMBINE(6, 6, 0)
 	COMBINE(7, 3, 5)
 #undef COMBINE
+
+#endif   
 }

 static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
    salsa20_blk_t *Bout)
 {
+#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+  Bout->m512 = _mm512_permutexvar_epi32( simd_unshuffle_index, Bin->m512 );    
+
+#elif defined(__AVX2__)
+  
+#if defined(__AVX512VL__)
+  
+  Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_unshuffle_index,
+                                             Bin->m256[1] );
+  Bout->m256[1] = _mm256_permutex2var_epi32( Bin->m256[1], simd_unshuffle_index,
+                                             Bin->m256[0] );
+
+#else  
+
+  __m256i t0 = _mm256_permutevar8x32_epi32( Bin->m256[0], simd_shuffle_index );
+  __m256i t1 = _mm256_permutevar8x32_epi32( Bin->m256[1], simd_shuffle_index );
+  Bout->m256[0] = _mm256_blend_epi32( t1, t0, 0x39 );
+  Bout->m256[1] = _mm256_blend_epi32( t1, t0, 0xc6 );
+
+#endif
+
+#elif defined(__SSE4_1__)
+
+  __m128i t0 = _mm_blend_epi16( Bin->m128[0], Bin->m128[2], 0xf0 );
+  __m128i t1 = _mm_blend_epi16( Bin->m128[0], Bin->m128[2], 0x0f );
+  __m128i t2 = _mm_blend_epi16( Bin->m128[1], Bin->m128[3], 0x3c );
+  __m128i t3 = _mm_blend_epi16( Bin->m128[1], Bin->m128[3], 0xc3 );
+  Bout->m128[0] = _mm_blend_epi16( t0, t2, 0xcc );
+  Bout->m128[1] = _mm_blend_epi16( t0, t2, 0x33 );
+  Bout->m128[2] = _mm_blend_epi16( t1, t3, 0xcc );
+  Bout->m128[3] = _mm_blend_epi16( t1, t3, 0x33 );
+
+#else
+
 #define UNCOMBINE(out, in1, in2) \
-	Bout->w[out * 2] = Bin->d[in1]; \
-	Bout->w[out * 2 + 1] = Bin->d[in2] >> 32;
+	Bout->d[out * 2] = Bin->q[in1]; \
+	Bout->d[out * 2 + 1] = Bin->q[in2] >> 32;
 	UNCOMBINE(0, 0, 6)
 	UNCOMBINE(1, 5, 3)
 	UNCOMBINE(2, 2, 0)
@@ -162,19 +282,14 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
 	UNCOMBINE(6, 6, 4)
 	UNCOMBINE(7, 3, 1)
 #undef UNCOMBINE
+
+#endif
 }

-#ifdef __SSE2__
-
-#define DECL_X \
-	__m128i X0, X1, X2, X3;
-#define DECL_Y \
-	__m128i Y0, Y1, Y2, Y3;
-#define READ_X(in) \
-	X0 = (in).q[0]; X1 = (in).q[1]; X2 = (in).q[2]; X3 = (in).q[3];
 #define WRITE_X(out) \
-	(out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3;
+ (out).m128[0] = X0; (out).m128[1] = X1; (out).m128[2] = X2; (out).m128[3] = X3;

+// Bit rotation optimization
 #if defined(__AVX512VL__)

 #define ARX(out, in1, in2, s) \
@@ -221,203 +336,229 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
 #define SALSA20_wrapper(out, rounds) { \
 	__m128i Z0 = X0, Z1 = X1, Z2 = X2, Z3 = X3; \
 	rounds \
-	(out).q[0] = X0 = _mm_add_epi32(X0, Z0); \
-	(out).q[1] = X1 = _mm_add_epi32(X1, Z1); \
-	(out).q[2] = X2 = _mm_add_epi32(X2, Z2); \
-	(out).q[3] = X3 = _mm_add_epi32(X3, Z3); \
+	(out).m128[0] = X0 = _mm_add_epi32( X0, Z0 ); \
+	(out).m128[1] = X1 = _mm_add_epi32( X1, Z1 ); \
+	(out).m128[2] = X2 = _mm_add_epi32( X2, Z2 ); \
+	(out).m128[3] = X3 = _mm_add_epi32( X3, Z3 ); \
 }

 /**
 * Apply the Salsa20/2 core to the block provided in X.
 */
+// Not called explicitly, aliased to SALSA20
 #define SALSA20_2(out) \
 	SALSA20_wrapper(out, SALSA20_2ROUNDS)

-#define SALSA20_8ROUNDS \
-	SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS
-
 /**
 * Apply the Salsa20/8 core to the block provided in X.
 */
+#define SALSA20_8ROUNDS \
+   SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS
+
 #define SALSA20_8(out) \
 	SALSA20_wrapper(out, SALSA20_8ROUNDS)

 #define XOR_X(in) \
-	X0 = _mm_xor_si128(X0, (in).q[0]); \
-	X1 = _mm_xor_si128(X1, (in).q[1]); \
-	X2 = _mm_xor_si128(X2, (in).q[2]); \
-	X3 = _mm_xor_si128(X3, (in).q[3]);
-
-#define XOR_X_2(in1, in2) \
-	X0 = _mm_xor_si128((in1).q[0], (in2).q[0]); \
-	X1 = _mm_xor_si128((in1).q[1], (in2).q[1]); \
-	X2 = _mm_xor_si128((in1).q[2], (in2).q[2]); \
-	X3 = _mm_xor_si128((in1).q[3], (in2).q[3]);
+	X0 = _mm_xor_si128( X0, (in).m128[0] ); \
+	X1 = _mm_xor_si128( X1, (in).m128[1] ); \
+	X2 = _mm_xor_si128( X2, (in).m128[2] ); \
+	X3 = _mm_xor_si128( X3, (in).m128[3] );

 #define XOR_X_WRITE_XOR_Y_2(out, in) \
-	(out).q[0] = Y0 = _mm_xor_si128((out).q[0], (in).q[0]); \
-	(out).q[1] = Y1 = _mm_xor_si128((out).q[1], (in).q[1]); \
-	(out).q[2] = Y2 = _mm_xor_si128((out).q[2], (in).q[2]); \
-	(out).q[3] = Y3 = _mm_xor_si128((out).q[3], (in).q[3]); \
-	X0 = _mm_xor_si128(X0, Y0); \
-	X1 = _mm_xor_si128(X1, Y1); \
-	X2 = _mm_xor_si128(X2, Y2); \
-	X3 = _mm_xor_si128(X3, Y3);
+	(out).m128[0] = Y0 = _mm_xor_si128( (out).m128[0], (in).m128[0] ); \
+	(out).m128[1] = Y1 = _mm_xor_si128( (out).m128[1], (in).m128[1] ); \
+	(out).m128[2] = Y2 = _mm_xor_si128( (out).m128[2], (in).m128[2] ); \
+	(out).m128[3] = Y3 = _mm_xor_si128( (out).m128[3], (in).m128[3] ); \
+	X0 = _mm_xor_si128( X0, Y0 ); \
+	X1 = _mm_xor_si128( X1, Y1 ); \
+	X2 = _mm_xor_si128( X2, Y2 ); \
+	X3 = _mm_xor_si128( X3, Y3 );

-#define INTEGERIFY _mm_cvtsi128_si32(X0)
-
-#else /* !defined(__SSE2__) */
-
-#define DECL_X \
-	salsa20_blk_t X;
-#define DECL_Y \
-	salsa20_blk_t Y;
-
-#define COPY(out, in) \
-	(out).d[0] = (in).d[0]; \
-	(out).d[1] = (in).d[1]; \
-	(out).d[2] = (in).d[2]; \
-	(out).d[3] = (in).d[3]; \
-	(out).d[4] = (in).d[4]; \
-	(out).d[5] = (in).d[5]; \
-	(out).d[6] = (in).d[6]; \
-	(out).d[7] = (in).d[7];
-
-#define READ_X(in) COPY(X, in)
-#define WRITE_X(out) COPY(out, X)
-
-/**
- * salsa20(B):
- * Apply the Salsa20 core to the provided block.
- */
-static inline void salsa20(salsa20_blk_t *restrict B,
-    salsa20_blk_t *restrict Bout, uint32_t doublerounds)
-{
-	salsa20_blk_t X;
-#define x X.w
-
-	salsa20_simd_unshuffle(B, &X);
-
-	do {
-#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
-		/* Operate on columns */
-		x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
-		x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
-
-		x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
-		x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
-
-		x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
-		x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
-
-		x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
-		x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
-
-		/* Operate on rows */
-		x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
-		x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
-
-		x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
-		x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
-
-		x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
-		x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
-
-		x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
-		x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
-#undef R
-	} while (--doublerounds);
-#undef x
-
-	{
-		uint32_t i;
-		salsa20_simd_shuffle(&X, Bout);
-		for (i = 0; i < 16; i += 4) {
-			B->w[i] = Bout->w[i] += B->w[i];
-			B->w[i + 1] = Bout->w[i + 1] += B->w[i + 1];
-			B->w[i + 2] = Bout->w[i + 2] += B->w[i + 2];
-			B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3];
-		}
-	}
-}
-
-/**
- * Apply the Salsa20/2 core to the block provided in X.
- */
-#define SALSA20_2(out) \
-	salsa20(&X, &out, 1);
-
-/**
- * Apply the Salsa20/8 core to the block provided in X.
- */
-#define SALSA20_8(out) \
-	salsa20(&X, &out, 4);
-
-#define XOR(out, in1, in2) \
-	(out).d[0] = (in1).d[0] ^ (in2).d[0]; \
-	(out).d[1] = (in1).d[1] ^ (in2).d[1]; \
-	(out).d[2] = (in1).d[2] ^ (in2).d[2]; \
-	(out).d[3] = (in1).d[3] ^ (in2).d[3]; \
-	(out).d[4] = (in1).d[4] ^ (in2).d[4]; \
-	(out).d[5] = (in1).d[5] ^ (in2).d[5]; \
-	(out).d[6] = (in1).d[6] ^ (in2).d[6]; \
-	(out).d[7] = (in1).d[7] ^ (in2).d[7];
-
-#define XOR_X(in) XOR(X, X, in)
-#define XOR_X_2(in1, in2) XOR(X, in1, in2)
-#define XOR_X_WRITE_XOR_Y_2(out, in) \
-	XOR(Y, out, in) \
-	COPY(out, Y) \
-	XOR(X, X, Y)
-
-#define INTEGERIFY (uint32_t)X.d[0]
-#endif
+#define INTEGERIFY( X ) _mm_cvtsi128_si32( X )

 // AVX512 ternary logic optimization
 #if defined(__AVX512VL__)

 #define XOR_X_XOR_X( in1, in2 ) \
- X0 =  _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
- X1 =  _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
- X2 =  _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
- X3 =  _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); 
-
-#define XOR_X_2_XOR_X( in1, in2, in3 ) \
- X0 =  _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \
- X1 =  _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \
- X2 =  _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \
- X3 =  _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 );
-
-#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
- X0 =  _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
- X1 =  _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
- X2 =  _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
- X3 =  _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \
- SALSA20(out)
+ X0 =  _mm_ternarylogic_epi32( X0, (in1).m128[0], (in2).m128[0], 0x96 ); \
+ X1 =  _mm_ternarylogic_epi32( X1, (in1).m128[1], (in2).m128[1], 0x96 ); \
+ X2 =  _mm_ternarylogic_epi32( X2, (in1).m128[2], (in2).m128[2], 0x96 ); \
+ X3 =  _mm_ternarylogic_epi32( X3, (in1).m128[3], (in2).m128[3], 0x96 ); 

 #else

 #define XOR_X_XOR_X( in1, in2 ) \
  XOR_X( in1 ) \
-  XOR_X( in2 ) 
-
-#define XOR_X_2_XOR_X( in1, in2, in3 ) \
-   XOR_X_2( in1, in2 ) \
-   XOR_X( in3 )
-
-#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
-   XOR_X(in1) \
-   XOR_X(in2) \
-   SALSA20( out )
+  XOR_X( in2 )

 #endif

-/**
- * Apply the Salsa20 core to the block provided in X ^ in.
- */
+// General vectored optimizations
+#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+#define READ_X( in ) \
+  X.m512 = (in).m512;
+
+#define XOR_X_2_XOR_X( in1, in2, in3 ) \
+ X.m512 = _mm512_ternarylogic_epi32( (in1).m512, (in2).m512, (in3).m512, 0x96 );
+
+#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
+{ \
+ __m128i X0, X1, X2, X3; \
+ X.m512 = _mm512_ternarylogic_epi32( X.m512, (in1).m512, (in2).m512, 0x96 ); \
+ X0 = X.m128[0]; \
+ X1 = X.m128[1]; \
+ X2 = X.m128[2]; \
+ X3 = X.m128[3]; \
+ SALSA20( out ); \
+ X.m128[0] = X0; \
+ X.m128[1] = X1; \
+ X.m128[2] = X2; \
+ X.m128[3] = X3; \
+}
+
 #define SALSA20_XOR_MEM(in, out) \
-	XOR_X(in) \
-	SALSA20(out)
+{ \
+ __m128i X0, X1, X2, X3; \
+ X.m512 = _mm512_xor_si512( X.m512, (in).m512 ); \
+ X0 = X.m128[0]; \
+ X1 = X.m128[1]; \
+ X2 = X.m128[2]; \
+ X3 = X.m128[3]; \
+ SALSA20( out ); \
+ X.m128[0] = X0; \
+ X.m128[1] = X1; \
+ X.m128[2] = X2; \
+ X.m128[3] = X3; \
+}
+
+#elif defined(__AVX2__)
+
+#define READ_X( in ) \
+  X.m256[0] = (in).m256[0]; \
+  X.m256[1] = (in).m256[1];
+
+#if defined(__AVX512VL__)
+
+#define XOR_X_2_XOR_X( in1, in2, in3 ) \
+   X.m256[0] = _mm256_ternarylogic_epi32( (in1).m256[0], (in2).m256[0], \
+                                          (in3).m256[0], 0x96 ); \
+   X.m256[1] = _mm256_ternarylogic_epi32( (in1).m256[1], (in2).m256[1], \
+                                          (in3).m256[1], 0x96 );
+
+#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
+{ \
+   __m128i X0, X1, X2, X3; \
+   X.m256[0] = _mm256_ternarylogic_epi32( X.m256[0], (in1).m256[0], \
+                                      (in2).m256[0], 0x96 ); \
+   X.m256[1] = _mm256_ternarylogic_epi32( X.m256[1], (in1).m256[1], \
+                                      (in2).m256[1], 0x96 ); \
+   X0 = X.m128[0]; \
+   X1 = X.m128[1]; \
+   X2 = X.m128[2]; \
+   X3 = X.m128[3]; \
+   SALSA20( out ); \
+   X.m128[0] = X0; \
+   X.m128[1] = X1; \
+   X.m128[2] = X2; \
+   X.m128[3] = X3; \
+}
+
+#else  // AVX2
+
+#define XOR_X_2_XOR_X( in1, in2, in3 ) \
+   X.m256[0] = _mm256_xor_si256( (in1).m256[0], \
+                       _mm256_xor_si256( (in2).m256[0], (in3).m256[0] ) ); \
+   X.m256[1] = _mm256_xor_si256( (in1).m256[1], \
+                       _mm256_xor_si256( (in2).m256[1], (in3).m256[1] ) );
+
+#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
+{ \
+   __m128i X0, X1, X2, X3; \
+   X.m256[0] = _mm256_xor_si256( X.m256[0], \
+                       _mm256_xor_si256( (in1).m256[0], (in2).m256[0] ) ); \
+   X.m256[1] = _mm256_xor_si256( X.m256[1], \
+                       _mm256_xor_si256( (in1).m256[1], (in2).m256[1] ) ); \
+   X0 = X.m128[0]; \
+   X1 = X.m128[1]; \
+   X2 = X.m128[2]; \
+   X3 = X.m128[3]; \
+   SALSA20( out ); \
+   X.m128[0] = X0; \
+   X.m128[1] = X1; \
+   X.m128[2] = X2; \
+   X.m128[3] = X3; \
+}  
+
+#endif // AVX512VL else
+
+#define SALSA20_XOR_MEM( in, out ) \
+{ \
+   __m128i X0, X1, X2, X3; \
+   X.m256[0] = _mm256_xor_si256( X.m256[0], (in).m256[0] ); \
+   X.m256[1] = _mm256_xor_si256( X.m256[1], (in).m256[1] ); \
+   X0 = X.m128[0]; \
+   X1 = X.m128[1]; \
+   X2 = X.m128[2]; \
+   X3 = X.m128[3]; \
+   SALSA20( out ) \
+   X.m128[0] = X0; \
+   X.m128[1] = X1; \
+   X.m128[2] = X2; \
+   X.m128[3] = X3; \
+}
+
+#else   // SSE2
+
+#define READ_X(in) \
+   X.m128[0] = (in).m128[0]; \
+   X.m128[1] = (in).m128[1]; \
+   X.m128[2] = (in).m128[2]; \
+   X.m128[3] = (in).m128[3];
+
+#define XOR_X_2_XOR_X( in1, in2, in3 ) \
+   X.m128[0] = _mm_xor_si128( (in1).m128[0], \
+                     _mm_xor_si128( (in2).m128[0], (in3).m128[0] ) ); \
+   X.m128[1] = _mm_xor_si128( (in1).m128[1], \
+                     _mm_xor_si128( (in2).m128[1], (in3).m128[1] ) ); \
+   X.m128[2] = _mm_xor_si128( (in1).m128[2], \
+                     _mm_xor_si128( (in2).m128[2], (in3).m128[2] ) ); \
+   X.m128[3] = _mm_xor_si128( (in1).m128[3], \
+                     _mm_xor_si128( (in2).m128[3], (in3).m128[3] ) );
+
+
+#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
+{ \
+   __m128i X0 = _mm_xor_si128( X.m128[0], \
+                         _mm_xor_si128( (in1).m128[0], (in2).m128[0] ) ); \
+   __m128i X1 = _mm_xor_si128( X.m128[1], \
+                         _mm_xor_si128( (in1).m128[1], (in2).m128[1] ) ); \
+   __m128i X2 = _mm_xor_si128( X.m128[2], \
+                         _mm_xor_si128( (in1).m128[2], (in2).m128[2] ) ); \
+   __m128i X3 = _mm_xor_si128( X.m128[3], \
+                         _mm_xor_si128( (in1).m128[3], (in2).m128[3] ) ); \
+   SALSA20( out ); \
+   X.m128[0] = X0; \
+   X.m128[1] = X1; \
+   X.m128[2] = X2; \
+   X.m128[3] = X3; \
+}   
+     
+// Apply the Salsa20 core to the block provided in X ^ in.
+#define SALSA20_XOR_MEM(in, out) \
+{ \
+   __m128i X0 = _mm_xor_si128( X.m128[0], (in).m128[0] ); \
+   __m128i X1 = _mm_xor_si128( X.m128[1], (in).m128[1] ); \
+   __m128i X2 = _mm_xor_si128( X.m128[2], (in).m128[2] ); \
+   __m128i X3 = _mm_xor_si128( X.m128[3], (in).m128[3] ); \
+   SALSA20( out ) \
+   X.m128[0] = X0; \
+   X.m128[1] = X1; \
+   X.m128[2] = X2; \
+   X.m128[3] = X3; \
+} 
+
+#endif   // AVX512 elif AVX2 else

 #define SALSA20 SALSA20_8
 #else /* pass 2 */
@@ -425,7 +566,7 @@ static inline void salsa20(salsa20_blk_t *restrict B,
 #define SALSA20 SALSA20_2
 #endif

-/**
+/*
 * blockmix_salsa(Bin, Bout):
 * Compute Bout = BlockMix_{salsa20, 1}(Bin).  The input Bin must be 128
 * bytes in length; the output Bout must also be the same size.
@@ -433,29 +574,23 @@ static inline void salsa20(salsa20_blk_t *restrict B,
 static inline void blockmix_salsa(const salsa20_blk_t *restrict Bin,
    salsa20_blk_t *restrict Bout)
 {
-	DECL_X
+   salsa20_blk_t X;

-	READ_X(Bin[1])
-	SALSA20_XOR_MEM(Bin[0], Bout[0])
-	SALSA20_XOR_MEM(Bin[1], Bout[1])
+   READ_X( Bin[1] );
+   SALSA20_XOR_MEM(Bin[0], Bout[0]);
+	SALSA20_XOR_MEM(Bin[1], Bout[1]);
 }

 static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1,
    const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout)
 {
-	DECL_X
+   salsa20_blk_t X;

-   XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] )   
-//	XOR_X_2(Bin1[1], Bin2[1])
-//	XOR_X(Bin1[0])
-	SALSA20_XOR_MEM(Bin2[0], Bout[0])
+   XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] );   
+	SALSA20_XOR_MEM( Bin2[0], Bout[0] );
+   XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] );

-// Factor out the XOR from salsa20 to do a xor3
-   XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] )
-//   XOR_X(Bin1[1])
-//	SALSA20_XOR_MEM(Bin2[1], Bout[1])
-
-	return INTEGERIFY;
+   return X.d[0];
 }

 #if _YESPOWER_OPT_C_PASS_ == 1
@@ -490,7 +625,6 @@ typedef struct {
 #define DECL_SMASK2REG /* empty */
 #define MAYBE_MEMORY_BARRIER /* empty */

-#ifdef __SSE2__
 /*
 * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs
 * starting with Sandy Bridge.  Additionally, PSHUFD uses separate source and
@@ -513,28 +647,40 @@ typedef struct {

 #if defined(__x86_64__) && \
    __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC)
+
 #ifdef __AVX__
+
 #define MOVQ "vmovq"
+
 #else
 /* "movq" would be more correct, but "movd" is supported by older binutils
 * due to an error in AMD's spec for x86-64. */
+
 #define MOVQ "movd"
+
 #endif
+
 #define EXTRACT64(X) ({ \
 	uint64_t result; \
 	__asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \
 	result; \
 })
+
 #elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__)
 /* MSVC and Open64 had bugs */
+
 #define EXTRACT64(X) _mm_cvtsi128_si64(X)
+
 #elif defined(__x86_64__) && defined(__SSE4_1__)
 /* No known bugs for this intrinsic */
+
 #include <smmintrin.h>
 #define EXTRACT64(X) _mm_extract_epi64((X), 0)
+
 #elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
 /* 32-bit */
 #include <smmintrin.h>
+
 #if 0
 /* This is currently unused by the code below, which instead uses these two
 * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */
@@ -542,18 +688,24 @@ typedef struct {
 	((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
 	((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32))
 #endif
+
 #else
 /* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */
+
 #define EXTRACT64(X) \
 	((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
 	((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32))
+
 #endif

 #if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__))
 /* 64-bit with AVX */
 /* Force use of 64-bit AND instead of two 32-bit ANDs */
+
 #undef DECL_SMASK2REG
+
 #if defined(__GNUC__) && !defined(__ICC)
+
 #define DECL_SMASK2REG uint64_t Smask2reg = Smask2;
 /* Force use of lower-numbered registers to reduce number of prefixes, relying
 * on out-of-order execution and register renaming. */
@@ -561,12 +713,16 @@ typedef struct {
 	__asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1));
 #define FORCE_REGALLOC_2 \
 	__asm__("" : : "c" (lo));
-#else
+
+#else   // not GNUC
+
 static volatile uint64_t Smask2var = Smask2;
 #define DECL_SMASK2REG uint64_t Smask2reg = Smask2var;
 #define FORCE_REGALLOC_1 /* empty */
 #define FORCE_REGALLOC_2 /* empty */
+
 #endif
+
 #define PWXFORM_SIMD(X) { \
 	uint64_t x; \
 	FORCE_REGALLOC_1 \
@@ -577,14 +733,18 @@ static volatile uint64_t Smask2var = Smask2;
 	X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \
 	X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \
 }
+
 #elif defined(__x86_64__)
 /* 64-bit without AVX.  This relies on out-of-order execution and register
 * renaming.  It may actually be fastest on CPUs with AVX(2) as well - e.g.,
 * it runs great on Haswell. */
 //#warning "Note: using x86-64 inline assembly for pwxform.  That's great."
+
 #undef MAYBE_MEMORY_BARRIER
+
 #define MAYBE_MEMORY_BARRIER \
 	__asm__("" : : : "memory");
+
 #define PWXFORM_SIMD(X) { \
 	__m128i H; \
 	__asm__( \
@@ -600,8 +760,10 @@ static volatile uint64_t Smask2var = Smask2;
 	    : "d" (Smask2), "S" (S0), "D" (S1) \
 	    : "cc", "ax", "cx"); \
 }
+
 #elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
 /* 32-bit with SSE4.1 */
+
 #define PWXFORM_SIMD(X) { \
 	__m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \
 	__m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \
@@ -610,8 +772,10 @@ static volatile uint64_t Smask2var = Smask2;
 	X = _mm_add_epi64(X, s0); \
 	X = _mm_xor_si128(X, s1); \
 }
+
 #else
 /* 32-bit without SSE4.1 */
+
 #define PWXFORM_SIMD(X) { \
 	uint64_t x = EXTRACT64(X) & Smask2; \
 	__m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \
@@ -620,6 +784,7 @@ static volatile uint64_t Smask2var = Smask2;
 	X = _mm_add_epi64(X, s0); \
 	X = _mm_xor_si128(X, s1); \
 }
+
 #endif

 #define PWXFORM_SIMD_WRITE(X, Sw) \
@@ -649,50 +814,13 @@ static volatile uint64_t Smask2var = Smask2;
 	PWXFORM_SIMD(X2) \
 	PWXFORM_SIMD(X3)

-#else /* !defined(__SSE2__) */
-
-#define PWXFORM_SIMD(x0, x1) { \
-	uint64_t x = x0 & Smask2; \
-	uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \
-	uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \
-	x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \
-	x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \
-}
-
-#define PWXFORM_SIMD_WRITE(x0, x1, Sw) \
-	PWXFORM_SIMD(x0, x1) \
-	((uint64_t *)(Sw + w))[0] = x0; \
-	((uint64_t *)(Sw + w))[1] = x1;
-
-#define PWXFORM_ROUND \
-	PWXFORM_SIMD(X.d[0], X.d[1]) \
-	PWXFORM_SIMD(X.d[2], X.d[3]) \
-	PWXFORM_SIMD(X.d[4], X.d[5]) \
-	PWXFORM_SIMD(X.d[6], X.d[7])
-
-#define PWXFORM_ROUND_WRITE4 \
-	PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \
-	PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \
-	w += 16; \
-	PWXFORM_SIMD_WRITE(X.d[4], X.d[5], S0) \
-	PWXFORM_SIMD_WRITE(X.d[6], X.d[7], S1) \
-	w += 16;
-
-#define PWXFORM_ROUND_WRITE2 \
-	PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \
-	PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \
-	w += 16; \
-	PWXFORM_SIMD(X.d[4], X.d[5]) \
-	PWXFORM_SIMD(X.d[6], X.d[7])
-#endif
-
 #define PWXFORM \
 	PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND \
 	PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND

 #define Smask2 Smask2_0_5

-#else /* pass 2 */
+#else // pass 2

 #undef PWXFORM
 #define PWXFORM \
@@ -718,23 +846,27 @@ static volatile uint64_t Smask2var = Smask2;
 static void blockmix(const salsa20_blk_t *restrict Bin,
    salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx)
 {
-	if (unlikely(!ctx)) {
+	if ( unlikely(!ctx) )
+   {
 		blockmix_salsa(Bin, Bout);
 		return;
 	}

+   __m128i X0, X1, X2, X3;
 	uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
 #if _YESPOWER_OPT_C_PASS_ > 1
 	uint8_t *S2 = ctx->S2;
 	size_t w = ctx->w;
 #endif
 	size_t i;
-	DECL_X

 	/* Convert count of 128-byte blocks to max index of 64-byte block */
 	r = r * 2 - 1;

-	READ_X(Bin[r])
+   X0 = Bin[r].m128[0];
+   X1 = Bin[r].m128[1];
+   X2 = Bin[r].m128[2];
+   X3 = Bin[r].m128[3];

 	DECL_SMASK2REG

@@ -763,13 +895,13 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
 	if (unlikely(!ctx))
 		return blockmix_salsa_xor(Bin1, Bin2, Bout);

+   __m128i X0, X1, X2, X3;
 	uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
 #if _YESPOWER_OPT_C_PASS_ > 1
 	uint8_t *S2 = ctx->S2;
 	size_t w = ctx->w;
 #endif
 	size_t i;
-	DECL_X

 	/* Convert count of 128-byte blocks to max index of 64-byte block */
 	r = r * 2 - 1;
@@ -781,7 +913,10 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
 	}
 #endif

-	XOR_X_2(Bin1[r], Bin2[r])
+   X0 = _mm_xor_si128( Bin1[r].m128[0], Bin2[r].m128[0] );
+   X1 = _mm_xor_si128( Bin1[r].m128[1], Bin2[r].m128[1] );
+   X2 = _mm_xor_si128( Bin1[r].m128[2], Bin2[r].m128[2] );
+   X3 = _mm_xor_si128( Bin1[r].m128[3], Bin2[r].m128[3] );

 	DECL_SMASK2REG

@@ -789,21 +924,13 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
 	r--;
 	do {
      XOR_X_XOR_X( Bin1[i], Bin2[i] )
-//      XOR_X(Bin1[i])
-//      XOR_X(Bin2[i])
 		PWXFORM
 		WRITE_X(Bout[i])
-
      XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] )     
-//		XOR_X(Bin1[i + 1])
-//		XOR_X(Bin2[i + 1])
 		PWXFORM
-
 		if (unlikely(i >= r))
 			break;
-
 		WRITE_X(Bout[i + 1])
-
 		i += 2;
 	} while (1);
 	i++;
@@ -815,21 +942,20 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,

 	SALSA20(Bout[i])

-	return INTEGERIFY;
+	return INTEGERIFY( X0 );
 }

-static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
-    salsa20_blk_t *restrict Bin2,
-    size_t r, pwxform_ctx_t *restrict ctx)
+static uint32_t blockmix_xor_save( salsa20_blk_t *restrict Bin1out,
+        salsa20_blk_t *restrict Bin2,  size_t r, pwxform_ctx_t *restrict ctx )
 {
+   __m128i X0, X1, X2, X3;
+   __m128i Y0, Y1, Y2, Y3;
 	uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
 #if _YESPOWER_OPT_C_PASS_ > 1
 	uint8_t *S2 = ctx->S2;
 	size_t w = ctx->w;
 #endif
 	size_t i;
-	DECL_X
-	DECL_Y

 	/* Convert count of 128-byte blocks to max index of 64-byte block */
 	r = r * 2 - 1;
@@ -841,7 +967,10 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
 	}
 #endif

-	XOR_X_2(Bin1out[r], Bin2[r])
+   X0 = _mm_xor_si128( Bin1out[r].m128[0], Bin2[r].m128[0] );
+   X1 = _mm_xor_si128( Bin1out[r].m128[1], Bin2[r].m128[1] );
+   X2 = _mm_xor_si128( Bin1out[r].m128[2], Bin2[r].m128[2] );
+   X3 = _mm_xor_si128( Bin1out[r].m128[3], Bin2[r].m128[3] );

 	DECL_SMASK2REG

@@ -851,15 +980,11 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
 		XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i])
 		PWXFORM
 		WRITE_X(Bin1out[i])
-
 		XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1])
 		PWXFORM
-
-		if (unlikely(i >= r))
-			break;
-
+		if ( unlikely(i >= r) )
+         break;
 		WRITE_X(Bin1out[i + 1])
-
 		i += 2;
 	} while (1);
 	i++;
@@ -871,7 +996,7 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,

 	SALSA20(Bin1out[i])

-	return INTEGERIFY;
+	return INTEGERIFY( X0 );
 }

 #if _YESPOWER_OPT_C_PASS_ == 1
@@ -886,7 +1011,7 @@ static inline uint32_t integerify(const salsa20_blk_t *B, size_t r)
 * w[0] here (would be wrong on big-endian).  Also, our 32-bit words are
 * SIMD-shuffled, but we only care about the least significant 32 bits anyway.
 */
-	return (uint32_t)B[2 * r - 1].d[0];
+	return (uint32_t)B[2 * r - 1].q[0];
 }
 #endif

@@ -915,7 +1040,7 @@ static void smix1(uint8_t *B, size_t r, uint32_t N,
 		salsa20_blk_t *dst = &X[i];
 		size_t k;
 		for (k = 0; k < 16; k++)
-         tmp->w[k] = src->w[k];
+         tmp->d[k] = src->d[k];
 		salsa20_simd_shuffle(tmp, dst);
 	}

@@ -962,7 +1087,7 @@ static void smix1(uint8_t *B, size_t r, uint32_t N,
 		salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
 		size_t k;
 		for (k = 0; k < 16; k++)
-         tmp->w[k] = src->w[k];
+         tmp->d[k] = src->d[k];
 		salsa20_simd_unshuffle(tmp, dst);
 	}
 }
@@ -988,7 +1113,7 @@ static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop,
 		salsa20_blk_t *dst = &X[i];
 		size_t k;
 		for (k = 0; k < 16; k++)
-			tmp->w[k] = src->w[k];
+			tmp->d[k] = src->d[k];
 		salsa20_simd_shuffle(tmp, dst);
 	}

@@ -1020,7 +1145,7 @@ static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop,
 		salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
 		size_t k;
 		for (k = 0; k < 16; k++)
-			tmp->w[k]  = src->w[k];
+			tmp->d[k]  = src->d[k];
 		salsa20_simd_unshuffle(tmp, dst);
 	}
 }
--- a/api.c
+++ b/api.c
@@ -336,7 +336,7 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
 	char inpkey[128] = { 0 };
 	char seckey[64];
 	uchar sha1[20];
-	SHA_CTX ctx;
+//	SHA_CTX ctx;

 	if (opt_protocol)
 		applog(LOG_DEBUG, "clientkey: %s", clientkey);
@@ -346,9 +346,11 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
 	// SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo="
 	//sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11");

-	SHA1_Init(&ctx);
-	SHA1_Update(&ctx, inpkey, strlen(inpkey));
-	SHA1_Final(sha1, &ctx);
+   SHA1( inpkey, strlen(inpkey), sha1 );
+// Deprecated in openssl-3
+// SHA1_Init(&ctx);
+//	SHA1_Update(&ctx, inpkey, strlen(inpkey));
+//	SHA1_Final(sha1, &ctx);

 	base64_encode(sha1, 20, seckey, sizeof(seckey));

--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -4,7 +4,7 @@
 # during develpment. However the information contained may provide compilation
 # tips to users.

-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 > /dev/null
+rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake > /dev/null

 # AVX512 SHA VAES: Intel Core Icelake, Rocketlake
 make distclean || echo clean
@@ -17,13 +17,22 @@ make -j 8
 strip -s cpuminer
 mv cpuminer cpuminer-avx512-sha-vaes

+# AVX256 SHA VAES: Intel Core Alderlake, needs gcc-12
+#make clean || echo clean
+#rm -f config.status
+#./autogen.sh || echo done
+#CFLAGS="-O3 -march=alderlake -Wall -fno-common" ./configure --with-curl
+#make -j 8
+#strip -s cpuminer
+#mv cpuminer cpuminer-alderlake
+
 # Zen4 AVX512 SHA VAES
 make clean || echo clean
 rm -f config.status
 # znver3 needs gcc-11, znver4 ?
 #CFLAGS="-O3 -march=znver4 -Wall -fno-common " ./configure --with-curl
-#CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
-CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
+CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
+#CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
 make -j 8
 strip -s cpuminer
 mv cpuminer cpuminer-zen4
@@ -31,8 +40,8 @@ mv cpuminer cpuminer-zen4
 # Zen3 AVX2 SHA VAES
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
-#CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
+#CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
+CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
 make -j 8
 strip -s cpuminer
 mv cpuminer cpuminer-zen3
@@ -80,7 +89,7 @@ make -j 8
 strip -s cpuminer
 mv cpuminer cpuminer-avx

-# SSE4.2 AES: Intel Westmere
+# SSE4.2 AES: Intel Westmere, most Pentium & Celeron
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl
--- a/4343
+++ b/4343
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.21.0])
+AC_INIT([cpuminer-opt], [3.22.0])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -37,6 +37,7 @@
 #include <curl/curl.h>
 #include <jansson.h>
 #include <openssl/sha.h>
+//#include <mm_malloc.h>
 #include "sysinfos.c"
 #include "algo/sha/sha256d.h"

@@ -317,8 +318,9 @@ static void affine_to_cpu( struct thr_info *thr )
   if ( !ok )
   {
      last_error = GetLastError();
-      applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x",
-                           thread, last_error );
+      if ( !thread )
+      applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
+                           last_error, thread );
   }
 }   

@@ -430,20 +432,18 @@ static bool work_decode( const json_t *val, struct work *work )
    if ( unlikely( !algo_gate.work_decode( work ) ) )
        return false;

-    if ( !allow_mininginfo )
-        net_diff = algo_gate.calc_network_diff( work );
-    else
-        net_diff = hash_to_diff( work->target );
-
-    work->targetdiff = net_diff;
-    stratum_diff = last_targetdiff = work->targetdiff;
+    // many of these aren't used solo.
+    net_diff =
+    work->targetdiff = 
+    stratum_diff =
+    last_targetdiff = hash_to_diff( work->target );
    work->sharediff = 0;
    algo_gate.decode_extra_data( work, &net_blocks );

    return true;
 }

-// good alternative for wallet mining, difficulty and net hashrate
+// Only used for net_hashrate with GBT/getwork, data is from previous block.
 static const char *info_req =
 "{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n";

@@ -469,17 +469,14 @@ static bool get_mininginfo( CURL *curl, struct work *work )
   // "networkhashps": 56475980
   if ( res )
   {
-      // net_diff is a global that is set from the work hash target by
-      // both getwork and GBT. Don't overwrite it, define a local to override
-      // the global.
-      double net_diff = 0.;
+      double difficulty = 0.;
  		json_t *key = json_object_get( res, "difficulty" );
   	if ( key )
      {
 	   	if ( json_is_object( key ) )
 		   	key = json_object_get( key, "proof-of-work" );
 		   if ( json_is_real( key ) )
-			   net_diff = json_real_value( key );
+			   difficulty = json_real_value( key );
 	   }

      key = json_object_get( res, "networkhashps" );
@@ -496,12 +493,13 @@ static bool get_mininginfo( CURL *curl, struct work *work )
 		  	net_blocks = json_integer_value( key );

      if ( opt_debug )
-         applog(LOG_INFO,"Mining info: diff %.5g, net_hashrate %f, height %d",
-                              net_diff, net_hashrate, net_blocks );
-      
+         applog( LOG_INFO,"getmininginfo: difficulty %.5g, networkhashps %.5g, blocks %d", difficulty, net_hashrate, net_blocks );
+
      if ( !work->height )
      {
 	      // complete missing data from getwork
+         if ( opt_debug )
+            applog( LOG_DEBUG, "work height set by getmininginfo" );
 	      work->height = (uint32_t) net_blocks + 1;
 	      if ( work->height > g_work.height )
            restart_threads();
@@ -533,9 +531,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
   json_t *tmp, *txa;
   bool rc = false;
   int i, n;
-
-// Segwit BEGIN
   bool segwit = false;
+
   tmp = json_object_get( val, "rules" );
   if ( tmp && json_is_array( tmp ) )
   {
@@ -553,8 +550,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
         }
      }
   }
-// Segwit END
-   
+
   tmp = json_object_get( val, "mutable" );
   if ( tmp && json_is_array( tmp ) )
   {
@@ -636,7 +632,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
         goto out;
      }
   }
-   
+
   /* find count and size of transactions */
   txa = json_object_get(val, "transactions" );
   if ( !txa || !json_is_array( txa ) )
@@ -711,12 +707,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      cbtx[41] = cbtx_size - 42; /* scriptsig length */
      le32enc( (uint32_t *)( cbtx+cbtx_size ), 0xffffffff ); /* sequence */
      cbtx_size += 4;
-
-// Segwit BEGIN
-      //cbtx[cbtx_size++] = 1; /* out-counter */
-        cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
-// Segwit END
-
+      cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
      le32enc( (uint32_t *)( cbtx+cbtx_size) , (uint32_t)cbvalue ); /* value */
      le32enc( (uint32_t *)( cbtx+cbtx_size+4 ), cbvalue >> 32 );
      cbtx_size += 8;
@@ -724,7 +715,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      memcpy( cbtx+cbtx_size, pk_script, pk_script_size );
      cbtx_size += (int) pk_script_size;

-// Segwit BEGIN
       if ( segwit )
       {
          unsigned char (*wtree)[32] = calloc(tx_count + 2, 32);
@@ -759,12 +749,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
            for ( i = 0; i < n; i++ )
               sha256d( wtree[i], wtree[2*i], 64 );
         }
-         memset( wtree[1], 0, 32 );  /* witness reserved value = 0 */
+         memset( wtree[1], 0, 32 );  // witness reserved value = 0
         sha256d( cbtx+cbtx_size, wtree[0], 64 );
         cbtx_size += 32;
         free( wtree );
      }
-// Segwit END

      le32enc( (uint32_t *)( cbtx+cbtx_size ), 0 ); /* lock time */
      cbtx_size += 4;
@@ -783,10 +772,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
            xsig_len += n;
         }
         else
-         {
            applog( LOG_WARNING,
                        "Signature does not fit in coinbase, skipping" );
-         }
      }
      tmp = json_object_get( val, "coinbaseaux" );
      if ( tmp && json_is_object( tmp ) )
@@ -813,8 +800,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      if ( xsig_len )
      {
         unsigned char *ssig_end = cbtx + 42 + cbtx[41];
-         int push_len = cbtx[41] + xsig_len < 76 ? 1 :
-		               cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
+         int push_len = cbtx[41] + xsig_len < 76
+                        ? 1 : cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
         n = xsig_len + push_len;
         memmove( ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41] );
         cbtx[41] += n;
@@ -841,7 +828,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      const char *tx_hex = json_string_value( json_object_get( tmp, "data" ) );
      const int tx_size = tx_hex ? (int) ( strlen( tx_hex ) / 2 ) : 0;

-// Segwit BEGIN      
      if ( segwit )
      {
         const char *txid = json_string_value( json_object_get( tmp, "txid" ) );
@@ -854,8 +840,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      }
      else
      {
-// Segwit END
-
         unsigned char *tx = (uchar*) malloc( tx_size );
         if ( !tx_hex || !hex2bin( tx, tx_hex, tx_size ) )
         {
@@ -865,10 +849,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
         }
         sha256d( merkle_tree[1 + i], tx, tx_size );
         free( tx );
-
-// Segwit BEGIN      
      }
-// Segwit END

      if ( !submit_coinbase )
         strcat( work->txs, tx_hex );
@@ -898,10 +879,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
      goto out;
   }

-   for ( i = 0; i < 8; i++ )
-      work->target[7 - i] = be32dec( target + i );
+   // reverse the bytes in target
+   casti_m128i( work->target, 0 ) = mm128_bswap_128( casti_m128i( target, 1 ) );
+   casti_m128i( work->target, 1 ) = mm128_bswap_128( casti_m128i( target, 0 ) );
   net_diff = work->targetdiff = hash_to_diff( work->target );
-   
+
   tmp = json_object_get( val, "workid" );
   if ( tmp )
   {
@@ -1077,12 +1059,11 @@ void report_summary_log( bool force )
   timeval_subtract( &et, &now, &start_time );
   timeval_subtract( &uptime, &total_hashes_time, &session_start );
   
-   double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6;
+   double share_time = (double)et.tv_sec + (double)et.tv_usec * 1e-6;
   double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. );
   double target_diff = exp32 * last_targetdiff;
   double shrate = safe_div( target_diff * (double)(accepts),
                             share_time, 0. );
-//   global_hashrate = ghrate;
   double sess_hrate = safe_div( exp32 * norm_diff_sum,
                                 (double)uptime.tv_sec, 0. );
   double submit_rate = safe_div( (double)submits * 60., share_time, 0. );
@@ -1103,7 +1084,7 @@ void report_summary_log( bool force )
   applog2( LOG_NOTICE, "Periodic Report     %s        %s", et_str, upt_str );
   applog2( LOG_INFO, "Share rate        %.2f/min     %.2f/min",
            submit_rate, safe_div( (double)submitted_share_count*60.,
-              ( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
+              ( (double)uptime.tv_sec + (double)uptime.tv_usec * 1e-6 ), 0. ) );
   applog2( LOG_INFO, "Hash rate       %7.2f%sh/s   %7.2f%sh/s   (%.2f%sh/s)",
            shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );

@@ -1550,7 +1531,6 @@ const char *getwork_req =

 #define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]"

-// Segwit BEGIN
 #define GBT_RULES "[\"segwit\"]"
 static const char *gbt_req =
   "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
@@ -1559,16 +1539,6 @@ const char *gbt_lp_req =
   "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
   GBT_CAPABILITIES ", \"rules\": " GBT_RULES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";

-/*
-static const char *gbt_req =
-	"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
-	GBT_CAPABILITIES "}], \"id\":0}\r\n";
-const char *gbt_lp_req =
-	"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
-	GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
-*/
-// Segwit END
-
 static bool get_upstream_work( CURL *curl, struct work *work )
 {
   json_t *val;
@@ -1646,46 +1616,46 @@ start:
         applog( LOG_BLUE, "New Block %d, Net Diff %.5g, Ntime %08x",
                                work->height, net_diff,
                                work->data[ algo_gate.ntime_index ] );
-
-         if ( !opt_quiet )
-         {
-            double miner_hr = 0.;
-            double net_hr = net_hashrate;
-            double nd = net_diff * exp32;
-            char net_hr_units[4] = {0};
-            char miner_hr_units[4] = {0};
-            char net_ttf[32];
-            char miner_ttf[32];
-
-            pthread_mutex_lock( &stats_lock );
-
-            for ( int i = 0; i < opt_n_threads; i++ )
-               miner_hr += thr_hashrates[i];
-            global_hashrate = miner_hr;
-
-            pthread_mutex_unlock( &stats_lock );
-
-            if ( net_hr > 0. )
-               sprintf_et( net_ttf, nd / net_hr );
-            else
-               sprintf( net_ttf, "NA" );
-            if ( miner_hr > 0. )
-               sprintf_et( miner_ttf, nd / miner_hr );
-            else
-               sprintf( miner_ttf, "NA" );
-
-            scale_hash_for_display ( &miner_hr, miner_hr_units );
-            scale_hash_for_display ( &net_hr, net_hr_units );
-            applog2( LOG_INFO,
-                     "Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
-                     miner_hr, miner_hr_units, miner_ttf, net_hr,
-                     net_hr_units, net_ttf );
-         }
-      }  // work->height > last_block_height
+      }
      else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
         applog( LOG_BLUE, "New Work: Block %d, Net Diff %.5g, Ntime %08x",
-                                      work->height, net_diff,
+                                     work->height, net_diff,
                                      work->data[ algo_gate.ntime_index ] );
+       
+      if ( !opt_quiet )
+      {
+         double miner_hr = 0.;
+         double net_hr = net_hashrate;
+         double nd = net_diff * exp32;
+         char net_hr_units[4] = {0};
+         char miner_hr_units[4] = {0};
+         char net_ttf[32];
+         char miner_ttf[32];
+
+         pthread_mutex_lock( &stats_lock );
+
+         for ( int i = 0; i < opt_n_threads; i++ )
+             miner_hr += thr_hashrates[i];
+         global_hashrate = miner_hr;
+
+         pthread_mutex_unlock( &stats_lock );
+
+         if ( net_hr > 0. )
+            sprintf_et( net_ttf, nd / net_hr );
+         else
+            sprintf( net_ttf, "NA" );
+         if ( miner_hr > 0. )
+            sprintf_et( miner_ttf, nd / miner_hr );
+         else
+            sprintf( miner_ttf, "NA" );
+
+         scale_hash_for_display ( &miner_hr, miner_hr_units );
+         scale_hash_for_display ( &net_hr, net_hr_units );
+         applog2( LOG_INFO,
+                  "Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
+                  miner_hr, miner_hr_units, miner_ttf, net_hr,
+                  net_hr_units, net_ttf );
+      }
   }  // rc

   return rc;
@@ -1711,36 +1681,36 @@ static void workio_cmd_free(struct workio_cmd *wc)

 static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
 {
-   struct work *ret_work;
+   struct work *work_heap;
   int failures = 0;

-   ret_work = (struct work*) calloc( 1, sizeof(*ret_work) );
-   if ( !ret_work )
-	return false;
+   work_heap = calloc( 1, sizeof(struct work) );
+   if ( !work_heap )  return false;

   /* obtain new work from bitcoin via JSON-RPC */
-   while ( !get_upstream_work( curl, ret_work ) )
+   while ( !get_upstream_work( curl, work_heap ) )
   {
      if ( unlikely( ( opt_retries >= 0 ) && ( ++failures > opt_retries ) ) )
      {
         applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" );
-         free( ret_work );
-	      return false;
+         free( work_heap );
+         return false;
      }

      /* pause, then restart work-request loop */
-	   applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
-		        opt_fail_pause );
+      applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
+              opt_fail_pause );
      sleep( opt_fail_pause );
   }

   /* send work to requesting thread */
-   if ( !tq_push(wc->thr->q, ret_work ) )
-   	free( ret_work );
+   if ( !tq_push(wc->thr->q, work_heap ) )
+      free( work_heap );

   return true;
 }

+
 static bool workio_submit_work(struct workio_cmd *wc, CURL *curl)
 {
   int failures = 0;
@@ -1811,7 +1781,7 @@ static void *workio_thread(void *userdata)
 static bool get_work(struct thr_info *thr, struct work *work)
 {
 	struct workio_cmd *wc;
-	struct work *work_heap;
+   struct work *work_heap;

 	if unlikely( opt_benchmark )
   {
@@ -1836,17 +1806,16 @@ static bool get_work(struct thr_info *thr, struct work *work)
 	wc->thr = thr;
 	/* send work request to workio thread */
 	if (!tq_push(thr_info[work_thr_id].q, wc))
-        {
+   {
 		workio_cmd_free(wc);
 		return false;
 	}
 	/* wait for response, a unit of work */
 	work_heap = (struct work*) tq_pop(thr->q, NULL);
-	if (!work_heap)
-		return false;
-	/* copy returned work into storage provided by caller */
-	memcpy(work, work_heap, sizeof(*work));
-	free(work_heap);
+	if ( !work_heap ) return false;
+   /* copy returned work into storage provided by caller */
+	memcpy( work, work_heap, sizeof(*work) );
+	free( work_heap );
 	return true;
 }

@@ -1896,9 +1865,9 @@ static void update_submit_stats( struct work *work, const void *hash )
 bool submit_solution( struct work *work, const void *hash,
                      struct thr_info *thr )
 {
-   // Job went stale during hashing of a valid share.
-   if ( !opt_quiet && work_restart[ thr->id ].restart )
-      applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
+// Job went stale during hashing of a valid share.
+//   if ( !opt_quiet && work_restart[ thr->id ].restart )
+//      applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
   
   work->sharediff = hash_to_diff( hash );
   if ( likely( submit_work( thr, work ) ) )
@@ -1916,32 +1885,34 @@ bool submit_solution( struct work *work, const void *hash,
     if ( !opt_quiet )
     {
        if ( have_stratum )
+        {
           applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s",
                   submitted_share_count, work->sharediff, work->height,
                   work->job_id );
+           if ( opt_debug && opt_extranonce )
+           {
+              unsigned char *xnonce2str = abin2hex( work->xnonce2,
+                                                    work->xnonce2_len );
+              applog( LOG_INFO, "Xnonce2 %s", xnonce2str );
+              free( xnonce2str );
+           }
+        }
        else
           applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x",
                   submitted_share_count, work->sharediff, work->height,
                   work->data[ algo_gate.ntime_index ] );
-     }

-     if ( opt_debug )
-     {
-        uint32_t* h = (uint32_t*)hash;
-        uint32_t* t = (uint32_t*)work->target;
-        uint32_t* d = (uint32_t*)work->data;
+        if ( opt_debug )
+        {
+           uint32_t* h = (uint32_t*)hash;
+           uint32_t* t = (uint32_t*)work->target;
+           uint32_t* d = (uint32_t*)work->data;

-        unsigned char *xnonce2str = abin2hex( work->xnonce2,
-                                              work->xnonce2_len );
-        applog(LOG_INFO,"Thread %d, Nonce %08x, Xnonce2 %s", thr->id,
-                       work->data[ algo_gate.nonce_index ], xnonce2str );
-        free( xnonce2str );
-        applog(LOG_INFO,"Data[0:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
-        applog(LOG_INFO,"          : %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19]);
-        applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
-                                    h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]);
-        applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
-                                    t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0]);
+           applog( LOG_INFO, "Data[ 0: 9]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
+           applog( LOG_INFO, "Data[10:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19] );
+           applog( LOG_INFO, "Hash[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0] );
+           applog( LOG_INFO, "Targ[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0] );
+        }
     }
     return true;
   }
@@ -1959,15 +1930,15 @@ static bool wanna_mine(int thr_id)
 		float temp = cpu_temp(0);
 		if (temp > opt_max_temp)
      {
-			if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
-				applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp);
-			state = false;
+         if ( !thr_id && !conditional_state[thr_id] && !opt_quiet )
+           applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...", temp, opt_max_temp );
+         state = false;
 		}
 	}
 	if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
   {
 		if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
-			applog(LOG_INFO, "network diff too high, waiting...");
+			applog(LOG_NOTICE, "network diff too high, waiting...");
 		state = false;
 	}
 	if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate)
@@ -1976,12 +1947,14 @@ static bool wanna_mine(int thr_id)
      {
 			char rate[32];
 			format_hashrate(opt_max_rate, rate);
-			applog(LOG_INFO, "network hashrate too high, waiting %s...", rate);
+			applog(LOG_NOTICE, "network hashrate too high (%s), waiting...", rate);
 		}
 		state = false;
 	}
-	if (thr_id < MAX_CPUS)
-		conditional_state[thr_id] = (uint8_t) !state;
+  
+   if ( conditional_state[thr_id] && state && !thr_id && !opt_quiet )
+      applog(LOG_NOTICE, "...resuming" );
+	conditional_state[thr_id] = (uint8_t) !state;
 	return state;
 }

@@ -2015,33 +1988,6 @@ void set_work_data_big_endian( struct work *work )
        be32enc( work->data + i, work->data[i] );
 }

-// calculate net diff from nbits.
-double std_calc_network_diff( struct work* work )
-{
-   uint32_t nbits = work->data[ algo_gate.nbits_index ];
-   uint32_t shift = nbits & 0xff;
-   uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
-/*
-   // sample for diff 43.281 : 1c05ea29
-   // todo: endian reversed on longpoll could be zr5 specific...
-   int nbits_index = algo_gate.nbits_index;
-   uint32_t nbits = have_longpoll ? work->data[ nbits_index]
-                                  : swab32( work->data[ nbits_index ] );
-   uint32_t bits  = ( nbits & 0xffffff );
-   int16_t  shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
-*/
-
-   int m;
-   long double d = (long double)0x0000ffff / (long double)bits;
-   for ( m = shift; m < 29; m++ )
-       d *= 256.0;
-   for ( m = 29; m < shift; m++ )
-       d /= 256.0;
-   if ( opt_debug_diff )
-      applog(LOG_DEBUG, "net diff: %8f -> shift %u, bits %08x", (double)d, shift, bits);
-   return (double)d;
-}
-
 void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
                     uint32_t *end_nonce_ptr )
 {
@@ -2065,17 +2011,6 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
       ++(*nonceptr);
 }

-bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
-                           int thr_id )
-{
-   if ( have_stratum && !work->data[0] && !opt_benchmark )
-   {
-      sleep(1);
-      return false;
-   }
-   return true;
-}
-
 static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
 {
   bool new_job;
@@ -2092,7 +2027,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
   g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size );
   memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size );
   algo_gate.build_extraheader( g_work, sctx );
-   net_diff = algo_gate.calc_network_diff( g_work );
+   net_diff = nbits_to_diff( g_work->data[ algo_gate.nbits_index ] );
   algo_gate.set_work_data_endian( g_work );
   g_work->height = sctx->block_height;
   g_work->targetdiff = sctx->job.diff
@@ -2142,8 +2077,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
   if ( ( stratum_diff != sctx->job.diff )
   || ( last_block_height != sctx->block_height ) )
   {
-      static bool multipool = false;
-      if ( stratum.block_height < last_block_height ) multipool = true;
      if ( unlikely( !session_first_block ) )
         session_first_block = stratum.block_height;
      last_block_height = stratum.block_height;
@@ -2151,58 +2084,47 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
      last_targetdiff   = g_work->targetdiff;
      if ( lowest_share < last_targetdiff )
         lowest_share = 9e99;
+    }

-      if ( !opt_quiet )
-      {
-         applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
-                            net_diff, stratum_diff, g_work->targetdiff );
+    if ( !opt_quiet )
+    {
+       applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
+                          net_diff, stratum_diff, g_work->targetdiff );

-         if ( likely( hr > 0. ) )
-         {
-            double nd = net_diff * exp32;
-            char hr_units[4] = {0};
-            char block_ttf[32];
-            char share_ttf[32];
+       if ( likely( hr > 0. ) )
+       {
+          double nd = net_diff * exp32;
+          char hr_units[4] = {0};
+          char block_ttf[32];
+          char share_ttf[32];
+          static bool multipool = false;
+      
+          if ( stratum.block_height < last_block_height ) multipool = true;
+            
+          sprintf_et( block_ttf, nd / hr );
+          sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
+          scale_hash_for_display ( &hr, hr_units );
+          applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
+                             hr, hr_units, block_ttf, share_ttf );

-            sprintf_et( block_ttf, nd / hr );
-            sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
-            scale_hash_for_display ( &hr, hr_units );
-            applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
-                               hr, hr_units, block_ttf, share_ttf );
-
-            if ( !multipool && last_block_height > session_first_block )
-            {
-               struct timeval now, et;
-               gettimeofday( &now, NULL );
-               timeval_subtract( &et, &now, &session_start );
-               uint64_t net_ttf =
-                    ( last_block_height - session_first_block ) == 0 ? 0
-                    : et.tv_sec / ( last_block_height - session_first_block );
-               if ( net_diff > 0. && net_ttf )
-               {
-                  double net_hr = nd / net_ttf;
-                  char net_hr_units[4] = {0};
-                  scale_hash_for_display ( &net_hr, net_hr_units );
-                  applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
-                                     net_hr, net_hr_units );
-               }
-            }
-         }  // hr > 0
-      } // !quiet
-   }  // new diff/block
-
-/*   
-   if ( new_job && !( opt_quiet || stratum_errors ) )
-   {
-      int mismatch = submitted_share_count - ( accepted_share_count
-                                             + stale_share_count
-                                             + rejected_share_count );
-      if ( mismatch )
-         applog( LOG_INFO,
-                 CL_LBL "%d Submitted share pending, maybe stale" CL_N,
-                 submitted_share_count );
-   }
-*/
+          if ( !multipool && last_block_height > session_first_block )
+          {
+             struct timeval now, et;
+             gettimeofday( &now, NULL );
+             timeval_subtract( &et, &now, &session_start );
+             uint64_t net_ttf = safe_div( et.tv_sec,
+                                 last_block_height - session_first_block, 0 );
+             if ( net_diff > 0. && net_ttf )
+             {
+                double net_hr = safe_div( nd, net_ttf, 0. );
+                char net_hr_units[4] = {0};
+                scale_hash_for_display ( &net_hr, net_hr_units );
+                applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
+                                   net_hr, net_hr_units );
+             }
+          }
+       }  // hr > 0
+    } // !quiet
 }

 static void *miner_thread( void *userdata )
@@ -2340,9 +2262,14 @@ static void *miner_thread( void *userdata )
       } // do_this_thread
       algo_gate.resync_threads( thr_id, &work );

-       if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
+       // conditional mining
+       if ( unlikely( !wanna_mine( thr_id ) ) )
+       {
+          restart_threads();
+          sleep(5);
          continue;
-
+       }
+       
       // opt_scantime expressed in hashes
       max64 = opt_scantime * thr_hashrates[thr_id];

@@ -2489,14 +2416,6 @@ static void *miner_thread( void *userdata )
             }
          }
       }  // benchmark
-
-       // conditional mining
-       if ( unlikely( !wanna_mine( thr_id ) ) )
-       {
-          sleep(5);
-          continue;
-       }
-
   }  // miner_thread loop

 out:
@@ -3671,7 +3590,7 @@ int main(int argc, char *argv[])

 #if defined(WIN32)

-// Are Windows CPU Groups supported?
+// Get the number of cpus, display after parsing command line
 #if defined(WINDOWS_CPU_GROUPS_ENABLED)
 	num_cpus = 0;
 	num_cpugroups = GetActiveProcessorGroupCount();
@@ -3680,8 +3599,8 @@ int main(int argc, char *argv[])
 	   int cpus = GetActiveProcessorCount( i );
 	   num_cpus += cpus;

-	   if (opt_debug)
-         applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
+//	   if (opt_debug)
+//         applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
 	}

 #else
@@ -3698,7 +3617,7 @@ int main(int argc, char *argv[])
 	sysctl(req, 2, &num_cpus, &len, NULL, 0);
 #else
 	num_cpus = 1;
-#endif
+#endif 

   if ( num_cpus < 1 )
      num_cpus = 1;
@@ -3722,7 +3641,6 @@ int main(int argc, char *argv[])
   if ( opt_time_limit )
      time_limit_stop = (unsigned int)time(NULL) + opt_time_limit;

-
   // need to register to get algo optimizations for cpu capabilities
   // but that causes registration logs before cpu capabilities is output.
   // Would need to split register function into 2 parts. First part sets algo
@@ -3850,6 +3768,11 @@ int main(int argc, char *argv[])
 	}
 #endif

+#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
+      if ( !opt_quiet )
+         applog( LOG_INFO, "Found %d CPUs in %d groups", num_cpus, num_cpugroups );
+#endif
+   
   if ( opt_affinity && num_cpus > max_cpus )
   {
      applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
@@ -3861,7 +3784,7 @@ int main(int argc, char *argv[])
   {
      for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
      {
-         while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++;   
+         while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;   
         thread_affinity_map[ thr ] = cpu % num_cpus;
      }
      if ( !opt_quiet )
--- a/miner.h
+++ b/miner.h
@@ -91,6 +91,19 @@ enum {
   LOG_PINK  = 0x14 };
 #endif

+#define WORK_ALIGNMENT 64
+
+// When working with dynamically allocated memory to guarantee data alignment
+// for large vectors. Physical block size must be extended by alignment number
+// of bytes when allocated. free() should use the physical pointer returned by
+// malloc(), not the aligned pointer. All others shoujld use the logical,
+// aligned, pointer returned by this function. 
+static inline void *align_ptr( const void *ptr, const uint64_t alignment )
+{
+  const uint64_t mask = alignment - 1;
+  return (void*)( ( ((const uint64_t)ptr) + mask ) & (~mask) );
+}
+
 extern bool is_power_of_2( int n );

 static inline bool is_windows(void)
@@ -317,7 +330,7 @@ extern void cbin2hex(char *out, const char *in, size_t len);
 void   bin2hex( char *s, const unsigned char *p, size_t len );
 char  *abin2hex( const unsigned char *p, size_t len );
 char  *bebin2hex( const unsigned char *p, size_t len );
-bool   hex2bin( unsigned char *p, const char *hexstr, size_t len );
+bool   hex2bin( unsigned char *p, const char *hexstr, const size_t len );
 bool   jobj_binary( const json_t *obj, const char *key, void *buf,
                    size_t buflen );
 int    varint_encode( unsigned char *p, uint64_t n );
@@ -333,10 +346,7 @@ extern void memrev(unsigned char *p, size_t len);
 // number of hashes.
 //
 //     https://en.bitcoin.it/wiki/Difficulty
-//
 //     hash = diff * 2**32
-//
-// diff_to_hash = 2**32 = 0x100000000 = 4294967296 = exp32;

 #define EXP16 65536.
 #define EXP32 4294967296.
@@ -350,8 +360,9 @@ extern const long double exp160; // 2**160
 bool   fulltest( const uint32_t *hash, const uint32_t *target );
 bool   valid_hash( const void*, const void* );

-double hash_to_diff( const void* );
+extern double hash_to_diff( const void* );
 extern void diff_to_hash( uint32_t*, const double );
+extern double nbits_to_diff( uint32_t );

 double hash_target_ratio( uint32_t* hash, uint32_t* target );
 void   work_set_target_ratio( struct work* work, const void *hash );
@@ -405,7 +416,7 @@ struct work
 	unsigned char *xnonce2;
   bool sapling;
   bool stale;
-} __attribute__ ((aligned (64)));
+} __attribute__ ((aligned (WORK_ALIGNMENT)));

 struct stratum_job
 {
@@ -540,7 +551,6 @@ enum algos {
        ALGO_BMW,        
        ALGO_BMW512,
        ALGO_C11,         
-        ALGO_DECRED,
        ALGO_DEEP,
        ALGO_DMD_GR,
        ALGO_GROESTL,     
@@ -634,7 +644,6 @@ static const char* const algo_names[] = {
        "bmw",
        "bmw512",
        "c11",
-        "decred",
        "deep",
        "dmd-gr",
        "groestl",
@@ -795,7 +804,6 @@ Options:\n\
                          bmw           BMW 256\n\
                          bmw512        BMW 512\n\
                          c11           Chaincoin\n\
-                          decred        Blake256r14dcr\n\
                          deep          Deepcoin (DCN)\n\
                          dmd-gr        Diamond\n\
                          groestl       Groestl coin\n\
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -54,7 +54,7 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
 #else
  asm( "movq %1, %0\n\t" : "=x"(a) : "r"(n) );
 #endif
-  return  a;
+  return a;
 }

 static inline __m128i mm128_mov32_128( const uint32_t n )
@@ -65,7 +65,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
 #else  
  asm( "movd %1, %0\n\t" : "=x"(a) : "r"(n) );
 #endif
-  return  a;
+  return a;
 }

 // Inconstant naming, prefix should reflect return value:
@@ -79,7 +79,7 @@ static inline uint64_t u64_mov128_64( const __m128i a )
 #else  
  asm( "movq %1, %0\n\t" : "=r"(n) : "x"(a) );
 #endif
-  return  n;
+  return n;
 }

 static inline uint32_t u32_mov128_32( const __m128i a )
@@ -90,7 +90,7 @@ static inline uint32_t u32_mov128_32( const __m128i a )
 #else  
  asm( "movd %1, %0\n\t" : "=r"(n) : "x"(a) );
 #endif
-  return  n;
+  return n;
 }

 // Equivalent of set1, broadcast integer to all elements.
@@ -204,11 +204,12 @@ static inline __m128i mm128_not( const __m128i v )

 #endif

+/*
 // Unary negation of elements (-v)
 #define mm128_negate_64( v )    _mm_sub_epi64( m128_zero, v )
 #define mm128_negate_32( v )    _mm_sub_epi32( m128_zero, v )  
 #define mm128_negate_16( v )    _mm_sub_epi16( m128_zero, v )  
-
+*/

 // Add 4 values, fewer dependencies than sequential addition.
 #define mm128_add4_64( a, b, c, d ) \
@@ -264,20 +265,16 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #if defined(__AVX512VL__)

 // a ^ b ^ c
-#define mm128_xor3( a, b, c ) \
-   _mm_ternarylogic_epi64( a, b, c, 0x96 )
+#define mm128_xor3( a, b, c )    _mm_ternarylogic_epi64( a, b, c, 0x96 )

 // a ^ ( b & c )
-#define mm128_xorand( a, b, c ) \
-   _mm_ternarylogic_epi64( a, b, c, 0x78 )
+#define mm128_xorand( a, b, c )  _mm_ternarylogic_epi64( a, b, c, 0x78 )

 #else

-#define mm128_xor3( a, b, c ) \
-   _mm_xor_si128( a, _mm_xor_si128( b, c ) )
+#define mm128_xor3( a, b, c )    _mm_xor_si128( a, _mm_xor_si128( b, c ) )

-#define mm128_xorand( a, b, c ) \
-  _mm_xor_si128( a, _mm_and_si128( b, c ) )
+#define mm128_xorand( a, b, c )  _mm_xor_si128( a, _mm_and_si128( b, c ) )

 #endif

@@ -292,64 +289,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm_movmask_32( v ) \
   _mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) )

-
-// Diagonal blend
-
-// Blend 4 32 bit elements from 4 vectors
-
-#if defined (__AVX2__)
-
-#define mm128_diagonal_32( v3, v2, v1, v0 ) \
-  mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \
-                  _mm_blend_epi32( s1, s0, 0x1 ), 0x3 )
-
-#elif defined(__SSE4_1__)
-
-#define mm128_diagonal_32( v3, v2, v1, v0 ) \
-  mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \
-                  _mm_blend_epi16( s1, s0, 0x03 ), 0x0f )
-
-#endif
-
-/*
-//
-// Extended bit shift for concatenated packed elements from 2 vectors.
-// Shift right returns low half, shift left return high half.
-
-#if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
-
-#define mm128_shl2_64( v1, v2, c )    _mm_shldi_epi64( v1, v2, c ) 
-#define mm128_shr2_64( v1, v2, c )    _mm_shrdi_epi64( v1, v2, c ) 
-
-#define mm128_shl2_32( v1, v2, c )    _mm_shldi_epi32( v1, v2, c ) 
-#define mm128_shr2_32( v1, v2, c )    _mm_shrdi_epi32( v1, v2, c ) 
-
-#define mm128_shl2_16( v1, v2, c )    _mm_shldi_epi16( v1, v2, c )
-#define mm128_shr2_16( v1, v2, c )    _mm_shrdi_epi16( v1, v2, c )
-
-#else
-
-#define mm128_shl2_64( v1, v2, c ) \
-   _mm_or_si128( _mm_slli_epi64( v1, c ), _mm_srli_epi64( v2, 64 - (c) ) )
-
-#define mm128_shr2_64( v1, v2, c ) \
-   _mm_or_si128( _mm_srli_epi64( v2, c ), _mm_slli_epi64( v1, 64 - (c) ) )
-
-#define mm128_shl2_32( v1, v2, c ) \
-   _mm_or_si128( _mm_slli_epi32( v1, c ), _mm_srli_epi32( v2, 32 - (c) ) )
-
-#define mm128_shr2_32( v1, v2, c ) \
-   _mm_or_si128( _mm_srli_epi32( v2, c ), _mm_slli_epi32( v1, 32 - (c) ) )
-
-#define mm128_shl2_16( v1, v2, c ) \
-   _mm_or_si128( _mm_slli_epi16( v1, c ), _mm_srli_epi16( v2, 16 - (c) ) )
-
-#define mm128_shr2_16( v1, v2, c ) \
-   _mm_or_si128( _mm_srli_epi16( v2, c ), _mm_slli_epi16( v1, 16 - (c) ) )
-
-#endif
-*/
-
 //
 // Bit rotations

@@ -522,6 +461,10 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )

 #if defined(__SSSE3__)

+#define mm128_bswap_128( v ) \
+   _mm_shuffle_epi8( v, m128_const_64( 0x0001020304050607, \
+                                       0x08090a0b0c0d0e0f ) )
+
 #define mm128_bswap_64( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
                                       0x0001020304050607 ) )
@@ -583,6 +526,9 @@ static inline __m128i mm128_bswap_16( __m128i v )
  return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
 }

+#define mm128_bswap_128( v ) \
+   mm128_swap_64( mm128_bswap_64( v ) )
+
 static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
 {
   d[0] = mm128_bswap_64( s[0] );
@@ -623,61 +569,18 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )

 #if defined(__SSSE3__)

-#define mm128_alignr_64( v1, v2 )    _mm_alignr_epi8( v1, v2, 8 )
-#define mm128_alignr_32( v1, v2 )    _mm_alignr_epi8( v1, v2, 4 )
+#define mm128_alignr_64( hi, lo, c )    _mm_alignr_epi8( hi, lo, (c)*8 )
+#define mm128_alignr_32( hi, lo, c )    _mm_alignr_epi8( hi, lo, (c)*4 )

 #else

-#define mm128_alignr_64( v1, v2 )    _mm_or_si128( _mm_slli_si128( v1, 8 ), \
-                                                   _mm_srli_si128( v2, 8 ) )
+#define mm128_alignr_64( hi, lo, c ) \
+   _mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )

-#define mm128_alignr_32( v1, v2 )    _mm_or_si128( _mm_slli_si128( v1, 4 ), \
-                                                   _mm_srli_si128( v2, 4 ) )
+#define mm128_alignr_32( hi, lo, c ) \
+   _mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )

 #endif

-// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
-// vrol & vror are deprecated and do not exist for larger vectors.
-// Their only use is by lyra2 blake2b when AVX2 is not available and is
-// grandfathered.
-
-#if defined(__SSSE3__)
-
-#define mm128_vror256_64( v1, v2 ) \
-do { \
-   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
-           v1 = _mm_alignr_epi8( v2, v1, 8 ); \
-           v2 = t; \
-} while(0)
-
-#define mm128_vrol256_64( v1, v2 ) \
-do { \
-   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
-           v2 = _mm_alignr_epi8( v2, v1, 8 ); \
-           v1 = t; \
-} while(0)
-
-#else  // SSE2
-
-#define mm128_vror256_64( v1, v2 ) \
-do { \
-   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
-                              _mm_slli_si128( v2, 8 ) ); \
-           v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
-                              _mm_slli_si128( v1, 8 ) ); \
-           v1 = t; \
-} while(0)
-
-#define mm128_vrol256_64( v1, v2 ) \
-do { \
-   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
-                              _mm_srli_si128( v2, 8 ) ); \
-           v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
-                              _mm_srli_si128( v1, 8 ) ); \
-           v1 = t; \
-} while(0)
-
-#endif  // SSE4.1 else SSE2
-
 #endif // __SSE2__
 #endif // SIMD_128_H__
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -15,6 +15,8 @@
 //
 // "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
 // lanes and data can't cross the 128 bit lane boundary.  
+// Full width byte shuffle is available with AVX512VL using the mask version
+// with a full mask (-1). 
 // Instructions that can move data across 128 bit lane boundary incur a
 // performance penalty over those that can't.
 // Some usage of index vectors may be encoded as if full vector shuffles are
@@ -65,10 +67,6 @@ typedef union
 #define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) )
 #define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )

-// deprecated
-//#define mm256_mov256_64 u64_mov256_64 
-//#define mm256_mov256_32 u32_mov256_32
-
 // concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
 #define mm256_concat_128( hi, lo ) \
   _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
@@ -151,10 +149,12 @@ static inline __m256i mm256_not( const __m256i v )

 #endif

+/*
 // Unary negation of each element ( -v )
 #define mm256_negate_64( v ) _mm256_sub_epi64( m256_zero, v )
 #define mm256_negate_32( v ) _mm256_sub_epi32( m256_zero, v )
 #define mm256_negate_16( v ) _mm256_sub_epi16( m256_zero, v )
+*/


 // Add 4 values, fewer dependencies than sequential addition.
@@ -176,44 +176,34 @@ static inline __m256i mm256_not( const __m256i v )
 // AVX512 has ternary logic that supports any 3 input boolean expression.

 // a ^ b ^ c
-#define mm256_xor3( a, b, c ) \
-   _mm256_ternarylogic_epi64( a, b, c, 0x96 )
+#define mm256_xor3( a, b, c )      _mm256_ternarylogic_epi64( a, b, c, 0x96 )

 // legacy convenience only
-#define mm256_xor4( a, b, c, d ) \
-   _mm256_xor_si256( a, mm256_xor3( b, c, d ) )
+#define mm256_xor4( a, b, c, d )   _mm256_xor_si256( a, mm256_xor3( b, c, d ) )

 // a & b & c
-#define mm256_and3( a, b, c ) \
-   _mm256_ternarylogic_epi64( a, b, c, 0x80 )
+#define mm256_and3( a, b, c )      _mm256_ternarylogic_epi64( a, b, c, 0x80 )

 // a | b | c
-#define mm256_or3( a, b, c ) \
-   _mm256_ternarylogic_epi64( a, b, c, 0xfe )
+#define mm256_or3( a, b, c )       _mm256_ternarylogic_epi64( a, b, c, 0xfe )

 // a ^ ( b & c )
-#define mm256_xorand( a, b, c ) \
-   _mm256_ternarylogic_epi64( a, b, c, 0x78 )
+#define mm256_xorand( a, b, c )    _mm256_ternarylogic_epi64( a, b, c, 0x78 )

 // a & ( b ^ c )
-#define mm256_andxor( a, b, c ) \
-   _mm256_ternarylogic_epi64( a, b, c, 0x60 )
+#define mm256_andxor( a, b, c )    _mm256_ternarylogic_epi64( a, b, c, 0x60 )

 // a ^ ( b | c )
-#define mm256_xoror( a, b, c ) \
-   _mm256_ternarylogic_epi64( a, b, c, 0x1e )
+#define mm256_xoror( a, b, c )     _mm256_ternarylogic_epi64( a, b, c, 0x1e )

 // a ^ ( ~b & c )   
-#define mm256_xorandnot( a, b, c ) \
-  _mm256_ternarylogic_epi64( a, b, c, 0xd2 )
+#define mm256_xorandnot( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xd2 )

 // a | ( b & c )
-#define mm256_orand( a, b, c ) \
-   _mm256_ternarylogic_epi64( a, b, c, 0xf8  )
+#define mm256_orand( a, b, c )     _mm256_ternarylogic_epi64( a, b, c, 0xf8 )

 // ~( a ^ b ), same as (~a) ^ b
-#define mm256_xnor( a, b ) \
-   _mm256_ternarylogic_epi64( a, b, b, 0x81  )
+#define mm256_xnor( a, b )         _mm256_ternarylogic_epi64( a, b, b, 0x81 )
    
 #else

@@ -260,76 +250,6 @@ static inline __m256i mm256_not( const __m256i v )
 #define mm256_movmask_32( v ) \
   _mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) )

-
-// Diagonal blending
-
-// Blend 4 64 bit elements from 4 vectors
-#define mm256_diagonal_64( v3, v2, v1, v0 ) \
-  mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \
-                     _mm256_blend_epi32( v1, v0, 0x03 ), 0x0f )
-
-// Blend 8 32 bit elements from 8 vectors
-#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \
-  _mm256_blend_epi32( \
-        _mm256_blend_epi32( \
-               _mm256_blend_epi32( v7, v6, 0x40 ), \
-               _mm256_blend_epi32( v5, v4, 0x10 ) 0x30 ), \
-        _mm256_blend_epi32( \
-               _mm256_blend_epi32( v3, v2, 0x04) \
-               _mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f )  
-
-
-// Blend 4 32 bit elements from each 128 bit lane.
-#define mm256_diagonal128_32( v3, v2, v1, v0 ) \
-    _mm256_blend_epi32( \
-           _mm256_blend_epi32( v3, v2, 0x44) \
-           _mm256_blend_epi32( v1, v0, 0x11 ) )  
-
-/*
-//
-// Extended bit shift for concatenated packed elements from 2 vectors.
-// Shift right returns low half, shift left return high half.
-
-#if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
-
-#define mm256_shl2_64( v1, v2, c )    _mm256_shldi_epi64( v1, v2, c )
-#define mm256_shr2_64( v1, v2, c )    _mm256_shrdi_epi64( v1, v2, c )
-
-#define mm256_shl2_32( v1, v2, c )    _mm256_shldi_epi32( v1, v2, c )
-#define mm256_shr2_32( v1, v2, c )    _mm256_shrdi_epi32( v1, v2, c )
-
-#define mm256_shl2_16( v1, v2, c )    _mm256_shldi_epi16( v1, v2, c )
-#define mm256_shr2_16( v1, v2, c )    _mm256_shrdi_epi16( v1, v2, c )
-
-#else
-
-#define mm256_shl2i_64( v1, v2, c ) \
-                     _mm256_or_si256( _mm256_slli_epi64( v1, c ), \
-                                      _mm256_srli_epi64( v2, 64 - (c) ) )
-
-#define mm512_shr2_64( v1, v2, c ) \
-                    _mm256_or_si256( _mm256_srli_epi64( v2, c ), \
-                                     _mm256_slli_epi64( v1, 64 - (c) ) )
-
-#define mm256_shl2_32( v1, v2, c ) \
-                    _mm256_or_si256( _mm256_slli_epi32( v1, c ), \
-                                     _mm256_srli_epi32( v2, 32 - (c) ) )
-
-#define mm256_shr2_32( v1, v2, c ) \
-                    _mm256_or_si256( _mm256_srli_epi32( v2, c ), \
-                                     _mm256_slli_epi32( v1, 32 - (c) ) )
-
-#define mm256_shl2_16( v1, v2, c ) \
-                    _mm256_or_si256( _mm256_slli_epi16( v1, c ), \
-                                     _mm256_srli_epi16( v2, 16 - (c) ) )
-
-#define mm256_shr2_16( v1, v2, c ) \
-                    _mm256_or_si256( _mm256_srli_epi16( v2, c ), \
-                                     _mm256_slli_epi16( v1, 16 - (c) ) )
-
-#endif
-*/
-
 //
 //           Bit rotations.
 //
@@ -448,6 +368,16 @@ static inline __m256i mm256_not( const __m256i v )
 #define mm256_shufll_64( v )    _mm256_permute4x64_epi64( v, 0x93 )

 // Rotate 256 bit vector by one 32 bit element.
+#if defined(__AVX512VL__)
+
+static inline __m256i mm256_shuflr_32( const __m256i v )
+{ return _mm256_alignr_epi32( v, v, 1 ); }
+
+static inline __m256i mm256_shufll_32( const __m256i v )
+{ return _mm256_alignr_epi32( v, v, 15 ); }
+
+#else
+
 #define mm256_shuflr_32( v ) \
    _mm256_permutevar8x32_epi32( v, \
                     m256_const_64( 0x0000000000000007, 0x0000000600000005, \
@@ -458,6 +388,8 @@ static inline __m256i mm256_not( const __m256i v )
                     m256_const_64( 0x0000000600000005,  0x0000000400000003, \
                                    0x0000000200000001,  0x0000000000000007 ) )

+#endif
+
 //
 // Rotate elements within each 128 bit lane of 256 bit vector.

@@ -492,8 +424,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
  #define mm256_shuflr64_24( v )  _mm256_ror_epi64( v, 24 )
 #else
  #define mm256_shuflr64_24( v ) \
-    _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
-                                    0x0a09080f0e0d0c0b, 0x0201000706050403, \
+    _mm256_shuffle_epi8( v, m256_const2_64( \
                                    0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
 #endif

@@ -501,8 +432,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
  #define mm256_shuflr64_16( v )  _mm256_ror_epi64( v, 16 )
 #else
  #define mm256_shuflr64_16( v ) \
-    _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
-                                    0x09080f0e0d0c0b0a, 0x0100070605040302, \
+    _mm256_shuffle_epi8( v, m256_const2_64( \
                                    0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
 #endif

@@ -510,8 +440,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
  #define mm256_swap32_16( v )  _mm256_ror_epi32( v, 16 )
 #else
  #define mm256_swap32_16( v ) \
-    _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
-                                    0x0d0c0f0e09080b0a, 0x0504070601000302, \
+    _mm256_shuffle_epi8( v, m256_const2_64( \
                                    0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
 #endif
 #define mm256_shuflr32_16 mm256_swap32_16
@@ -526,35 +455,24 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
                                    0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
 #endif

-// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
-// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
-// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
-// needed for a shuffle that crosses 128 bit lanes. BSWAP doesn't therefore the
-// AVX2 version will work here. The bswap control vector is coded to work
-// with both versions, bit 4 is ignored in AVX2. 
-
 // Reverse byte order in elements, endian bswap.
 #define mm256_bswap_64( v ) \
   _mm256_shuffle_epi8( v, \
-         m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
-                        0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
+         m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )

 #define mm256_bswap_32( v ) \
   _mm256_shuffle_epi8( v, \
-         m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
-                        0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
+         m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )

 #define mm256_bswap_16( v ) \
   _mm256_shuffle_epi8( v, \
-         m256_const_64( 0x1e1f1c1d1a1b1819, 0x1617141512131011, \
-                        0x0e0f0c0d0a0b0809, 0x0607040502030001, ) )
+         m256_const2_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001, ) )

 // Source and destination are pointers, may point to same memory.
 // 8 byte qword * 8 qwords * 4 lanes = 256 bytes
 #define mm256_block_bswap_64( d, s ) do \
 { \
-  __m256i ctl = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
-                               0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
+  __m256i ctl = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
  casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
  casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
  casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -568,8 +486,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 // 4 byte dword * 8 dwords * 8 lanes = 256 bytes
 #define mm256_block_bswap_32( d, s ) do \
 { \
-  __m256i ctl = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
-                               0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+  __m256i ctl = m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
  casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
  casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
  casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -185,8 +185,16 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
 #define m512_one_16     m512_const1_16( 1 )
 #define m512_one_8      m512_const1_8( 1 )

-//#define m512_neg1 m512_const1_64( 0xffffffffffffffff )
-#define m512_neg1 _mm512_movm_epi64( 0xff )
+// use asm to avoid compiler warning for unitialized local
+static inline __m512i mm512_neg1_fn()
+{
+   __m512i a;
+   asm( "vpternlogq $0xff, %0, %0, %0\n\t" : "=x"(a) );
+   return a;
+}
+#define m512_neg1 mm512_neg1_fn()                          // 1 clock
+//#define m512_neg1 m512_const1_64( 0xffffffffffffffff )   // 5 clocks
+//#define m512_neg1 _mm512_movm_epi64( 0xff )              // 2 clocks

 //
 // Basic operations without SIMD equivalent
@@ -195,11 +203,12 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
 static inline __m512i mm512_not( const __m512i x )
 {  return _mm512_ternarylogic_epi64( x, x, x, 1 ); }

+/*
 // Unary negation: -x
 #define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
 #define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )  
 #define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )  
-
+*/

 //
 // Pointer casting
@@ -253,119 +262,43 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 // expression using any number or combinations of AND, OR, XOR, NOT.

 // a ^ b ^ c
-#define mm512_xor3( a, b, c ) \
-   _mm512_ternarylogic_epi64( a, b, c, 0x96 )
+#define mm512_xor3( a, b, c )      _mm512_ternarylogic_epi64( a, b, c, 0x96 )

 // legacy convenience only
-#define mm512_xor4( a, b, c, d ) \
-   _mm512_xor_si512( a, mm512_xor3( b, c, d ) )
+#define mm512_xor4( a, b, c, d )   _mm512_xor_si512( a, mm512_xor3( b, c, d ) )

 // a & b & c
-#define mm512_and3( a, b, c ) \
-   _mm512_ternarylogic_epi64( a, b, c, 0x80 )
+#define mm512_and3( a, b, c )      _mm512_ternarylogic_epi64( a, b, c, 0x80 )

 // a | b | c
-#define mm512_or3( a, b, c ) \
-   _mm512_ternarylogic_epi64( a, b, c, 0xfe )
+#define mm512_or3( a, b, c )       _mm512_ternarylogic_epi64( a, b, c, 0xfe )

 // a ^ ( b & c )
-#define mm512_xorand( a, b, c ) \
-   _mm512_ternarylogic_epi64( a, b, c, 0x78 )
+#define mm512_xorand( a, b, c )    _mm512_ternarylogic_epi64( a, b, c, 0x78 )

 // a & ( b ^ c )
-#define mm512_andxor( a, b, c ) \
-   _mm512_ternarylogic_epi64( a, b, c, 0x60 )
+#define mm512_andxor( a, b, c )    _mm512_ternarylogic_epi64( a, b, c, 0x60 )

 // a ^ ( b | c )
-#define mm512_xoror( a, b, c ) \
-   _mm512_ternarylogic_epi64( a, b, c, 0x1e )
+#define mm512_xoror( a, b, c )     _mm512_ternarylogic_epi64( a, b, c, 0x1e )

 // a ^ ( ~b & c ),     xor( a, andnot( b, c ) )
-#define mm512_xorandnot( a, b, c ) \
-  _mm512_ternarylogic_epi64( a, b, c, 0xd2 ) 
+#define mm512_xorandnot( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xd2 ) 

 // a | ( b & c )
-#define mm512_orand( a, b, c ) \
-   _mm512_ternarylogic_epi64( a, b, c, 0xf8  )
+#define mm512_orand( a, b, c )     _mm512_ternarylogic_epi64( a, b, c, 0xf8 )

 // Some 2 input operations that don't have their own instruction mnemonic.
+// Use with caution, args are not expression safe.

 // ~( a | b ),  (~a) & (~b)
-#define mm512_nor( a, b ) \
-   _mm512_ternarylogic_epi64( a, b, b, 0x01  )
+#define mm512_nor( a, b )          _mm512_ternarylogic_epi64( a, b, b, 0x01 )

 // ~( a ^ b ),  (~a) ^ b
-#define mm512_xnor( a, b ) \
-   _mm512_ternarylogic_epi64( a, b, b, 0x81  )
+#define mm512_xnor( a, b )         _mm512_ternarylogic_epi64( a, b, b, 0x81 )

 // ~( a & b )
-#define mm512_nand( a, b ) \
-   _mm512_ternarylogic_epi64( a, b, b, 0xef  )
-
-/*
-// Diagonal blending
-// Blend 8 64 bit elements from 8 vectors
-#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
-  _mm512_mask_blend_epi64( 0x0f, \
-        _mm512_mask_blend_epi64( 0x30, \
-               _mm512_mask_blend_epi64( 0x40, v7, v6 ), \
-               _mm512_mask_blend_epi64( 0x40, v5, v4 ) ), \
-        _mm512_mask_blend_epi64( 0x03, \
-               _mm512_mask_blend_epi64( 0x04, v3, v2 ) \
-               _mm512_mask_blend_epi64( 0x01, v1, v0 ) ) )  
-
-
-// Blend 4 32 bit elements from each 128 bit lane.
-#define mm512_diagonal128_32( v3, v2, v1, v0 ) \
-    _mm512_mask_blend_epi32( 0x3333, \
-           _mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
-           _mm512_mask_blend_epi32( 0x1111, v1, v0 ) )  
-*/
-
-/*
-//
-// Extended bit shift of concatenated packed elements from 2 vectors.
-// Shift right returns low half, shift left returns high half.
-
-#if defined(__AVX512VBMI2__)
-
-#define mm512_shl2_64( v1, v2, c )    _mm512_shldi_epi64( v1, v2, c )
-#define mm512_shr2_64( v1, v2, c )    _mm512_shrdi_epi64( v1, v2, c )
-
-#define mm512_shl2_32( v1, v2, c )    _mm512_shldi_epi32( v1, v2, c )
-#define mm512_shr2_32( v1, v2, c )    _mm512_shrdi_epi32( v1, v2, c )
-
-#define mm512_shl2_16( v1, v2, c )    _mm512_shldi_epi16( v1, v2, c )
-#define mm512_shr2_16( v1, v2, c )    _mm512_shrdi_epi16( v1, v2, c )
-
-#else
-
-#define mm512_shl2_64( v1, v2, c ) \
-                     _mm512_or_si512( _mm512_slli_epi64( v1, c ), \
-                                      _mm512_srli_epi64( v2, 64 - (c) ) )
-
-#define mm512_shr2_64( v1, v2, c ) \
-                     _mm512_or_si512( _mm512_srli_epi64( v2, c ), \
-                                      _mm512_slli_epi64( v1, 64 - (c) ) )
-
-#define mm512_shl2_32( v1, v2, c ) \
-                     _mm512_or_si512( _mm512_slli_epi32( v1, c ), \
-                                      _mm512_srli_epi32( v2, 32 - (c) ) )
-
-#define mm512_shr2_32( v1, v2, c ) \
-                     _mm512_or_si512( _mm512_srli_epi32( v2, c ), \
-                                      _mm512_slli_epi32( v1, 32 - (c) ) )
-
-#define mm512_shl2_16( v1, v2, c ) \
-                     _mm512_or_si512( _mm512_slli_epi16( v1, c ), \
-                                      _mm512_srli_epi16( v2, 16 - (c) ) )
-
-#define mm512_shr2_16( v1, v2, c ) \
-                     _mm512_or_si512( _mm512_srli_epi16( v2, c ), \
-                                      _mm512_slli_epi16( v1, 16 - (c) ) )
-
-#endif
-*/
+#define mm512_nand( a, b )         _mm512_ternarylogic_epi64( a, b, b, 0xef )

 // Bit rotations.

@@ -382,19 +315,6 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 #define mm512_ror_32 _mm512_ror_epi32
 #define mm512_rol_32 _mm512_rol_epi32

-/*
-#if defined(__AVX512VBMI2__)
-
-// Use C inline function in case arg is coded as an expression.
-static inline __m512i mm512_ror_16( __m512i v, int c )
-{ return _mm512_shrdi_epi16( v, v, c ); }
-
-static inline __m512i mm512_rol_16( __m512i v, int c )
-{ return _mm512_shldi_epi16( v, v, c ); }
-
-#endif
-*/
-
 //
 // Reverse byte order of packed elements, vectorized endian conversion.

@@ -455,30 +375,10 @@ static inline __m512i mm512_rol_16( __m512i v, int c )
 } while(0)


-// Cross-lane shuffles implementing rotate & shift of packed elements.
-//
-
-#define mm512_shiftr_256( v ) \
-  _mm512_alignr_epi64( _mm512_setzero, v, 4 )
-#define mm512_shiftl_256( v ) mm512_shifr_256
-
-#define mm512_shiftr_128( v ) \
-  _mm512_alignr_epi64( _mm512_setzero, v, 2 )
-#define mm512_shiftl_128( v ) \
-  _mm512_alignr_epi64( v,  _mm512_setzero, 6 )
-
-#define mm512_shiftr_64( v ) \
-  _mm512_alignr_epi64( _mm512_setzero, v, 1 )
-#define mm512_shiftl_64( v ) \
-  _mm512_alignr_epi64( v, _mm512_setzero, 7 )
-
-#define mm512_shiftr_32( v ) \
-  _mm512_alignr_epi32( _mm512_setzero, v, 1 )
-#define mm512_shiftl_32( v ) \
-  _mm512_alignr_epi32( v, _mm512_setzero, 15 )
-
-// Shuffle-rotate elements left or right in 512 bit vector.
+// Cross-lane shuffles implementing rotation of packed elements.
+// 

+// Rotate elements across entire vector.
 static inline __m512i mm512_swap_256( const __m512i v )
 { return _mm512_alignr_epi64( v, v, 4 ); }
 #define mm512_shuflr_256( v ) mm512_swap_256
@@ -512,16 +412,16 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 #define mm512_shuflr_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                       0x0000001F001E001D, 0x001C001B001A0019, \
-                       0X0018001700160015, 0X0014001300120011, \
-                       0X0010000F000E000D, 0X000C000B000A0009, \
-                       0X0008000700060005, 0X0004000300020001 ), v )
+                       0x0018001700160015, 0x0014001300120011, \
+                       0x0010000F000E000D, 0x000C000B000A0009, \
+                       0x0008000700060005, 0x0004000300020001 ), v )

 #define mm512_shufll_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                       0x001E001D001C001B, 0x001A001900180017, \
-                       0X0016001500140013, 0X001200110010000F, \
-                       0X000E000D000C000B, 0X000A000900080007, \
-                       0X0006000500040003, 0X000200010000001F ), v )
+                       0x0016001500140013, 0x001200110010000F, \
+                       0x000E000D000C000B, 0x000A000900080007, \
+                       0x0006000500040003, 0x000200010000001F ), v )

 #define mm512_shuflr_8( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
@@ -537,7 +437,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
                       0x1E1D1C1B1A191817, 0x161514131211100F, \
                       0x0E0D0C0B0A090807, 0x060504030201003F ) )

-//
+// 256 bit lanes used only by lyra2, move these there
 // Rotate elements within 256 bit lanes of 512 bit vector.

 // Swap hi & lo 128 bits in each 256 bit lane
@@ -549,6 +449,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 #define mm512_shuflr256_64( v )     _mm512_permutex_epi64( v, 0x39 )
 #define mm512_shufll256_64( v )     _mm512_permutex_epi64( v, 0x93 )

+/*
 // Rotate 256 bit lanes by one 32 bit element
 #define mm512_shuflr256_32( v ) \
   _mm512_permutexvar_epi32( m512_const_64( \
@@ -591,7 +492,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
                     0x2e2d2c2b2a292827, 0x262524232221203f, \
                     0x1e1d1c1b1a191817, 0x161514131211100f, \
                     0x0e0d0c0b0a090807, 0x060504030201001f ) )
-
+*/
 //
 // Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
 
--- a/simd-utils/simd-64.h
+++ b/simd-utils/simd-64.h
@@ -34,10 +34,12 @@
 //#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
 #define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) )

+/*      
 // Unary negate elements
 #define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v )
 #define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v )
 #define mm64_negate_8(  v ) _mm_sub_pi8(  m64_zero, v )
+*/

 // Rotate bits in packed elements of 64 bit vector
 #define mm64_rol_64( a, n ) \
--- a/util.c
+++ b/util.c
@@ -44,28 +44,22 @@
 #include <libgen.h>
 #endif

-//#include "miner.h"
 #include "elist.h"
 #include "algo-gate-api.h"
 #include "algo/sha/sha256d.h"

-//extern pthread_mutex_t stats_lock;
-
-struct data_buffer {
-	void		*buf;
-	size_t		len;
-};
-
-struct upload_buffer {
-	const void	*buf;
-	size_t		len;
-	size_t		pos;
-};
-
 struct header_info {
 	char		*lp_path;
 	char		*reason;
 	char		*stratum_url;
+   size_t	content_length;
+};
+
+struct data_buffer {
+	void			*buf;
+	size_t			len;
+	size_t			allocated;
+	struct header_info	*headers;
 };

 struct tq_ent {
@@ -127,7 +121,6 @@ void applog2( int prio, const char *fmt, ... )
      int len;
 //    struct tm tm;
 //    time_t now = time(NULL);
-
 //    localtime_r(&now, &tm);

      switch ( prio )
@@ -395,67 +388,53 @@ static void databuf_free(struct data_buffer *db)
 static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb,
 			  void *user_data)
 {
-	struct data_buffer *db = (struct data_buffer *) user_data;
+	struct data_buffer *db = user_data;
 	size_t len = size * nmemb;
-	size_t oldlen, newlen;
+	size_t newalloc, reqalloc;
 	void *newmem;
 	static const unsigned char zero = 0;
+	static const size_t max_realloc_increase = 8 * 1024 * 1024;
+	static const size_t initial_alloc = 16 * 1024;

-	oldlen = db->len;
-	newlen = oldlen + len;
+	/* minimum required allocation size */
+	reqalloc = db->len + len + 1;

-	newmem = realloc(db->buf, newlen + 1);
-	if (!newmem)
-		return 0;
+	if (reqalloc > db->allocated) {
+		if (db->len > 0) {
+			newalloc = db->allocated * 2;
+		} else {
+			if (db->headers->content_length > 0)
+				newalloc = db->headers->content_length + 1;
+			else
+				newalloc = initial_alloc;
+		}

-	db->buf = newmem;
-	db->len = newlen;
-	memcpy((uchar*) db->buf + oldlen, ptr, len);
-	memcpy((uchar*) db->buf + newlen, &zero, 1);	/* null terminate */
+		if (db->headers->content_length == 0) {
+			/* limit the maximum buffer increase */
+			if (newalloc - db->allocated > max_realloc_increase)
+				newalloc = db->allocated + max_realloc_increase;
+		}
+
+		/* ensure we have a big enough allocation */
+		if (reqalloc > newalloc)
+			newalloc = reqalloc;
+
+		newmem = realloc(db->buf, newalloc);
+		if (!newmem)
+			return 0;
+
+		db->buf = newmem;
+		db->allocated = newalloc;
+	}
+
+	memcpy(db->buf + db->len, ptr, len); /* append new data */
+	memcpy(db->buf + db->len + len, &zero, 1); /* null terminate */
+
+	db->len += len;

 	return len;
 }

-static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb,
-			     void *user_data)
-{
-	struct upload_buffer *ub = (struct upload_buffer *) user_data;
-	size_t len = size * nmemb;
-
-	if (len > ub->len - ub->pos)
-		len = ub->len - ub->pos;
-
-	if (len) {
-		memcpy(ptr, ((uchar*)ub->buf) + ub->pos, len);
-		ub->pos += len;
-	}
-
-	return len;
-}
-
-#if LIBCURL_VERSION_NUM >= 0x071200
-static int seek_data_cb(void *user_data, curl_off_t offset, int origin)
-{
-	struct upload_buffer *ub = (struct upload_buffer *) user_data;
-	
-	switch (origin) {
-	case SEEK_SET:
-		ub->pos = (size_t) offset;
-		break;
-	case SEEK_CUR:
-		ub->pos += (size_t) offset;
-		break;
-	case SEEK_END:
-		ub->pos = ub->len + (size_t) offset;
-		break;
-	default:
-		return 1; /* CURL_SEEKFUNC_FAIL */
-	}
-
-	return 0; /* CURL_SEEKFUNC_OK */
-}
-#endif
-
 static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
 {
 	struct header_info *hi = (struct header_info *) user_data;
@@ -505,6 +484,9 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
 		val = NULL;
 	}

+	if (!strcasecmp("Content-Length", key))
+		hi->content_length = strtoul(val, NULL, 10);
+
 out:
 	free(key);
 	free(val);
@@ -564,48 +546,37 @@ json_t *json_rpc_call(CURL *curl, const char *url,
 	int rc;
 	long http_rc;
 	struct data_buffer all_data = {0};
-	struct upload_buffer upload_data;
 	char *json_buf;
 	json_error_t err;
 	struct curl_slist *headers = NULL;
-	char len_hdr[64];
 	char curl_err_str[CURL_ERROR_SIZE] = { 0 };
 	long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30;
 	struct header_info hi = {0};

 	/* it is assumed that 'curl' is freshly [re]initialized at this pt */

-	if (opt_protocol)
-		curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
+	if (opt_protocol)  curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
 	curl_easy_setopt(curl, CURLOPT_URL, url);
-	if (opt_cert)
-		curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
-//
-        curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
-
+	if (opt_cert)      curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
+   curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
 	curl_easy_setopt(curl, CURLOPT_ENCODING, "");
 	curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
 	curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
 	curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
 	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb);
 	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
-	curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb);
-	curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data);
-#if LIBCURL_VERSION_NUM >= 0x071200
-	curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb);
-	curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data);
-#endif
-	curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
-	if (opt_redirect)
-		curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
+   curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
+	if (opt_redirect)  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
 	curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
 	curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb);
 	curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi);
-	if (opt_proxy) {
+	if (opt_proxy)
+   {
 		curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy);
 		curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type);
 	}
-	if (userpass) {
+	if (userpass)
+   {
 		curl_easy_setopt(curl, CURLOPT_USERPWD, userpass);
 		curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
 	}
@@ -613,23 +584,16 @@ json_t *json_rpc_call(CURL *curl, const char *url,
 	if (flags & JSON_RPC_LONGPOLL)
 		curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
 #endif
-	curl_easy_setopt(curl, CURLOPT_POST, 1);
+   curl_easy_setopt(curl, CURLOPT_POSTFIELDS, rpc_req);

 	if (opt_protocol)
 		applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req);

-	upload_data.buf = rpc_req;
-	upload_data.len = strlen(rpc_req);
-	upload_data.pos = 0;
-	sprintf(len_hdr, "Content-Length: %lu",
-		(unsigned long) upload_data.len);
-
 	headers = curl_slist_append(headers, "Content-Type: application/json");
-	headers = curl_slist_append(headers, len_hdr);
 	headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
 	headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll reject-reason");
-	//headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/
-	//headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/
+	//headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr
+	//headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr

 	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);

@@ -786,18 +750,26 @@ err_out:
 	return cfg;
 }

-// Segwit BEGIN
 void memrev(unsigned char *p, size_t len)
 {
-   unsigned char c, *q;
-   for (q = p + len - 1; p < q; p++, q--) {
-      c = *p;
-      *p = *q;
-      *q = c;
+   if ( len == 32 )
+   {
+      __m128i *pv = (__m128i*)p;
+      __m128i t = mm128_bswap_128( pv[0] );
+      pv[0] =     mm128_bswap_128( pv[1] );   
+      pv[1] = t;
+   }
+   else
+   {
+      unsigned char c, *q;
+      for (q = p + len - 1; p < q; p++, q--) 
+      {
+         c = *p;
+         *p = *q;
+         *q = c;
+      }
   }
 }
-// Segwit END
-

 void cbin2hex(char *out, const char *in, size_t len)
 {
@@ -832,32 +804,42 @@ char *bebin2hex(const unsigned char *p, size_t len)
   return s;
 }

-bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
+bool hex2bin( unsigned char *p, const char *hexstr, const size_t len )
 {
-	char hex_byte[3];
-	char *ep;
+	if( hexstr == NULL )	return false;

-	hex_byte[2] = '\0';
-
-	while (*hexstr && len) {
-		if (!hexstr[1]) {
-			applog(LOG_ERR, "hex2bin str truncated");
-			return false;
-		}
-		hex_byte[0] = hexstr[0];
-		hex_byte[1] = hexstr[1];
-		*p = (unsigned char) strtol(hex_byte, &ep, 16);
-		if (*ep) {
-			applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte);
-			return false;
-		}
-		p++;
-		hexstr += 2;
-		len--;
+	size_t hexstr_len = strlen( hexstr );
+	if( ( hexstr_len % 2 ) != 0 )
+   {
+		applog( LOG_ERR, "hex2bin string truncated" );
+		return false;
+	}
+	size_t bin_len = hexstr_len / 2;
+	if ( bin_len > len )
+   {
+		applog( LOG_ERR, "hex2bin buffer too small" );
+		return false;
 	}

-	return(!len) ? true : false;
-/*	return (len == 0 && *hexstr == 0) ? true : false; */
+	memset( p, 0, len );
+	size_t i = 0;
+	while ( i < hexstr_len )
+   {
+		char c = hexstr[i];
+		unsigned char nibble;
+		if      ( c >= '0' && c <= '9' )	 nibble = (c - '0');
+		else if ( c >= 'A' && c <= 'F' )	 nibble = ( 10 + (c - 'A') );
+		else if ( c >= 'a' && c <= 'f' )	 nibble = ( 10 + (c - 'a') );
+		else
+      {
+			applog( LOG_ERR, "hex2bin invalid hex" );
+			return false;
+		}
+		p[(i / 2)] |= (nibble << ( (1 - (i % 2) ) * 4) );
+		i++;
+	}
+
+	return true;
 }

 int varint_encode(unsigned char *p, uint64_t n)
@@ -1339,6 +1321,43 @@ inline bool valid_hash( const void *hash, const void *target )

 #endif 

+inline double nbits_to_diff( uint32_t nbits )
+{
+   long double diff;
+   uint32_t shift = nbits & 0xff;
+   uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
+   int shift_off = (int)shift - 29;
+
+   // diff = ( (2**16 -1) / ( 256**shift_off * bits )
+   // With uint128 byte shift is good for 16 <= shift <= 41. As unlikely
+   // as this may seem necessary, check just in case.
+
+   if ( shift_off >= -13 && shift_off <= 12 ) 
+   {  // fast
+      if ( shift_off == 0 )
+         diff = (long double)0xffff / (long double)bits;
+      else if ( shift_off < 0 )   // shift < 29
+         diff = (long double)( (uint128_t)0xffff << ( (-shift_off) *8 ) ) 
+              / (long double)bits;
+      else // ( shift_off > 0 )   // shift > 29
+         diff =   (long double)0xffff
+                / (long double)( (uint128_t)bits << ( shift_off*8 ) );  
+   }
+   else
+   {  // slow
+      int m;
+      diff = 0.;
+      for ( m = shift; m < 29; m++ )    diff *= 256.0;
+      for ( m = 29; m < shift; m++ )    diff /= 256.0;
+   }
+
+   if ( opt_debug )
+      applog( LOG_INFO, "nbits %08x: shift %u(%d), bits %06x, diff %8g",
+                         nbits, shift, shift_off, bits, (double)diff );
+
+   return (double)diff;
+}
+
 #ifdef WIN32
 #define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK)
 #else
@@ -1507,7 +1526,8 @@ out:
 	return sret;
 }

-#if LIBCURL_VERSION_NUM >= 0x071101
+#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
+//#if LIBCURL_VERSION_NUM >= 0x071101
 static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose,
 	struct curl_sockaddr *addr)
 {
@@ -1575,7 +1595,8 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
 #if LIBCURL_VERSION_NUM >= 0x070f06
 	curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
 #endif
-#if LIBCURL_VERSION_NUM >= 0x071101
+#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
+//#if LIBCURL_VERSION_NUM >= 0x071101
 	curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb);
 	curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock);
 #endif
@@ -1589,7 +1610,10 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
 		return false;
 	}

-#if LIBCURL_VERSION_NUM < 0x071101
+#if LIBCURL_VERSION_NUM >= 0x072d00
+	curl_easy_getinfo(curl, CURLINFO_ACTIVESOCKET, &sctx->sock);
+#elif LIBCURL_VERSION_NUM < 0x071101   
+//#if LIBCURL_VERSION_NUM < 0x071101
 	/* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */
 	curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock);
 #endif
@@ -1885,7 +1909,8 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)

 	// find 0xffff tag
 	p = (uint8_t*) sctx->job.coinbase + 32;
-	m = p + 128;
+   m = p + sctx->job.coinbase_size - 32 - 2;
+//   m = p + 128;
 	while (*p != 0xff && p < m) p++;
 	while (*p == 0xff && p < m) p++;
 	if (*(p-1) == 0xff && *(p-2) == 0xff) {
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -17,7 +17,9 @@ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
 # used by GCC
 export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
 # Support for Windows 7 CPU groups, AES sometimes not included in -march
-export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
+# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
+#export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
+export DEFAULT_CFLAGS="-maes -O3 -Wall"
 export DEFAULT_CFLAGS_OLD="-O3 -Wall"

 # make link to local gmp header file.
Author	SHA1	Message	Date
Jay D Dee	3dd6787531	v3.22.0	2023-03-21 17:12:51 -04:00
Jay D Dee	cae1ce2ab7	v3.21.5	2023-03-15 12:27:04 -04:00
Jay D Dee	7a91c41d74	v3.21.4	2023-03-13 14:54:38 -04:00
Jay D Dee	c6bc9d67fb	v3.21.3 Unreleased	2023-03-13 03:20:13 -04:00
Jay D Dee	b339450898	v3.21.3	2023-03-11 14:54:49 -05:00
Jay D Dee	fb93160641	v3.21.2	2023-03-03 12:38:31 -05:00
Jay D Dee	520d4d5384	v3.21.1	2023-02-08 22:11:05 -05:00