Compare commits

...

3 Commits

Author      SHA1        Message  Date
Jay D Dee   fb93160641  v3.21.2  2023-03-03 12:38:31 -05:00
Jay D Dee   520d4d5384  v3.21.1  2023-02-08 22:11:05 -05:00
Jay D Dee   da7030faa8  v3.21.0  2022-12-21 13:09:14 -05:00
48 changed files with 4406 additions and 3923 deletions

View File

@@ -1,4 +1,6 @@
These instructions may be out of date, see the Wiki for the latest...
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
1. Requirements:
---------------
@@ -35,7 +37,7 @@ SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
openssl 1.1.0e or higher.
znver1 and znver2 should be recognized on most recent version of GCC and
znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
znver3 is available with GCC 11. GCC 11 also includes rocketlake support.
In the meantime here are some suggestions to compile with new CPUs:
"-march=native" is usually the best choice, used by build.sh.

View File

@@ -1,6 +1,6 @@
Instructions for compiling cpuminer-opt for Windows.
Thwaw intructions nay be out of date. Please consult the wiki for
These instructions are out of date. Please consult the wiki for
the latest:
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source

View File

@@ -74,53 +74,50 @@ Supported Algorithms
argon2d250 argon2d-crds, Credits (CRDS)
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
axiom Shabal-256 MemoHash
blake Blake-256 (SFR)
blake2b Blake2b 256
blake2s Blake-2 S
blake Blake-256
blake2b Blake2-512
blake2s Blake2-256
blakecoin blake256r8
bmw BMW 256
bmw512 BMW 512
c11 Chaincoin
c11
decred
deep Deepcoin (DCN)
dmd-gr Diamond-Groestl
groestl Groestl coin
hex x16r-hex
hmq1725 Espers
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
lbry LBC, LBRY Credits
luffa Luffa
lyra2h Hppcoin
lyra2h
lyra2re lyra2
lyra2rev2 lyra2v2
lyra2rev3 lyrav2v3
lyra2z
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
m7m Magi (XMG)
minotaur Ringcoin (RNG)
lyra2z330
m7m
minotaur
minotaurx
myr-gr Myriad-Groestl
neoscrypt NeoScrypt(128, 2, 1)
nist5 Nist5
pentablake Pentablake
phi1612 phi
phi2 Luxcoin (LUX)
phi2-lux identical to phi2
pluck Pluck:128 (Supcoin)
phi2
polytimos Ninja
power2b MicroBitcoin (MBC)
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256q Quad SHA-256, Pyrite (PYE)
sha256t Triple SHA-256, Onecoin (OC)
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -136,17 +133,17 @@ Supported Algorithms
x11 Dash
x11evo Revolvercoin
x11gost sib (SibCoin)
x12 Galaxie Cash (GCH)
x13 X13
x12
x13
x13bcd bcd
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x14
x15
x16r
x16rv2
x16rt Gincoin (GIN)
x16rt-veil Veil (VEIL)
x16s Pigeoncoin (PGN)
x16rt
x16rt-veil veil
x16s
x17
x21s
x22i

View File

@@ -73,7 +73,6 @@ third party packages. They often will work and may be used instead of the
included version of the files.
If you like this software feel free to donate:
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT

View File

@@ -65,6 +65,22 @@ If not what makes it happen or not happen?
Change Log
----------
v3.21.2
Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
Fixed a couple of compiler warnings with gcc-12.
v3.21.1
Fixed a segfault in some obsolete algos.
Small optimizations to Hamsi & Shabal AVX2 & AVX512.
v3.21.0
Added minotaurx algo for stratum only.
Blake256 & sha256 prehash optimized to ignore zero-padded data for AVX2 & AVX512.
Other small improvements.
v3.20.3
Faster c11 algo: AVX512 6%, AVX2 4%, AVX2+VAES 15%.
@@ -98,12 +114,9 @@ v3.19.8
#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
url protocol specifier for requesting a secure stratum connection.
The full url, including the protocol, is now displayed in the stratum connect
log and the periodic summary log.
Small optimizations to Cubehash, AVX2 & AVX512.
Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
v3.19.7
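
The v3.19.8 entry above documents the "stratum+ssl" specifier. As a quick illustration (pool host, port, wallet and algorithm are placeholders, not taken from this diff), a secure stratum connection can be requested like:

   ./cpuminer -a scryptn2 -o stratum+ssl://pool.example.com:4443 -u YOUR_WALLET -p x

"stratum+tcps" remains accepted as an equivalent specifier, and the full URL, including the protocol, appears in the stratum connect log and the periodic summary log.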

aclocal.m4 (vendored, 83 lines changed)
View File

@@ -1,6 +1,6 @@
# generated automatically by aclocal 1.16.1 -*- Autoconf -*-
# generated automatically by aclocal 1.16.5 -*- Autoconf -*-
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -14,13 +14,13 @@
m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
[m4_warning([this file was generated for autoconf 2.69.
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.71],,
[m4_warning([this file was generated for autoconf 2.71.
You have another version of autoconf. It may work, but is not guaranteed to.
If you have problems, you may need to regenerate the build system entirely.
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
# Copyright (C) 2002-2018 Free Software Foundation, Inc.
# Copyright (C) 2002-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
[am__api_version='1.16'
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
dnl require some minimum version. Point them to the right macro.
m4_if([$1], [1.16.1], [],
m4_if([$1], [1.16.5], [],
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
])
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
[AM_AUTOMAKE_VERSION([1.16.1])dnl
[AM_AUTOMAKE_VERSION([1.16.5])dnl
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
# Figure out how to run the assembler. -*- Autoconf -*-
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`
# AM_CONDITIONAL -*- Autoconf -*-
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
Usually this means the macro was only invoked conditionally.]])
fi])])
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -352,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
# Generate code to set up dependency tracking. -*- Autoconf -*-
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -391,7 +391,9 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
done
if test $am_rc -ne 0; then
AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
for automatic dependency tracking. Try re-running configure with the
for automatic dependency tracking. If GNU make was not used, consider
re-running the configure script with MAKE="gmake" (or whatever is
necessary). You can also try re-running configure with the
'--disable-dependency-tracking' option to at least be able to build
the package (albeit without support for automatic dependency tracking).])
fi
@@ -418,7 +420,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
# Do all the work for Automake. -*- Autoconf -*-
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -446,6 +448,10 @@ m4_defn([AC_PROG_CC])
# release and drop the old call support.
AC_DEFUN([AM_INIT_AUTOMAKE],
[AC_PREREQ([2.65])dnl
m4_ifdef([_$0_ALREADY_INIT],
[m4_fatal([$0 expanded multiple times
]m4_defn([_$0_ALREADY_INIT]))],
[m4_define([_$0_ALREADY_INIT], m4_expansion_stack)])dnl
dnl Autoconf wants to disallow AM_ names. We explicitly allow
dnl the ones we care about.
m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
@@ -482,7 +488,7 @@ m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
[_AM_SET_OPTIONS([$1])dnl
dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
m4_if(
m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]),
m4_ifset([AC_PACKAGE_NAME], [ok]):m4_ifset([AC_PACKAGE_VERSION], [ok]),
[ok:ok],,
[m4_fatal([AC_INIT should be called with package and version arguments])])dnl
AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
@@ -534,6 +540,20 @@ AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
[m4_define([AC_PROG_OBJCXX],
m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
])
# Variables for tags utilities; see am/tags.am
if test -z "$CTAGS"; then
CTAGS=ctags
fi
AC_SUBST([CTAGS])
if test -z "$ETAGS"; then
ETAGS=etags
fi
AC_SUBST([ETAGS])
if test -z "$CSCOPE"; then
CSCOPE=cscope
fi
AC_SUBST([CSCOPE])
AC_REQUIRE([AM_SILENT_RULES])dnl
dnl The testsuite driver may need to know about EXEEXT, so add the
dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This
@@ -615,7 +635,7 @@ for _am_header in $config_headers :; do
done
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -636,7 +656,7 @@ if test x"${install_sh+set}" != xset; then
fi
AC_SUBST([install_sh])])
# Copyright (C) 2003-2018 Free Software Foundation, Inc.
# Copyright (C) 2003-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -658,7 +678,7 @@ AC_SUBST([am__leading_dot])])
# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
# From Jim Meyering
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -693,7 +713,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
# Check to see how 'make' treats includes. -*- Autoconf -*-
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -736,7 +756,7 @@ AC_SUBST([am__quote])])
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -757,12 +777,7 @@ AC_DEFUN([AM_MISSING_HAS_RUN],
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
AC_REQUIRE_AUX_FILE([missing])dnl
if test x"${MISSING+set}" != xset; then
case $am_aux_dir in
*\ * | *\ *)
MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
*)
MISSING="\${SHELL} $am_aux_dir/missing" ;;
esac
MISSING="\${SHELL} '$am_aux_dir/missing'"
fi
# Use eval to expand $SHELL
if eval "$MISSING --is-lightweight"; then
@@ -775,7 +790,7 @@ fi
# Helper functions for option handling. -*- Autoconf -*-
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -804,7 +819,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
AC_DEFUN([_AM_IF_OPTION],
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -851,7 +866,7 @@ AC_LANG_POP([C])])
# For backward compatibility.
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -870,7 +885,7 @@ AC_DEFUN([AM_RUN_LOG],
# Check to make sure that the build environment is sane. -*- Autoconf -*-
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -951,7 +966,7 @@ AC_CONFIG_COMMANDS_PRE(
rm -f conftest.file
])
# Copyright (C) 2009-2018 Free Software Foundation, Inc.
# Copyright (C) 2009-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1011,7 +1026,7 @@ AC_SUBST([AM_BACKSLASH])dnl
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
])
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1039,7 +1054,7 @@ fi
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
AC_SUBST([INSTALL_STRIP_PROGRAM])])
# Copyright (C) 2006-2018 Free Software Foundation, Inc.
# Copyright (C) 2006-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1058,7 +1073,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
# Check how to create a tarball. -*- Autoconf -*-
# Copyright (C) 2004-2018 Free Software Foundation, Inc.
# Copyright (C) 2004-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,

View File

@@ -327,6 +327,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_LYRA2Z330: rc = register_lyra2z330_algo ( gate ); break;
case ALGO_M7M: rc = register_m7m_algo ( gate ); break;
case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break;
case ALGO_MINOTAURX: rc = register_minotaur_algo ( gate ); break;
case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break;
case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break;
case ALGO_NIST5: rc = register_nist5_algo ( gate ); break;

View File

@@ -115,7 +115,7 @@ void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
@@ -178,7 +178,7 @@ void blake256_16way_close(void *cc, void *dst);
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
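
These prototypes define a two-stage interface: the round-0 prehash runs once per work unit, then the final rounds run once per batch of nonces using the cached midstate. The const qualifier is dropped from the prehash's data argument because, as the implementation later in this diff shows, the prehash now writes the constant padding words and the cached M[13]^CSC value into the caller's message buffer. Below is a minimal sketch of the intended calling pattern for the 8-way (AVX2) variant; the scan loop, buffer sizes and block2 layout are assumptions for illustration, while the two prototypes and the "nonces in message word 3" convention come from this diff. The 16-way AVX-512 variant follows the same pattern with __m512i buffers.

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
                                      void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
                                    const void *midhash, const void *data );

void scan_sketch( uint32_t first_nonce, uint32_t last_nonce,
                  const __m256i midhash[8],  // chaining value from block 1
                  const __m256i block2[4] )  // interleaved words 0..3 of the
                                             // second 80-byte-header block
{
   __m256i midstate[16];   // 16 state words x 8 lanes, filled by the prehash
   __m256i data[16];       // 16 message words x 8 lanes
   __m256i hash[8];        // one 256-bit hash per lane

   // Words 0..3 carry real message data (the per-lane nonces live in word 3);
   // words 4, 5, 13 and 15 are written by the prehash itself, the rest are
   // zero padding.
   memset( data, 0, sizeof data );
   memcpy( data, block2, 4 * sizeof(__m256i) );

   blake256_8way_round0_prehash_le( midstate, midhash, data );  // once per work unit

   for ( uint32_t n = first_nonce; n < last_nonce; n += 8 )
   {
      uint32_t *nonces = (uint32_t*)&data[3];      // one nonce per lane
      for ( int lane = 0; lane < 8; lane++ )
         nonces[lane] = n + lane;
      blake256_8way_final_rounds_le( hash, midstate, midhash, data );
      // ... compare each lane of hash against the share target ...
   }
}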

View File

@@ -668,6 +668,258 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
}
// Short cut message expansion when the message data is known to be zero.
// M[ 5:12, 14 ] are zero padded for the second block of 80 byte data.
#define G256_8WAY_ALT( a, b, c, d, m0, m1 ) \
{ \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m0 ); \
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m1 ); \
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
}
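
For reference, the shortcut mirrors the scalar BLAKE-256 G function: each message word enters only as (message word XOR constant), so when a word is known to be zero the caller can pass the bare constant and the XOR disappears entirely. A minimal scalar sketch follows (illustration only, not code from this diff); the vector macro above performs the same four steps on eight lanes at once, with the rotate-by-16 and rotate-by-8 handled by the swap/shuffle helpers.

#include <stdint.h>

static inline uint32_t rotr32( uint32_t x, int r )
{   return ( x >> r ) | ( x << ( 32 - r ) );   }

// Standard BLAKE-256 G: mx, my are the two sigma-selected message words,
// cx, cy the constants selected by the same two sigma indices.
static inline void g256( uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                         uint32_t mx, uint32_t my, uint32_t cx, uint32_t cy )
{
   *a += *b + ( mx ^ cy );   *d = rotr32( *d ^ *a, 16 );
   *c += *d;                 *b = rotr32( *b ^ *c, 12 );
   *a += *b + ( my ^ cx );   *d = rotr32( *d ^ *a,  8 );
   *c += *d;                 *b = rotr32( *b ^ *c,  7 );
}

// When mx (or my) is a zero padding word, mx ^ cy is just cy, so the vector
// macro takes the two already-combined operands (m0, m1) and skips the XOR
// for any operand that reduces to a plain constant.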
// Message expansion optimized for each round.
#define ROUND256_8WAY_0 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS1 ) ), \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS0 ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS3 ) ), \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS2 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS5 ) ), \
_mm256_set1_epi32( CS4 ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CS7 ) , \
_mm256_set1_epi32( CS6 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS9 ) , \
_mm256_set1_epi32( CS8 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSB ) , \
_mm256_set1_epi32( CSA ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CSD ) , \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CSF ) , \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSE ) ) ); \
}
#define ROUND256_8WAY_1 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CSA ) , \
_mm256_set1_epi32( CSE ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS8 ) ), \
_mm256_set1_epi32( CS4 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CSF ) , \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS9 ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS6 ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CS1 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS2 ) ), \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS0 ) ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS7 ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS5 ) ) ); \
}
#define ROUND256_8WAY_2 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS8 ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS0 ) , \
_mm256_xor_si256( M0, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS5 ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSD ) ), \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSF ) ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CSA ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS6 ) ), \
_mm256_set1_epi32( CS3 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS1 ) , \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS7 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS4 ) , \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS9 ) ) ); \
}
#define ROUND256_8WAY_3 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS9 ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS1 ) ), \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS3 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS6 ) ), \
_mm256_set1_epi32( CS2 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSA ) , \
_mm256_set1_epi32( CS5 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS0 ) ), \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS4 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS8 ) ), \
_mm256_set1_epi32( CSF ) ); \
}
#define ROUND256_8WAY_4 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS0 ) , \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS9 ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS7 ) , \
_mm256_set1_epi32( CS5 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS4 ) ), \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS2 ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CSF ) , \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSA ) ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS1 ) , \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSE ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSC ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS8 ) , \
_mm256_set1_epi32( CS6 ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CSD ) ), \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS3 ) ) ); \
}
#define ROUND256_8WAY_5 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CS2 ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CSA ) , \
_mm256_set1_epi32( CS6 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CSB ) ), \
_mm256_set1_epi32( CS0 ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS8 ) ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CSD ) ), \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS4 ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CS5 ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSE ) ), \
_mm256_set1_epi32( CSF ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS9 ) ), \
_mm256_set1_epi32( CS1 ) ); \
}
#define ROUND256_8WAY_6 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS5 ) , \
_mm256_set1_epi32( CSC ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSF ) ), \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS1 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CSD ) , \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSE ) ) );\
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CSA ) ), \
_mm256_set1_epi32( CS4 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS7 ) ), \
_mm256_set1_epi32( CS0 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS6 ) ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS9 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CSB ) , \
_mm256_set1_epi32( CS8 ) ); \
}
#define ROUND256_8WAY_7 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSB ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS1 ) , \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS9 ) ), \
_mm256_set1_epi32( CS3 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS0 ) , \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS5 ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS4 ) ), \
_mm256_xor_si256( M4, _mm256_set1_epi32( CSF ) ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS6 ) , \
_mm256_set1_epi32( CS8 ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSA ) ), \
_mm256_set1_epi32( CS2 ) ); \
}
#define ROUND256_8WAY_8 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CSF ), \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS6 ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS9 ) , \
_mm256_set1_epi32( CSE ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CSB ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS8 ) ), \
_mm256_set1_epi32( CS0 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS7 ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS4 ) ), \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS1 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS5 ) , \
_mm256_set1_epi32( CSA ) ); \
}
#define ROUND256_8WAY_9 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSA ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS4 ) , \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS8 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS6 ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS5 ) ), \
_mm256_set1_epi32( CS1 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSB ) ), \
_mm256_set1_epi32( CSF ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CS9 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CS3 ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS0 ) ), \
_mm256_xor_si256( M0, _mm256_set1_epi32( CSD ) ) ); \
}
#define DECL_STATE32_8WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
sph_u32 T0, T1;
@@ -834,9 +1086,9 @@ do { \
}
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
void *data )
{
const __m256i *M = (const __m256i*)data;
__m256i *M = (__m256i*)data;
__m256i *V = (__m256i*)midstate;
const __m256i *H = (const __m256i*)midhash;
@@ -857,6 +1109,17 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
V[14] = m256_const1_32( CS6 );
V[15] = m256_const1_32( CS7 );
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
// M[ 5:12, 14 ] are always zero and not needed or used.
// M[ 4], M[ 13], M[15] are constant and are initialized here.
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
M[ 4] = m256_const1_32( 0x80000000 );
M[13] = m256_one_32;
M[15] = m256_const1_32( 80*8 );
M[ 5] =_mm256_xor_si256( M[13], _mm256_set1_epi32( CSC ) );
// G0
GS_8WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
@@ -868,21 +1131,45 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G2
// GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
_mm256_xor_si256( _mm256_set1_epi32( CS5 ), M[ 4] ) );
V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 2] ) );
V[10] = _mm256_add_epi32( V[10], V[14] );
V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 12 );
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
_mm256_set1_epi32( CS4 ) );
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 2] ), 8 );
V[10] = _mm256_add_epi32( V[10], V[14] );
V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 7 );
// G3
// GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
_mm256_set1_epi32( CS7 ) );
V[15] = mm256_swap32_16( _mm256_xor_si256( V[15], V[ 3] ) );
V[11] = _mm256_add_epi32( V[11], V[15] );
V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 12 );
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
_mm256_set1_epi32( CS6 ) );
V[15] = mm256_ror_32( _mm256_xor_si256( V[15], V[ 3] ), 8 );
V[11] = _mm256_add_epi32( V[11], V[15] );
V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 7 );
// G4
V[ 0] = _mm256_add_epi32( V[ 0],
_mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) );
V[ 0] = _mm256_add_epi32( V[ 0], _mm256_set1_epi32( CS9 ) );
// G5
// GS_8WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
// G6
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 7] ),
_mm256_xor_si256( _mm256_set1_epi32( CSD ), M[12] ) );
_mm256_set1_epi32( CSD ) );
// G7
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
_mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
_mm256_set1_epi32( CSF ) );
V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) );
V[ 3] = _mm256_add_epi32( V[ 3],
_mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
@@ -893,47 +1180,40 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
{
__m256i *H = (__m256i*)final_hash;
const __m256i *h = (const __m256i*)midhash;
const __m256i *v= (const __m256i*)midstate;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
__m256i M0, M1, M2, M3, M4, MD, MF;
__m256i MDxorCSC;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
V0 = _mm256_load_si256( (__m256i*)midstate + 0 );
V1 = _mm256_load_si256( (__m256i*)midstate + 1 );
V2 = _mm256_load_si256( (__m256i*)midstate + 2 );
V3 = _mm256_load_si256( (__m256i*)midstate + 3 );
V4 = _mm256_load_si256( (__m256i*)midstate + 4 );
V5 = _mm256_load_si256( (__m256i*)midstate + 5 );
V6 = _mm256_load_si256( (__m256i*)midstate + 6 );
V7 = _mm256_load_si256( (__m256i*)midstate + 7 );
V8 = _mm256_load_si256( (__m256i*)midstate + 8 );
V9 = _mm256_load_si256( (__m256i*)midstate + 9 );
VA = _mm256_load_si256( (__m256i*)midstate + 10 );
VB = _mm256_load_si256( (__m256i*)midstate + 11 );
VC = _mm256_load_si256( (__m256i*)midstate + 12 );
VD = _mm256_load_si256( (__m256i*)midstate + 13 );
VE = _mm256_load_si256( (__m256i*)midstate + 14 );
VF = _mm256_load_si256( (__m256i*)midstate + 15 );
M0 = casti_m256i( data, 0 );
M1 = casti_m256i( data, 1 );
M2 = casti_m256i( data, 2 );
M3 = casti_m256i( data, 3 );
M4 = casti_m256i( data, 4 );
M5 = casti_m256i( data, 5 );
M6 = casti_m256i( data, 6 );
M7 = casti_m256i( data, 7 );
M8 = casti_m256i( data, 8 );
M9 = casti_m256i( data, 9 );
MA = casti_m256i( data, 10 );
MB = casti_m256i( data, 11 );
MC = casti_m256i( data, 12 );
MD = casti_m256i( data, 13 );
ME = casti_m256i( data, 14 );
MF = casti_m256i( data, 15 );
// Finish round 0
M0 = _mm256_load_si256( (__m256i*)data + 0 );
M1 = _mm256_load_si256( (__m256i*)data + 1 );
M2 = _mm256_load_si256( (__m256i*)data + 2 );
M3 = _mm256_load_si256( (__m256i*)data + 3 );
M4 = _mm256_load_si256( (__m256i*)data + 4 );
// M5 to MC & ME zero padding & optimised out.
MD = _mm256_load_si256( (__m256i*)data + 13 );
MF = _mm256_load_si256( (__m256i*)data + 15 );
// precalculated MD^CSC, used in round0 G6.
MDxorCSC = _mm256_load_si256( (__m256i*)data + 5 );
// Finish round 0 with nonce in M3
// G1
V1 = _mm256_add_epi32( V1,
_mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
@@ -947,20 +1227,29 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
_mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
_mm256_set1_epi32( CS8 ) ) );
VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
// G5
GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
_mm256_set1_epi32( CSB ) );
VC = mm256_swap32_16( _mm256_xor_si256( VC, V1 ) );
VB = _mm256_add_epi32( VB, VC );
V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 12 );
V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
_mm256_set1_epi32( CSA ) );
VC = mm256_ror_32( _mm256_xor_si256( VC, V1 ), 8 );
VB = _mm256_add_epi32( VB, VC );
V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 7 );
// G6
VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
_mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
V2 = _mm256_add_epi32( V2, _mm256_add_epi32( V7, MDxorCSC ) );
VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
@@ -974,19 +1263,19 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
ROUND_S_8WAY( 4 );
ROUND_S_8WAY( 5 );
ROUND_S_8WAY( 6 );
ROUND_S_8WAY( 7 );
ROUND_S_8WAY( 8 );
ROUND_S_8WAY( 9 );
ROUND_S_8WAY( 0 );
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
ROUND256_8WAY_1;
ROUND256_8WAY_2;
ROUND256_8WAY_3;
ROUND256_8WAY_4;
ROUND256_8WAY_5;
ROUND256_8WAY_6;
ROUND256_8WAY_7;
ROUND256_8WAY_8;
ROUND256_8WAY_9;
ROUND256_8WAY_0;
ROUND256_8WAY_1;
ROUND256_8WAY_2;
ROUND256_8WAY_3;
const __m256i shuf_bswap32 =
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
@@ -1010,6 +1299,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
//
// Blake-256 16 way AVX512
// Generic with full inline message expansion
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
@@ -1036,6 +1326,257 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
}
// Short cut message expansion when the message data is known to be zero.
// M[ 5:12, 14 ] are zero padded for the second block of 80 byte data.
#define G256_16WAY_ALT( a, b, c, d, m0, m1 ) \
{ \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m0 ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi32( c, d ); \
b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m1 ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
c = _mm512_add_epi32( c, d ); \
b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
}
// Message expansion optimized for each round.
#define ROUND256_16WAY_0 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS1 ) ), \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS0 ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS3 ) ), \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS2 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS5 ) ), \
_mm512_set1_epi32( CS4 ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CS7 ) , \
_mm512_set1_epi32( CS6 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS9 ) , \
_mm512_set1_epi32( CS8 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSB ) , \
_mm512_set1_epi32( CSA ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CSD ) , \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CSF ) , \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSE ) ) ); \
}
#define ROUND256_16WAY_1 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CSA ) , \
_mm512_set1_epi32( CSE ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS8 ) ), \
_mm512_set1_epi32( CS4 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CSF ) , \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS9 ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS6 ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CS1 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS2 ) ), \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS0 ) ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS7 ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS5 ) ) ); \
}
#define ROUND256_16WAY_2 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS8 ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS0 ) , \
_mm512_xor_si512( M0, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS5 ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSD ) ), \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSF ) ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CSA ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS6 ) ), \
_mm512_set1_epi32( CS3 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS1 ) , \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS7 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS4 ) , \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS9 ) ) ); \
}
#define ROUND256_16WAY_3 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS9 ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS1 ) ), \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS3 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS6 ) ), \
_mm512_set1_epi32( CS2 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSA ) , \
_mm512_set1_epi32( CS5 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS0 ) ), \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS4 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS8 ) ), \
_mm512_set1_epi32( CSF ) ); \
}
#define ROUND256_16WAY_4 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS0 ) , \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS9 ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS7 ) , \
_mm512_set1_epi32( CS5 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS4 ) ), \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS2 ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CSF ) , \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSA ) ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS1 ) , \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSE ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSC ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS8 ) , \
_mm512_set1_epi32( CS6 ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CSD ) ), \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS3 ) ) ); \
}
#define ROUND256_16WAY_5 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CS2 ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CSA ) , \
_mm512_set1_epi32( CS6 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CSB ) ), \
_mm512_set1_epi32( CS0 ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS8 ) ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CSD ) ), \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS4 ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CS5 ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSE ) ), \
_mm512_set1_epi32( CSF ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS9 ) ), \
_mm512_set1_epi32( CS1 ) ); \
}
#define ROUND256_16WAY_6 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS5 ) , \
_mm512_set1_epi32( CSC ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSF ) ), \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS1 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CSD ) , \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSE ) ) );\
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CSA ) ), \
_mm512_set1_epi32( CS4 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS7 ) ), \
_mm512_set1_epi32( CS0 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS6 ) ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS9 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CSB ) , \
_mm512_set1_epi32( CS8 ) ); \
}
#define ROUND256_16WAY_7 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSB ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS1 ) , \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS9 ) ), \
_mm512_set1_epi32( CS3 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS0 ) , \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS5 ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS4 ) ), \
_mm512_xor_si512( M4, _mm512_set1_epi32( CSF ) ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS6 ) , \
_mm512_set1_epi32( CS8 ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSA ) ), \
_mm512_set1_epi32( CS2 ) ); \
}
#define ROUND256_16WAY_8 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CSF ), \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS6 ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS9 ) , \
_mm512_set1_epi32( CSE ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CSB ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS8 ) ), \
_mm512_set1_epi32( CS0 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS7 ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS4 ) ), \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS1 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS5 ) , \
_mm512_set1_epi32( CSA ) ); \
}
#define ROUND256_16WAY_9 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSA ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS4 ) , \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS8 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS6 ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS5 ) ), \
_mm512_set1_epi32( CS1 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSB ) ), \
_mm512_set1_epi32( CSF ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CS9 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CS3 ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS0 ) ), \
_mm512_xor_si512( M0, _mm512_set1_epi32( CSD ) ) ); \
}
#define DECL_STATE32_16WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
sph_u32 T0, T1;
@@ -1208,9 +1749,9 @@ do { \
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
void *data )
{
const __m512i *M = (const __m512i*)data;
__m512i *M = (__m512i*)data;
__m512i *V = (__m512i*)midstate;
const __m512i *H = (const __m512i*)midhash;
@@ -1231,10 +1772,21 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
V[14] = m512_const1_32( CS6 );
V[15] = m512_const1_32( CS7 );
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
// M[ 5:12, 14 ] are always zero and not needed or used, except M[5] as noted.
// M[ 4], M[ 13], M[15] are constant and are initialized here.
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
M[ 4] = m512_const1_32( 0x80000000 );
M[13] = m512_one_32;
M[15] = m512_const1_32( 80*8 );
M[ 5] =_mm512_xor_si512( M[13], _mm512_set1_epi32( CSC ) );
// G0
GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
// G1, nonce is in M[3]
// G1
// GS_16WAY( M[ 2], M[ 3], CS2, CS3, V1, V5, V9, VD );
V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ),
_mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) );
@@ -1243,14 +1795,35 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G2
// GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 6] ),
_mm512_xor_si512( _mm512_set1_epi32( CS5 ), M[ 4] ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 2] ), 16 );
V[10] = _mm512_add_epi32( V[10], V[14] );
V[ 6] = mm512_ror_32( _mm512_xor_si512( V[ 6], V[10] ), 12 );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 6] ),
_mm512_set1_epi32( CS4 ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 2] ), 8 );
V[10] = _mm512_add_epi32( V[10], V[14] ); \
V[ 6] = mm512_ror_32( _mm512_xor_si512( V[ 6], V[10] ), 7 );
// G3
// GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 7] ),
_mm512_set1_epi32( CS7 ) );
V[15] = mm512_ror_32( _mm512_xor_si512( V[15], V[ 3] ), 16 );
V[11] = _mm512_add_epi32( V[11], V[15] );
V[ 7] = mm512_ror_32( _mm512_xor_si512( V[ 7], V[11] ), 12 );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 7] ),
_mm512_set1_epi32( CS6 ) );
V[15] = mm512_ror_32( _mm512_xor_si512( V[15], V[ 3] ), 8 );
V[11] = _mm512_add_epi32( V[11], V[15] ); \
V[ 7] = mm512_ror_32( _mm512_xor_si512( V[ 7], V[11] ), 7 );
// G4
// GS_16WAY( M[ 8], M[ 9], CS8, CS9, V0, V5, VA, VF );
V[ 0] = _mm512_add_epi32( V[ 0],
_mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) );
V[ 0] = _mm512_add_epi32( V[ 0], _mm512_set1_epi32( CS9 ) );
// G5
// GS_16WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
@@ -1258,11 +1831,11 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
// G6
// GS_16WAY( M[12], M[13], CSC, CSD, V2, V7, V8, VD );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 7] ),
_mm512_xor_si512( _mm512_set1_epi32( CSD ), M[12] ) );
_mm512_set1_epi32( CSD ) );
// G7
// GS_16WAY( M[14], M[15], CSE, CSF, V3, V4, V9, VE );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 4] ),
_mm512_xor_si512( _mm512_set1_epi32( CSF ), M[14] ) );
_mm512_set1_epi32( CSF ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 3] ), 16 );
V[ 3] = _mm512_add_epi32( V[ 3],
_mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
@@ -1273,45 +1846,38 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
{
__m512i *H = (__m512i*)final_hash;
const __m512i *h = (const __m512i*)midhash;
const __m512i *v= (const __m512i*)midstate;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i M0, M1, M2, M3, M4, MD, MF;
__m512i MDxorCSC;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
V0 = _mm512_load_si512( (__m512i*)midstate + 0 );
V1 = _mm512_load_si512( (__m512i*)midstate + 1 );
V2 = _mm512_load_si512( (__m512i*)midstate + 2 );
V3 = _mm512_load_si512( (__m512i*)midstate + 3 );
V4 = _mm512_load_si512( (__m512i*)midstate + 4 );
V5 = _mm512_load_si512( (__m512i*)midstate + 5 );
V6 = _mm512_load_si512( (__m512i*)midstate + 6 );
V7 = _mm512_load_si512( (__m512i*)midstate + 7 );
V8 = _mm512_load_si512( (__m512i*)midstate + 8 );
V9 = _mm512_load_si512( (__m512i*)midstate + 9 );
VA = _mm512_load_si512( (__m512i*)midstate + 10 );
VB = _mm512_load_si512( (__m512i*)midstate + 11 );
VC = _mm512_load_si512( (__m512i*)midstate + 12 );
VD = _mm512_load_si512( (__m512i*)midstate + 13 );
VE = _mm512_load_si512( (__m512i*)midstate + 14 );
VF = _mm512_load_si512( (__m512i*)midstate + 15 );
M0 = casti_m512i( data, 0 );
M1 = casti_m512i( data, 1 );
M2 = casti_m512i( data, 2 );
M3 = casti_m512i( data, 3 );
M4 = casti_m512i( data, 4 );
M5 = casti_m512i( data, 5 );
M6 = casti_m512i( data, 6 );
M7 = casti_m512i( data, 7 );
M8 = casti_m512i( data, 8 );
M9 = casti_m512i( data, 9 );
MA = casti_m512i( data, 10 );
MB = casti_m512i( data, 11 );
MC = casti_m512i( data, 12 );
MD = casti_m512i( data, 13 );
ME = casti_m512i( data, 14 );
MF = casti_m512i( data, 15 );
M0 = _mm512_load_si512( (__m512i*)data + 0 );
M1 = _mm512_load_si512( (__m512i*)data + 1 );
M2 = _mm512_load_si512( (__m512i*)data + 2 );
M3 = _mm512_load_si512( (__m512i*)data + 3 );
M4 = _mm512_load_si512( (__m512i*)data + 4 );
// M5 to MC & ME are zero padding and optimised out
MD = _mm512_load_si512( (__m512i*)data + 13 );
MF = _mm512_load_si512( (__m512i*)data + 15 );
// cache for precalculated MD^CSC, used in round0 G6.
MDxorCSC = _mm512_load_si512( (__m512i*)data + 5 );
// Finish round 0 with the nonce (M3) now available
// G0
@@ -1336,21 +1902,30 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 12 );
V0 = _mm512_add_epi32( V0, _mm512_add_epi32( V5,
_mm512_xor_si512( _mm512_set1_epi32( CS8 ), M9 ) ) );
_mm512_set1_epi32( CS8 ) ) );
VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 8 );
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 );
// G5
GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
V1 = _mm512_add_epi32( _mm512_add_epi32( V1, V6 ),
_mm512_set1_epi32( CSB ) );
VC = mm512_ror_32( _mm512_xor_si512( VC, V1 ), 16 );
VB = _mm512_add_epi32( VB, VC );
V6 = mm512_ror_32( _mm512_xor_si512( V6, VB ), 12 );
V1 = _mm512_add_epi32( _mm512_add_epi32( V1, V6 ),
_mm512_set1_epi32( CSA ) );
VC = mm512_ror_32( _mm512_xor_si512( VC, V1 ), 8 );
VB = _mm512_add_epi32( VB, VC );
V6 = mm512_ror_32( _mm512_xor_si512( V6, VB ), 7 );
// G6
// GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 16 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 12 );
V2 = _mm512_add_epi32( _mm512_add_epi32( V2, V7 ),
_mm512_xor_si512( _mm512_set1_epi32( CSC ), MD ) );
V2 = _mm512_add_epi32( V2, _mm512_add_epi32( V7, MDxorCSC ) );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 8 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 7 );
@@ -1364,20 +1939,20 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
V9 = _mm512_add_epi32( V9, VE );
V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
ROUND_S_16WAY( 4 );
ROUND_S_16WAY( 5 );
ROUND_S_16WAY( 6 );
ROUND_S_16WAY( 7 );
ROUND_S_16WAY( 8 );
ROUND_S_16WAY( 9 );
ROUND_S_16WAY( 0 );
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
// Remaining rounds, optimised
ROUND256_16WAY_1;
ROUND256_16WAY_2;
ROUND256_16WAY_3;
ROUND256_16WAY_4;
ROUND256_16WAY_5;
ROUND256_16WAY_6;
ROUND256_16WAY_7;
ROUND256_16WAY_8;
ROUND256_16WAY_9;
ROUND256_16WAY_0;
ROUND256_16WAY_1;
ROUND256_16WAY_2;
ROUND256_16WAY_3;
// Byte swap final hash
const __m512i shuf_bswap32 =

View File

@@ -1,6 +1,6 @@
#include "pentablake-gate.h"
#if defined (__AVX2__)
#if defined(PENTABLAKE_4WAY)
#include <stdlib.h>
#include <stdint.h>

View File

@@ -4,9 +4,10 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#define PENTABLAKE_4WAY
#endif
// 4way is broken
//#if defined(__AVX2__)
// #define PENTABLAKE_4WAY
//#endif
#if defined(PENTABLAKE_4WAY)
void pentablakehash_4way( void *state, const void *input );

View File

@@ -103,16 +103,16 @@
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_shufl2r_64( V[2], V[3] ); \
V3 = mm128_shufl2r_64( V[3], V[2] ); \
V6 = mm128_shufl2l_64( V[6], V[7] ); \
V7 = mm128_shufl2l_64( V[7], V[6] ); \
V2 = mm128_alignr_64( V[3], V[2] ); \
V3 = mm128_alignr_64( V[2], V[3] ); \
V6 = mm128_alignr_64( V[6], V[7] ); \
V7 = mm128_alignr_64( V[7], V[6] ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_shufl2l_64( V2, V3 ); \
V[3] = mm128_shufl2l_64( V3, V2 ); \
V[6] = mm128_shufl2r_64( V6, V7 ); \
V[7] = mm128_shufl2r_64( V7, V6 ); \
V[2] = mm128_alignr_64( V2, V3 ); \
V[3] = mm128_alignr_64( V3, V2 ); \
V[6] = mm128_alignr_64( V7, V6 ); \
V[7] = mm128_alignr_64( V6, V7 ); \
}
#else

View File

@@ -24,9 +24,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -46,9 +43,6 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
{
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
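
The checks deleted here (and in the groestl256 and 2-way/4-way variants below) compared in-struct arrays against NULL, a comparison that is always false and that newer compilers flag; the v3.21.2 note above about fixing gcc-12 warnings is consistent with this cleanup. A minimal illustration with an assumed struct layout (the real hashState_groestl definition is not shown in this diff):

#include <emmintrin.h>
#include <stddef.h>

#define SIZE512 16                      /* placeholder value for illustration */

typedef struct
{
   __m128i chaining[SIZE512];           /* array members, not pointers */
   __m128i buffer[SIZE512];
} demo_state;

int demo_init( demo_state *ctx )
{
   /* ctx->chaining decays to &ctx->chaining[0], which can never be NULL, so
      this branch is unreachable; gcc-12 warns about it with -Waddress.      */
   if ( ctx->chaining == NULL || ctx->buffer == NULL )
      return 1;
   for ( int i = 0; i < SIZE512; i++ )
      ctx->chaining[i] = _mm_setzero_si128();
   return 0;
}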

View File

@@ -22,9 +22,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -43,9 +40,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
{
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -54,8 +48,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
// INIT256(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;

View File

@@ -26,9 +26,6 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = m512_zero;
@@ -54,8 +51,8 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
__m512i* in = (__m512i*)input;
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
// if (ctx->chaining == NULL || ctx->buffer == NULL)
// return 1;
for ( i = 0; i < SIZE256; i++ )
{
@@ -179,8 +176,8 @@ int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
// if (ctx->chaining == NULL || ctx->buffer == NULL)
// return 1;
for ( i = 0; i < SIZE256; i++ )
{
@@ -207,9 +204,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
__m256i* in = (__m256i*)input;
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = m256_zero;

View File

@@ -21,9 +21,6 @@
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
memset_zero_512( ctx->chaining, SIZE512 );
memset_zero_512( ctx->buffer, SIZE512 );
@@ -142,9 +139,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
{
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
memset_zero_256( ctx->chaining, SIZE512 );
memset_zero_256( ctx->buffer, SIZE512 );

View File

@@ -585,9 +585,8 @@ do { \
t = _mm512_xor_si512( t, c ); \
d = mm512_xoror( a, b, t ); \
t = mm512_xorand( t, a, b ); \
b = mm512_xor3( b, d, t ); \
a = c; \
c = b; \
c = mm512_xor3( b, d, t ); \
b = d; \
d = mm512_not( t ); \
} while (0)
@@ -635,7 +634,7 @@ do { \
#define ROUND_BIG8( alpha ) \
do { \
__m512i t0, t1, t2, t3; \
__m512i t0, t1, t2, t3, t4, t5; \
s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
@@ -662,43 +661,35 @@ do { \
s5 = mm512_swap64_32( s5 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t1, s9, t3 ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t0, s9, t1 ); \
\
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
L8( s1, t1, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
L8( s1, t2, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t1, sB, t3 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t4, sB, t5 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
L8( s3, t1, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \
L8( s3, t2, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
sC = _mm512_mask_blend_epi32( 0x5555, t5, t3 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 ); \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\
@@ -924,10 +915,9 @@ do { \
d = _mm256_xor_si256( d, a ); \
a = _mm256_and_si256( a, b ); \
t = _mm256_xor_si256( t, a ); \
b = _mm256_xor_si256( b, d ); \
b = _mm256_xor_si256( b, t ); \
a = c; \
c = b; \
c = _mm256_xor_si256( b, d ); \
c = _mm256_xor_si256( c, t ); \
b = d; \
d = mm256_not( t ); \
} while (0)
@@ -977,7 +967,7 @@ do { \
#define ROUND_BIG( alpha ) \
do { \
__m256i t0, t1, t2, t3; \
__m256i t0, t1, t2, t3, t4, t5; \
s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
@@ -1004,43 +994,35 @@ do { \
s5 = mm256_swap64_32( s5 ); \
sD = mm256_swap64_32( sD ); \
sE = mm256_swap64_32( sE ); \
t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \
t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \
L( s0, t1, s9, t3 ); \
s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \
s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \
sD = _mm256_blend_epi32( sD, t3, 0x55 ); \
sE = _mm256_blend_epi32( sE, t3, 0xaa ); \
t0 = _mm256_blend_epi32( s4, s5, 0xaa ); \
t1 = _mm256_blend_epi32( sD, sE, 0xaa ); \
L( s0, t0, s9, t1 ); \
\
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \
t2 = _mm256_blend_epi32( s5, s6, 0xaa ); \
t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
L( s1, t1, sA, t3 ); \
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \
sE = _mm256_blend_epi32( sE, t3, 0x55 ); \
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
L( s1, t2, sA, t3 ); \
s5 = _mm256_blend_epi32( t0, t2, 0x55 ); \
sE = _mm256_blend_epi32( t1, t3, 0x55 ); \
\
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \
t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \
L( s2, t1, sB, t3 ); \
s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \
s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \
sF = _mm256_blend_epi32( sF, t3, 0x55 ); \
sC = _mm256_blend_epi32( sC, t3, 0xaa ); \
t4 = _mm256_blend_epi32( s6, s7, 0xaa ); \
t5 = _mm256_blend_epi32( sF, sC, 0xaa ); \
L( s2, t4, sB, t5 ); \
s6 = _mm256_blend_epi32( t2, t4, 0x55 ); \
sF = _mm256_blend_epi32( t3, t5, 0x55 ); \
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
\
t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \
t2 = _mm256_blend_epi32( s7, s4, 0xaa ); \
t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
L( s3, t1, s8, t3 ); \
s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \
s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \
sC = _mm256_blend_epi32( sC, t3, 0x55 ); \
sD = _mm256_blend_epi32( sD, t3, 0xaa ); \
L( s3, t2, s8, t3 ); \
s7 = _mm256_blend_epi32( t4, t2, 0x55 ); \
s4 = _mm256_blend_epi32( t0, t2, 0xaa ); \
sC = _mm256_blend_epi32( t5, t3, 0x55 ); \
sD = _mm256_blend_epi32( t1, t3, 0xaa ); \
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
\

View File

@@ -141,6 +141,13 @@ do { \
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
} while (0)
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), w ); \
} while (0)
/*
* PASSy(n, in) computes pass number "y", for a total of "n", using the
* one-argument macro "in" to access input words. Current state is assumed
@@ -152,22 +159,22 @@ do { \
#define PASS1(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
STEP1(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0) ); \
STEP1(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1) ); \
STEP1(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2) ); \
STEP1(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3) ); \
STEP1(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4) ); \
STEP1(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5) ); \
STEP1(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6) ); \
STEP1(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7) ); \
} \
} while (0)
@@ -605,25 +612,32 @@ do { \
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
} while (0)
#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
mm256_ror_32( x7, 11 ) ), w ); \
} while (0)
#define PASS1_8W(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
STEP1_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0) ); \
STEP1_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1) ); \
STEP1_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2) ); \
STEP1_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3) ); \
STEP1_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4) ); \
STEP1_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5) ); \
STEP1_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6) ); \
STEP1_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7) ); \
} \
} while (0)
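The only difference between STEP and STEP1 is the additive constant: HAVAL's pass-1 constants are all zero (the SPH_C32(0x00000000) arguments in the old PASS1 code), so STEP1 simply drops the redundant add. A minimal scalar sketch of the step, with f() standing for the pass-dependent boolean function FPn_p:

/* Hedged scalar sketch of the HAVAL step performed by STEP/STEP1 above.
 * In pass 1 the additive constant c is zero, so STEP1 omits the "+ c" term. */
#include <stdint.h>

static inline uint32_t ror32( uint32_t x, int n )
{
    return ( x >> n ) | ( x << ( 32 - n ) );
}

static inline uint32_t haval_step( uint32_t t, uint32_t x7,
                                   uint32_t w, uint32_t c )
{
    /* t = f(x6,...,x0) computed by the caller */
    return ror32( t, 7 ) + ror32( x7, 11 ) + w + c;   /* c == 0 in pass 1 */
}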

View File

@@ -49,12 +49,11 @@ extern "C"{
#define Sb_8W(x0, x1, x2, x3, c) \
do { \
__m512i cc = _mm512_set1_epi64( c ); \
x3 = mm512_not( x3 ); \
const __m512i cc = _mm512_set1_epi64( c ); \
x0 = mm512_xorandnot( x0, x2, cc ); \
tmp = mm512_xorand( cc, x0, x1 ); \
x0 = mm512_xorand( x0, x2, x3 ); \
x3 = mm512_xorandnot( x3, x1, x2 ); \
x0 = mm512_xorandnot( x0, x3, x2 ); \
x3 = _mm512_ternarylogic_epi64( x3, x1, x2, 0x2d ); /* ~x3 ^ (~x1 & x2) */\
x1 = mm512_xorand( x1, x0, x2 ); \
x2 = mm512_xorandnot( x2, x3, x0 ); \
x0 = mm512_xoror( x0, x1, x3 ); \
@@ -79,7 +78,7 @@ do { \
#define Sb(x0, x1, x2, x3, c) \
do { \
__m256i cc = _mm256_set1_epi64x( c ); \
const __m256i cc = _mm256_set1_epi64x( c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
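The 0x2d immediate in the _mm512_ternarylogic_epi64 call above is the 8-bit truth table of ~x3 ^ (~x1 & x2): it can be checked by evaluating the expression bitwise on the canonical operand patterns 0xF0, 0xCC, 0xAA for the first, second and third operands. A small stand-alone check:

/* Check of the ternary-logic immediate used above: the 8-bit immediate is
 * the truth table of the boolean function evaluated bitwise on the
 * canonical patterns a = 0xF0, b = 0xCC, c = 0xAA (operands x3, x1, x2). */
#include <stdint.h>
#include <stdio.h>

int main( void )
{
    uint8_t a = 0xF0, b = 0xCC, c = 0xAA;
    uint8_t imm = (uint8_t)( ~a ^ ( ~b & c ) );   /* ~x3 ^ (~x1 & x2) */
    printf( "imm = 0x%02x\n", imm );              /* prints 0x2d */
    return 0;
}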

View File

@@ -72,11 +72,11 @@ static const uint64_t RC[] = {
// Targeted macros, keccak-macros.h is included for each target.
#define DECL64(x) __m512i x
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR64 XOR
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define NOT64(d, s) (d = mm512_not( s ) )
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
@@ -257,14 +257,14 @@ keccak512_8way_close(void *cc, void *dst)
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
} while (0)
#define DECL64(x) __m256i x
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define DECL64(x) __m256i x
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = mm256_not( s ) )
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))

View File

@@ -23,13 +23,26 @@
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#if defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#else
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
@@ -60,13 +73,13 @@
t = _mm_load_si128(&a0);\
a0 = _mm_or_si128(a0,a1);\
a2 = _mm_xor_si128(a2,a3);\
a1 = _mm_andnot_si128(a1,ALLONE);\
a1 = mm128_not( a1 );\
a0 = _mm_xor_si128(a0,a3);\
a3 = _mm_and_si128(a3,t);\
a1 = _mm_xor_si128(a1,a3);\
a3 = _mm_xor_si128(a3,a2);\
a2 = _mm_and_si128(a2,a0);\
a0 = _mm_andnot_si128(a0,ALLONE);\
a0 = mm128_not( a0 );\
a2 = _mm_xor_si128(a2,a1);\
a1 = _mm_or_si128(a1,a3);\
t = _mm_xor_si128(t,a1);\
@@ -242,17 +255,18 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
__m128i CNS128[32];
__m128i ALLONE;
#if !defined(__SSE4_1__)
__m128i MASK;
#endif
HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
{
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
@@ -352,10 +366,10 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
// Optimized for input lengths that are multiples of 16 bytes, good for 64 and 80 byte len
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );

View File

@@ -230,25 +230,13 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces, add padding.
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
@@ -425,24 +413,12 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
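With this change only block_buf[3], the per-lane nonce word, differs between lanes; words 0..2 still come from the header tail, and the padding words that used to be written here (0x80000000 in word 4, 1 in word 13, the 80*8 bit length in word 15) are no longer stored, presumably because the prehash/final routines now supply them internally. A rough scalar sketch of the per-lane second-block layout; build_second_block is a hypothetical helper, not code from the tree:

/* Hedged sketch of the per-lane second Blake-256 block built above for an
 * 80-byte header: words 0..2 are the last header words, word 3 is the
 * per-lane nonce. The remaining words are left zero here on the assumption
 * that the prehash/final routines handle the fixed padding themselves. */
#include <stdint.h>
#include <string.h>

static void build_second_block( uint32_t block[16], const uint32_t *pdata,
                                uint32_t nonce )
{
    memset( block, 0, 16 * sizeof(uint32_t) );
    block[0] = pdata[16];
    block[1] = pdata[17];
    block[2] = pdata[18];
    block[3] = nonce;              /* only word that differs between lanes */
}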

View File

@@ -120,25 +120,13 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
@@ -240,24 +228,12 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

View File

@@ -3,7 +3,7 @@
#include "lyra2.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z330_wholeMatrix;
static __thread uint64_t* lyra2z330_wholeMatrix;
void lyra2z330_hash(void *state, const void *input, uint32_t height)
{

View File

@@ -830,7 +830,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N )
}
}
// Working, not up to date, needs stream optimization.
// Working, not up to date, needs stream, shuffle optimizations.
// 4x32 interleaving
static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
{
@@ -937,46 +937,28 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
// 4x memory usage
// Working
// 4x128 interleaving
static void salsa_shuffle_4way_simd128( __m512i *X )
static inline void salsa_shuffle_4way_simd128( __m512i *X )
{
__m512i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
Y0 = _mm512_mask_blend_epi32( 0x1111, X[1], X[0] );
Z0 = _mm512_mask_blend_epi32( 0x4444, X[3], X[2] );
Y1 = _mm512_mask_blend_epi32( 0x1111, X[2], X[1] );
Z1 = _mm512_mask_blend_epi32( 0x4444, X[0], X[3] );
Y2 = _mm512_mask_blend_epi32( 0x1111, X[3], X[2] );
Z2 = _mm512_mask_blend_epi32( 0x4444, X[1], X[0] );
Y3 = _mm512_mask_blend_epi32( 0x1111, X[0], X[3] );
Z3 = _mm512_mask_blend_epi32( 0x4444, X[2], X[1] );
X[0] = _mm512_mask_blend_epi32( 0x3333, Z0, Y0 );
X[1] = _mm512_mask_blend_epi32( 0x3333, Z1, Y1 );
X[2] = _mm512_mask_blend_epi32( 0x3333, Z2, Y2 );
X[3] = _mm512_mask_blend_epi32( 0x3333, Z3, Y3 );
__m512i t0 = _mm512_mask_blend_epi32( 0xaaaa, X[0], X[1] );
__m512i t1 = _mm512_mask_blend_epi32( 0x5555, X[0], X[1] );
__m512i t2 = _mm512_mask_blend_epi32( 0xaaaa, X[2], X[3] );
__m512i t3 = _mm512_mask_blend_epi32( 0x5555, X[2], X[3] );
X[0] = _mm512_mask_blend_epi32( 0xcccc, t0, t2 );
X[1] = _mm512_mask_blend_epi32( 0x6666, t1, t3 );
X[2] = _mm512_mask_blend_epi32( 0x3333, t0, t2 );
X[3] = _mm512_mask_blend_epi32( 0x9999, t1, t3 );
}
static void salsa_unshuffle_4way_simd128( __m512i *X )
static inline void salsa_unshuffle_4way_simd128( __m512i *X )
{
__m512i Y0, Y1, Y2, Y3;
Y0 = _mm512_mask_blend_epi32( 0x8888, X[0], X[1] );
Y1 = _mm512_mask_blend_epi32( 0x1111, X[0], X[1] );
Y2 = _mm512_mask_blend_epi32( 0x2222, X[0], X[1] );
Y3 = _mm512_mask_blend_epi32( 0x4444, X[0], X[1] );
Y0 = _mm512_mask_blend_epi32( 0x4444, Y0, X[2] );
Y1 = _mm512_mask_blend_epi32( 0x8888, Y1, X[2] );
Y2 = _mm512_mask_blend_epi32( 0x1111, Y2, X[2] );
Y3 = _mm512_mask_blend_epi32( 0x2222, Y3, X[2] );
X[0] = _mm512_mask_blend_epi32( 0x2222, Y0, X[3] );
X[1] = _mm512_mask_blend_epi32( 0x4444, Y1, X[3] );
X[2] = _mm512_mask_blend_epi32( 0x8888, Y2, X[3] );
X[3] = _mm512_mask_blend_epi32( 0x1111, Y3, X[3] );
__m512i t0 = _mm512_mask_blend_epi32( 0xcccc, X[0], X[2] );
__m512i t1 = _mm512_mask_blend_epi32( 0x3333, X[0], X[2] );
__m512i t2 = _mm512_mask_blend_epi32( 0x6666, X[1], X[3] );
__m512i t3 = _mm512_mask_blend_epi32( 0x9999, X[1], X[3] );
X[0] = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 );
X[1] = _mm512_mask_blend_epi32( 0x5555, t0, t2 );
X[2] = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 );
X[3] = _mm512_mask_blend_epi32( 0x5555, t1, t3 );
}
static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C)
@@ -1147,46 +1129,28 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N )
// { l1xb, l1xa, l1c9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16]
// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24]
static void salsa_shuffle_2way_simd128( __m256i *X )
static inline void salsa_shuffle_2way_simd128( __m256i *X )
{
__m256i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
Y0 = _mm256_blend_epi32( X[1], X[0], 0x11 );
Z0 = _mm256_blend_epi32( X[3], X[2], 0x44 );
Y1 = _mm256_blend_epi32( X[2], X[1], 0x11 );
Z1 = _mm256_blend_epi32( X[0], X[3], 0x44 );
Y2 = _mm256_blend_epi32( X[3], X[2], 0x11 );
Z2 = _mm256_blend_epi32( X[1], X[0], 0x44 );
Y3 = _mm256_blend_epi32( X[0], X[3], 0x11 );
Z3 = _mm256_blend_epi32( X[2], X[1], 0x44 );
X[0] = _mm256_blend_epi32( Z0, Y0, 0x33 );
X[1] = _mm256_blend_epi32( Z1, Y1, 0x33 );
X[2] = _mm256_blend_epi32( Z2, Y2, 0x33 );
X[3] = _mm256_blend_epi32( Z3, Y3, 0x33 );
__m256i t0 = _mm256_blend_epi32( X[0], X[1], 0xaa );
__m256i t1 = _mm256_blend_epi32( X[0], X[1], 0x55 );
__m256i t2 = _mm256_blend_epi32( X[2], X[3], 0xaa );
__m256i t3 = _mm256_blend_epi32( X[2], X[3], 0x55 );
X[0] = _mm256_blend_epi32( t0, t2, 0xcc );
X[1] = _mm256_blend_epi32( t1, t3, 0x66 );
X[2] = _mm256_blend_epi32( t0, t2, 0x33 );
X[3] = _mm256_blend_epi32( t1, t3, 0x99 );
}
static void salsa_unshuffle_2way_simd128( __m256i *X )
static inline void salsa_unshuffle_2way_simd128( __m256i *X )
{
__m256i Y0, Y1, Y2, Y3;
Y0 = _mm256_blend_epi32( X[0], X[1], 0x88 );
Y1 = _mm256_blend_epi32( X[0], X[1], 0x11 );
Y2 = _mm256_blend_epi32( X[0], X[1], 0x22 );
Y3 = _mm256_blend_epi32( X[0], X[1], 0x44 );
Y0 = _mm256_blend_epi32( Y0, X[2], 0x44 );
Y1 = _mm256_blend_epi32( Y1, X[2], 0x88 );
Y2 = _mm256_blend_epi32( Y2, X[2], 0x11 );
Y3 = _mm256_blend_epi32( Y3, X[2], 0x22 );
X[0] = _mm256_blend_epi32( Y0, X[3], 0x22 );
X[1] = _mm256_blend_epi32( Y1, X[3], 0x44 );
X[2] = _mm256_blend_epi32( Y2, X[3], 0x88 );
X[3] = _mm256_blend_epi32( Y3, X[3], 0x11 );
__m256i t0 = _mm256_blend_epi32( X[0], X[2], 0xcc );
__m256i t1 = _mm256_blend_epi32( X[0], X[2], 0x33 );
__m256i t2 = _mm256_blend_epi32( X[1], X[3], 0x66 );
__m256i t3 = _mm256_blend_epi32( X[1], X[3], 0x99 );
X[0] = _mm256_blend_epi32( t0, t2, 0xaa );
X[1] = _mm256_blend_epi32( t0, t2, 0x55 );
X[2] = _mm256_blend_epi32( t1, t3, 0xaa );
X[3] = _mm256_blend_epi32( t1, t3, 0x55 );
}
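Both the 4-way and 2-way shuffles above compute the same word permutation as the SSE2 fallback further down (the set_epi32 of words 0, 5, 10, 15 and 4, 9, 14, 3, and so on): each output row gathers one diagonal of the 4x4 Salsa20 state so a whole quarter-round group lands in a single register. A minimal scalar sketch of that permutation, with the last two rows assumed to follow the same diagonal stride:

/* Hedged scalar sketch of the permutation performed by the shuffle helpers:
 * each output row gathers one diagonal of the 4x4 Salsa20 state. The first
 * two rows match the set_epi32 pattern of the SSE2 fallback below. */
#include <stdint.h>

static void salsa_shuffle_scalar( uint32_t d[16], const uint32_t x[16] )
{
    static const int map[16] = {
         0,  5, 10, 15,    /* row 0 */
         4,  9, 14,  3,    /* row 1 */
         8, 13,  2,  7,    /* row 2 */
        12,  1,  6, 11     /* row 3 */
    };
    for ( int i = 0; i < 16; i++ )
        d[i] = x[ map[i] ];
}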
static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C)
@@ -2163,7 +2127,7 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
X2 = _mm_blend_epi32( B[1], B[0], 0x4 );
Y3 = _mm_blend_epi32( B[0], B[3], 0x1 );
X3 = _mm_blend_epi32( B[2], B[1], 0x4 );
X0 = _mm_blend_epi32( X0, Y0, 0x3);
X0 = _mm_blend_epi32( X0, Y0, 0x3 );
X1 = _mm_blend_epi32( X1, Y1, 0x3 );
X2 = _mm_blend_epi32( X2, Y2, 0x3 );
X3 = _mm_blend_epi32( X3, Y3, 0x3 );
@@ -2311,91 +2275,34 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
// Double buffered, 2x memory usage
// No interleaving
static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
#if defined(__SSE4_1__)
// __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
#else
// SSE4.1
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
#else // SSE2
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
@@ -2417,7 +2324,7 @@ static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
#endif
}
static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
{
__m128i *XA = (__m128i*)xa;
@@ -2425,67 +2332,22 @@ static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
#if defined(__SSE4_1__)
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
#else // SSE4_1
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
#else // SSE2
@@ -2690,116 +2552,44 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
}
static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
uint32_t *xc )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
__m128i *XC = (__m128i*)xc;
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
#if defined(__SSE4_1__)
__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3, ZC0, ZC1, ZC2, ZC3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
YC0 = _mm_blend_epi32( XC[1], XC[0], 0x1 );
ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
ZC0 = _mm_blend_epi32( XC[3], XC[2], 0x4 );
YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
YC1 = _mm_blend_epi32( XC[2], XC[1], 0x1 );
ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
ZC1 = _mm_blend_epi32( XC[0], XC[3], 0x4 );
YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
YC2 = _mm_blend_epi32( XC[3], XC[2], 0x1 );
ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
ZC2 = _mm_blend_epi32( XC[1], XC[0], 0x4 );
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
YC3 = _mm_blend_epi32( XC[0], XC[3], 0x1 );
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
ZC3 = _mm_blend_epi32( XC[2], XC[1], 0x4 );
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
XC[0] = _mm_blend_epi32( ZC0, YC0, 0x3 );
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
XC[1] = _mm_blend_epi32( ZC1, YC1, 0x3 );
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
XC[2] = _mm_blend_epi32( ZC2, YC2, 0x3 );
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
XC[3] = _mm_blend_epi32( ZC3, YC3, 0x3 );
#else
// SSE4.1
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
YC0 = _mm_blend_epi16( XC[1], XC[0], 0x03 );
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
ZC0 = _mm_blend_epi16( XC[3], XC[2], 0x30 );
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
YC1 = _mm_blend_epi16( XC[2], XC[1], 0x03 );
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
ZC1 = _mm_blend_epi16( XC[0], XC[3], 0x30 );
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
YC2 = _mm_blend_epi16( XC[3], XC[2], 0x03 );
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
ZC2 = _mm_blend_epi16( XC[1], XC[0], 0x30 );
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
YC3 = _mm_blend_epi16( XC[0], XC[3], 0x03 );
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
ZC3 = _mm_blend_epi16( XC[2], XC[1], 0x30 );
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
XC[0] = _mm_blend_epi16( ZC0, YC0, 0x0f );
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
XC[1] = _mm_blend_epi16( ZC1, YC1, 0x0f );
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
XC[2] = _mm_blend_epi16( ZC2, YC2, 0x0f );
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
XC[3] = _mm_blend_epi16( ZC3, YC3, 0x0f );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
t0 = _mm_blend_epi16( XC[0], XC[1], 0xcc );
t1 = _mm_blend_epi16( XC[0], XC[1], 0x33 );
t2 = _mm_blend_epi16( XC[2], XC[3], 0xcc );
t3 = _mm_blend_epi16( XC[2], XC[3], 0x33 );
XC[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XC[1] = _mm_blend_epi16( t1, t3, 0x3c );
XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );
#else // SSE2
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] );
@@ -2829,7 +2619,7 @@ static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
#endif
}
static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
uint32_t* xc )
{
__m128i *XA = (__m128i*)xa;
@@ -2838,91 +2628,30 @@ static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
#if defined(__SSE4_1__)
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
YC0 = _mm_blend_epi32( XC[0], XC[1], 0x8 );
YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
YC1 = _mm_blend_epi32( XC[0], XC[1], 0x1 );
YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
YC2 = _mm_blend_epi32( XC[0], XC[1], 0x2 );
YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
YC3 = _mm_blend_epi32( XC[0], XC[1], 0x4 );
YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
YC0 = _mm_blend_epi32( YC0, XC[2], 0x4 );
YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
YC1 = _mm_blend_epi32( YC1, XC[2], 0x8 );
YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
YC2 = _mm_blend_epi32( YC2, XC[2], 0x1 );
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
YC3 = _mm_blend_epi32( YC3, XC[2], 0x2 );
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
XC[0] = _mm_blend_epi32( YC0, XC[3], 0x2 );
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
XC[1] = _mm_blend_epi32( YC1, XC[3], 0x4 );
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
XC[2] = _mm_blend_epi32( YC2, XC[3], 0x8 );
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
XC[3] = _mm_blend_epi32( YC3, XC[3], 0x1 );
#else // SSE4_1
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
YC0 = _mm_blend_epi16( XC[0], XC[1], 0xc0 );
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
YC1 = _mm_blend_epi16( XC[0], XC[1], 0x03 );
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
YC2 = _mm_blend_epi16( XC[0], XC[1], 0x0c );
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
YC3 = _mm_blend_epi16( XC[0], XC[1], 0x30 );
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
YC0 = _mm_blend_epi16( YC0, XC[2], 0x30 );
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
YC1 = _mm_blend_epi16( YC1, XC[2], 0xc0 );
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
YC2 = _mm_blend_epi16( YC2, XC[2], 0x03 );
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
YC3 = _mm_blend_epi16( YC3, XC[2], 0x0c );
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
XC[0] = _mm_blend_epi16( YC0, XC[3], 0x0c );
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
XC[1] = _mm_blend_epi16( YC1, XC[3], 0x30 );
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
XC[2] = _mm_blend_epi16( YC2, XC[3], 0xc0 );
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
XC[3] = _mm_blend_epi16( YC3, XC[3], 0x03 );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
t0 = _mm_blend_epi16( XC[0], XC[2], 0xf0 );
t1 = _mm_blend_epi16( XC[0], XC[2], 0x0f );
t2 = _mm_blend_epi16( XC[1], XC[3], 0x3c );
t3 = _mm_blend_epi16( XC[1], XC[3], 0xc3 );
XC[0] = _mm_blend_epi16( t0, t2, 0xcc );
XC[1] = _mm_blend_epi16( t0, t2, 0x33 );
XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
XC[3] = _mm_blend_epi16( t1, t3, 0x33 );
#else // SSE2

View File

@@ -1,270 +0,0 @@
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
/*
* This file contains some functions which implement the external data
* handling and padding for Merkle-Damgard hash functions which follow
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
*
* API: this file is meant to be included, not compiled as a stand-alone
* file. Some macros must be defined:
* RFUN name for the round function
* HASH "short name" for the hash function
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
* LE64 defined for little-endian, 64-bit based (no example yet)
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
* BLEN if defined, length of a message block (in bytes)
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
* SVAL if defined, reference to the context state information
*
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
* this is used for instance for Tiger, which works on 64-bit words but
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
* set, then only one word (64 bits) will be used to encode the input
* message length (in bits), otherwise two words will be used (as in
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
* not PLW1), four 64-bit words will be used to encode the message length
* (in bits). Note that regardless of those settings, only 64-bit message
* lengths are supported (in bits): messages longer than 2 Exabytes will be
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
* 2 millions Terabytes, which is huge).
*
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
* function. This is used for Tiger2, which is identical to Tiger except
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
* of the 0x01 from original Tiger).
*
* The RFUN function is invoked with two arguments, the first pointing to
* aligned data (as a "const void *"), the second being state information
* from the context structure. By default, this state information is the
* "val" field from the context, and this field is assumed to be an array
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
* from the context structure. The "val" field can have any type, except
* for the output encoding which assumes that it is an array of "sph_u32"
* values. By defining NO_OUTPUT, this last step is deactivated; the
* includer code is then responsible for writing out the hash result. When
* NO_OUTPUT is defined, the third parameter to the "close()" function is
* ignored.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
#undef SPH_BLEN
#undef SPH_WLEN
#if defined BE64 || defined LE64
#define SPH_BLEN 128U
#define SPH_WLEN 8U
#else
#define SPH_BLEN 64U
#define SPH_WLEN 4U
#endif
#ifdef BLEN
#undef SPH_BLEN
#define SPH_BLEN BLEN
#endif
#undef SPH_MAXPAD
#if defined PLW1
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
#elif defined PLW4
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2))
#else
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1))
#endif
#undef SPH_VAL
#undef SPH_NO_OUTPUT
#ifdef SVAL
#define SPH_VAL SVAL
#define SPH_NO_OUTPUT 1
#else
#define SPH_VAL sc->val
#endif
#ifndef CLOSE_ONLY
#ifdef SPH_UPTR
static void
SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
#else
void
HASH ( void *cc, const void *data, size_t len )
#endif
{
SPH_XCAT( HASH, _context ) *sc;
__m256i *vdata = (__m256i*)data;
size_t ptr;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
while ( len > 0 )
{
size_t clen;
clen = SPH_BLEN - ptr;
if ( clen > len )
clen = len;
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
if ( ptr == SPH_BLEN )
{
RFUN( sc->buf, SPH_VAL );
ptr = 0;
}
sc->count += clen;
}
}
#ifdef SPH_UPTR
void
HASH (void *cc, const void *data, size_t len)
{
SPH_XCAT(HASH, _context) *sc;
__m256i *vdata = (__m256i*)data;
unsigned ptr;
if ( len < (2 * SPH_BLEN) )
{
SPH_XCAT(HASH, _short)(cc, data, len);
return;
}
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
if ( ptr > 0 )
{
unsigned t;
t = SPH_BLEN - ptr;
SPH_XCAT( HASH, _short )( cc, data, t );
vdata = vdata + (t>>3);
len -= t;
}
SPH_XCAT( HASH, _short )( cc, data, len );
}
#endif
#endif
/*
* Perform padding and produce result. The context is NOT reinitialized
* by this function.
*/
static void
SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
void *dst, unsigned rnum )
{
SPH_XCAT(HASH, _context) *sc;
unsigned ptr, u;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
#ifdef PW01
sc->buf[ptr>>3] = m256_const1_64( 0x100 >> 8 );
#else
sc->buf[ptr>>3] = m256_const1_64( 0x80 );
#endif
ptr += 8;
if ( ptr > SPH_MAXPAD )
{
memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
RFUN( sc->buf, SPH_VAL );
memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
}
else
{
memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
}
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
#elif defined PLW4
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
_mm256_set1_epi64x( c->count >> 61 );
memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
2 * SPH_WLEN );
#else
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
_mm256_set1_epi64x( sc->count >> 61 );
#endif // PLW
#endif // LE64
RFUN( sc->buf, SPH_VAL );
#ifdef SPH_NO_OUTPUT
(void)dst;
(void)rnum;
(void)u;
#else
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif
}
#endif
}
static void
SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
{
SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
}
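The removed helper implemented the usual Merkle-Damgard close: append 0x80 (or 0x01 when PW01 is defined), zero-fill, store the message length in bits at the end of the final block, and run the round function once more if the length field no longer fits. A minimal scalar sketch of the common case, with a 64-byte block and a single 64-bit big-endian length word (the helper also handled 128-byte blocks, little-endian variants and 128-bit lengths):

/* Hedged scalar sketch of the padding the removed helper implemented. */
#include <stdint.h>
#include <string.h>

typedef void (*round_fn)( const void *block, void *state );

static void md_pad_close( uint8_t buf[64], unsigned ptr, uint64_t count_bytes,
                          round_fn rfun, void *state )
{
    buf[ptr++] = 0x80;
    if ( ptr > 64 - 8 )                 /* length field does not fit */
    {
        memset( buf + ptr, 0, 64 - ptr );
        rfun( buf, state );
        ptr = 0;
    }
    memset( buf + ptr, 0, 64 - 8 - ptr );
    uint64_t bits = count_bytes << 3;   /* message length in bits */
    for ( int i = 0; i < 8; i++ )
        buf[ 56 + i ] = (uint8_t)( bits >> ( 56 - 8 * i ) );   /* big-endian */
    rfun( buf, state );                 /* caller then reads out the state */
}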

View File

@@ -711,8 +711,11 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
{
__m256i A, B, C, D, E, F, G, H;
X[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
// W[9:14] are zero, therefore X[9:13] are also zero and not needed.
// Except X[ 9] which is part of W[ 0] from the third group.
X[ 0] = _mm256_add_epi32( SSG2_0x( W[ 1] ), W[ 0] );
X[ 1] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( W[15] ),
SSG2_0x( W[ 2] ) ), W[ 1] );
X[ 2] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 0] ), W[11] ),
W[ 2] );
X[ 3] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 1] ), W[12] ),
@@ -725,16 +728,12 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
W[ 6] );
X[ 7] = _mm256_add_epi32( _mm256_add_epi32( X[ 0], SSG2_0x( W[ 8] ) ),
W[ 7] );
X[ 8] = _mm256_add_epi32( _mm256_add_epi32( X[ 1], SSG2_0x( W[ 9] ) ),
W[ 8] );
X[ 9] = _mm256_add_epi32( SSG2_0x( W[10] ), W[ 9] );
X[10] = _mm256_add_epi32( SSG2_0x( W[11] ), W[10] );
X[11] = _mm256_add_epi32( SSG2_0x( W[12] ), W[11] );
X[12] = _mm256_add_epi32( SSG2_0x( W[13] ), W[12] );
X[13] = _mm256_add_epi32( SSG2_0x( W[14] ), W[13] );
X[14] = _mm256_add_epi32( SSG2_0x( W[15] ), W[14] );
X[ 8] = _mm256_add_epi32( X[ 1], W[ 8] );
X[14] = SSG2_0x( W[15] );
X[15] = _mm256_add_epi32( SSG2_0x( X[ 0] ), W[15] );
X[ 9] = _mm256_add_epi32( SSG2_0x( X[ 1] ), X[ 0] );
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in + 1 );
C = _mm256_load_si256( state_in + 2 );
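The pruning above follows from the SHA-256 message-expansion recurrence W[t] = SSG1(W[t-2]) + W[t-7] + SSG0(W[t-15]) + W[t-16]: with W[9]..W[14] equal to zero those terms vanish, so for example W[16] reduces to SSG0(W[1]) + W[0], which is exactly the new X[0]. A small scalar sketch of that reduction:

/* Hedged worked check of the simplification above: with W[9..14] == 0 the
 * full schedule term for t = 16 reduces to the pruned X[0] form. */
#include <stdint.h>

static inline uint32_t ror32( uint32_t x, int n )
{ return ( x >> n ) | ( x << ( 32 - n ) ); }

static inline uint32_t ssg0( uint32_t x )   /* SHA-256 sigma0 */
{ return ror32( x, 7 ) ^ ror32( x, 18 ) ^ ( x >> 3 ); }

static inline uint32_t ssg1( uint32_t x )   /* SHA-256 sigma1 */
{ return ror32( x, 17 ) ^ ror32( x, 19 ) ^ ( x >> 10 ); }

static uint32_t w16_full( const uint32_t W[16] )
{ return ssg1( W[14] ) + W[9] + ssg0( W[1] ) + W[0]; }

static uint32_t w16_pruned( const uint32_t W[16] )
{ return ssg0( W[1] ) + W[0]; }   /* valid only because W[9] == W[14] == 0 */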
@@ -779,10 +778,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H );
#endif
@@ -810,23 +805,36 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
W[ 6] = _mm256_add_epi32( X[ 6], SSG2_1x( W[ 4] ) );
W[ 7] = _mm256_add_epi32( X[ 7], SSG2_1x( W[ 5] ) );
W[ 8] = _mm256_add_epi32( X[ 8], SSG2_1x( W[ 6] ) );
W[ 9] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[ 7] ),
W[ 2] ) );
W[10] = _mm256_add_epi32( X[10], _mm256_add_epi32( SSG2_1x( W[ 8] ),
W[ 3] ) );
W[11] = _mm256_add_epi32( X[11], _mm256_add_epi32( SSG2_1x( W[ 9] ),
W[ 4] ) );
W[12] = _mm256_add_epi32( X[12], _mm256_add_epi32( SSG2_1x( W[10] ),
W[ 5] ) );
W[13] = _mm256_add_epi32( X[13], _mm256_add_epi32( SSG2_1x( W[11] ),
W[ 6] ) );
W[ 9] = _mm256_add_epi32( SSG2_1x( W[ 7] ), W[ 2] );
W[10] = _mm256_add_epi32( SSG2_1x( W[ 8] ), W[ 3] );
W[11] = _mm256_add_epi32( SSG2_1x( W[ 9] ), W[ 4] );
W[12] = _mm256_add_epi32( SSG2_1x( W[10] ), W[ 5] );
W[13] = _mm256_add_epi32( SSG2_1x( W[11] ), W[ 6] );
W[14] = _mm256_add_epi32( X[14], _mm256_add_epi32( SSG2_1x( W[12] ),
W[ 7] ) );
W[15] = _mm256_add_epi32( X[15], _mm256_add_epi32( SSG2_1x( W[13] ),
W[ 8] ) );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x8_MSG_EXPANSION( W );
W[ 0] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[14] ),
W[ 9] ) );
W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] );
W[13] = SHA2x_MEXP( W[11], W[ 6], W[14], W[13] );
W[14] = SHA2x_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA2x_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x8_MSG_EXPANSION( W );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -1201,9 +1209,13 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
{
__m512i A, B, C, D, E, F, G, H;
// precalculate constant part msg expansion for second iteration.
X[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
// X is pre-expanded constant part of msg for second group, rounds 16 to 31.
// W[9:14] are zero, therefore X[9:13] are also zero and not needed.
// Except X[ 9] which is used to pre-expand part of W[ 0] from the third
// group, rounds 32 to 48.
X[ 0] = _mm512_add_epi32( SSG2_0x16( W[ 1] ), W[ 0] );
X[ 1] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( W[15] ),
SSG2_0x16( W[ 2] ) ), W[ 1] );
X[ 2] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 0] ), W[11] ),
W[ 2] );
X[ 3] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 1] ), W[12] ),
@@ -1216,16 +1228,12 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
W[ 6] );
X[ 7] = _mm512_add_epi32( _mm512_add_epi32( X[ 0], SSG2_0x16( W[ 8] ) ),
W[ 7] );
X[ 8] = _mm512_add_epi32( _mm512_add_epi32( X[ 1], SSG2_0x16( W[ 9] ) ),
W[ 8] );
X[ 9] = _mm512_add_epi32( SSG2_0x16( W[10] ), W[ 9] );
X[10] = _mm512_add_epi32( SSG2_0x16( W[11] ), W[10] );
X[11] = _mm512_add_epi32( SSG2_0x16( W[12] ), W[11] );
X[12] = _mm512_add_epi32( SSG2_0x16( W[13] ), W[12] );
X[13] = _mm512_add_epi32( SSG2_0x16( W[14] ), W[13] );
X[14] = _mm512_add_epi32( SSG2_0x16( W[15] ), W[14] );
X[ 8] = _mm512_add_epi32( X[ 1], W[ 8] );
X[14] = SSG2_0x16( W[15] );
X[15] = _mm512_add_epi32( SSG2_0x16( X[ 0] ), W[15] );
X[ 9] = _mm512_add_epi32( SSG2_0x16( X[ 1] ), X[ 0] );
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in + 1 );
C = _mm512_load_si512( state_in + 2 );
@@ -1280,7 +1288,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
// update precalculated msg expansion with new nonce: W[3].
// inject nonce, W[3], to complete msg expansion.
W[ 0] = X[ 0];
W[ 1] = X[ 1];
W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
@@ -1290,23 +1298,36 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
W[ 6] = _mm512_add_epi32( X[ 6], SSG2_1x16( W[ 4] ) );
W[ 7] = _mm512_add_epi32( X[ 7], SSG2_1x16( W[ 5] ) );
W[ 8] = _mm512_add_epi32( X[ 8], SSG2_1x16( W[ 6] ) );
W[ 9] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[ 7] ),
W[ 2] ) );
W[10] = _mm512_add_epi32( X[10], _mm512_add_epi32( SSG2_1x16( W[ 8] ),
W[ 3] ) );
W[11] = _mm512_add_epi32( X[11], _mm512_add_epi32( SSG2_1x16( W[ 9] ),
W[ 4] ) );
W[12] = _mm512_add_epi32( X[12], _mm512_add_epi32( SSG2_1x16( W[10] ),
W[ 5] ) );
W[13] = _mm512_add_epi32( X[13], _mm512_add_epi32( SSG2_1x16( W[11] ),
W[ 6] ) );
W[ 9] = _mm512_add_epi32( SSG2_1x16( W[ 7] ), W[ 2] );
W[10] = _mm512_add_epi32( SSG2_1x16( W[ 8] ), W[ 3] );
W[11] = _mm512_add_epi32( SSG2_1x16( W[ 9] ), W[ 4] );
W[12] = _mm512_add_epi32( SSG2_1x16( W[10] ), W[ 5] );
W[13] = _mm512_add_epi32( SSG2_1x16( W[11] ), W[ 6] );
W[14] = _mm512_add_epi32( X[14], _mm512_add_epi32( SSG2_1x16( W[12] ),
W[ 7] ) );
W[15] = _mm512_add_epi32( X[15], _mm512_add_epi32( SSG2_1x16( W[13] ),
W[ 8] ) );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x16_MSG_EXPANSION( W );
W[ 0] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[14] ),
W[ 9] ) );
W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] );
W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] );
W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x16_MSG_EXPANSION( W );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -1336,8 +1357,8 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16]; memcpy_512( W, data, 16 );
// Value for H at round 60, before adding K, to produce valid final hash
//where H == 0.
// Value for H at round 60, before adding K, needed to produce valid final
// hash where H == 0.
// H_ = -( H256[7] + K256[60] );
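// With H256[7] = 0x5BE0CD19 and K256[60] = 0x90BEFFFA:
// -( 0x5BE0CD19 + 0x90BEFFFA ) = -0xEC9FCD13 = 0x136032ED (mod 2^32).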
const __m512i H_ = m512_const1_32( 0x136032ED );

View File

@@ -33,6 +33,7 @@
#include <stddef.h>
#include <string.h>
// 4way is only used with AVX2, 8way only with AVX512, 16way is not needed.
#ifdef __SSE4_1__
#include "shabal-hash-4way.h"
@@ -44,21 +45,6 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
/*
* Part of this code was automatically generated (the part between
* the "BEGIN" and "END" markers).
*/
#define sM 16
#define C32 SPH_C32
#define T32 SPH_T32
#define O1 13
#define O2 9
#define O3 6
#if defined(__AVX2__)
#define DECL_STATE8 \
@@ -310,72 +296,71 @@ do { \
mm256_swap512_256( BF, CF ); \
} while (0)
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \
_mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
FIVE ) ), THREE ) ) ); \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0)
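// Scalar equivalent (FIVE and THREE are assumed to broadcast 5 and 3):
//   xa0 = xm ^ xb1 ^ ( 3*( xa0 ^ xc ^ 5*rotl32( xa1, 15 ) ) ^ ( ~xb3 & xb2 ) );
//   xb0 = ~( xa0 ^ rotl32( xb0, 1 ) );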
#define PERM_STEP_0_8 do { \
PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A0, AB, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A1, A0, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( A2, A1, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( A3, A2, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A4, A3, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A5, A4, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( A6, A5, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( A7, A6, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A8, A7, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A9, A8, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( AA, A9, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( AB, AA, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A0, AB, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A1, A0, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( A2, A1, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( A3, A2, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define PERM_STEP_1_8 do { \
PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A4, A3, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A5, A4, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( A6, A5, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( A7, A6, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A8, A7, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A9, A8, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( AA, A9, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( AB, AA, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A0, AB, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A1, A0, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( A2, A1, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( A3, A2, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A4, A3, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A5, A4, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( A6, A5, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( A7, A6, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define PERM_STEP_2_8 do { \
PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A8, A7, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A9, A8, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( AA, A9, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( AB, AA, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A0, AB, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A1, A0, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( A2, A1, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( A3, A2, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A4, A3, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A5, A4, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( A6, A5, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( A7, A6, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A8, A7, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A9, A8, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( AA, A9, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( AB, AA, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define APPLY_P8 \
do { \
@@ -437,8 +422,8 @@ do { \
} while (0)
#define INCR_W8 do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
if ( ( Wlow = Wlow + 1 ) == 0 ) \
Whigh = Whigh + 1; \
} while (0)
static void
@@ -650,15 +635,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
shabal_8way_close(cc, ub, n, dst, 16);
}
#endif // AVX2
/*
* We copy the state into local variables, so that the compiler knows
* that it can optimize them at will.
*/
#define DECL_STATE \
__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
A8, A9, AA, AB; \
@@ -888,15 +866,6 @@ do { \
A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
} while (0)
/*
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
(v1) = (v2); \
(v2) = tmp; \
} while (0)
*/
#define SWAP_BC \
do { \
mm128_swap256_128( B0, C0 ); \
@@ -917,18 +886,6 @@ do { \
mm128_swap256_128( BF, CF ); \
} while (0)
/*
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
__m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\
_mm_set1_epi32(5UL) ) \
__m128i t2 = _mm_xor_si128( xa0, xc ); \
xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
_mm_xor_si128( t2, \
_mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
*/
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
@@ -1056,8 +1013,8 @@ do { \
} while (0)
#define INCR_W do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
if ( ( Wlow = Wlow + 1 ) == 0 ) \
Whigh = Whigh + 1; \
} while (0)
/*

View File

@@ -75,7 +75,6 @@ void shabal512_8way_close( void *cc, void *dst );
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#endif
typedef struct {
@@ -97,7 +96,6 @@ void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void shabal512_4way_init( void *cc );
void shabal512_4way_update( void *cc, const void *data, size_t len );
//#define shabal512_4way shabal512_4way_update
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );

View File

@@ -383,11 +383,17 @@ static const m512_v16 FFT256_Twiddle4w[] =
#define shufxor4w(x,s) _mm512_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
_mm512_srai_epi16( x, 8 ) )
/*
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \
0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) )
*/
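// The byte mask 0x5555... keeps the low byte of each 16 bit element, so
// maskz_mov_epi8 is equivalent to the masked AND above without loading a
// vector constant; both compute (x & 0xff) - (x >> 8) per 16 bit element.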
#define EXTRA_REDUCE_S4w(x)\
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
m512_const1_64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
@@ -400,8 +406,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
#define DO_REDUCE_FULL_S4w(i) \
do { \
X(i) = REDUCE4w( X(i) ); \
X(i) = EXTRA_REDUCE_S4w( X(i) ); \
X(i) = REDUCE4w( X(i) ); \
X(i) = EXTRA_REDUCE_S4w( X(i) ); \
} while(0)
@@ -431,10 +437,6 @@ void fft64_4way( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
// targeted
#define BUTTERFLY_0( i,j ) \
do { \
@@ -443,25 +445,25 @@ do { \
X(i) = _mm512_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i v = X(j); \
X(j) = _mm512_add_epi16( X(i), X(j) ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w[n] ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
} while(0)
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 1 );
@@ -501,12 +503,11 @@ do { \
// Transpose the FFT state with a revbin order permutation
// on the rows and the column.
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
#define INTERLEAVE( i, j ) \
do { \
__m512i t1= X(i); \
__m512i t2= X(j); \
X(i) = _mm512_unpacklo_epi16( t1, t2 ); \
X(j) = _mm512_unpackhi_epi16( t1, t2 ); \
__m512i u = X(j); \
X(j) = _mm512_unpackhi_epi16( X(i), X(j) ); \
X(i) = _mm512_unpacklo_epi16( X(i), u ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -534,10 +535,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i u = X(j); \
X(i) = _mm512_slli_epi16( X(i), w[n] ); \
X(i) = _mm512_slli_epi16( X(i), w ); \
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
X(i) = _mm512_add_epi16( u, X(i) ); \
} while(0)
@@ -558,15 +559,15 @@ do { \
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE_FULL_S4w( 0 );
DO_REDUCE_FULL_S4w( 1 );
@@ -599,7 +600,6 @@ void fft128_4way( void *a )
// Temp space to help for interleaving in the end
__m512i B[8];
__m512i *A = (__m512i*) a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
/* Size-2 butterflies */
for ( i = 0; i<8; i++ )
@@ -633,7 +633,6 @@ void fft128_4way_msg( uint16_t *a, const uint8_t *x, int final )
__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
#define UNPACK( i ) \
do { \
@@ -686,7 +685,6 @@ void fft256_4way_msg( uint16_t *a, const uint8_t *x, int final )
__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT256_Twiddle;
#define UNPACK( i ) \
do { \
@@ -776,109 +774,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
// We split the round function in two halves
// so as to insert some independent computations in between.
// generic
#if 0
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = shufxor( a##l, 1 ); \
d##h = shufxor( a##h, 1 ); \
} while(0)
#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = shufxor( a##h, 2 ); \
d##h = shufxor( a##l, 2 ); \
} while(0)
#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = shufxor( a##l, 2 ); \
d##h = shufxor( a##h, 2 ); \
} while(0)
#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = shufxor( a##l, 3 ); \
d##h = shufxor( a##h, 3 ); \
} while(0)
#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = shufxor( a##h, 1 ); \
d##h = shufxor( a##l, 1 ); \
} while(0)
#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = shufxor( a##h, 3 ); \
d##h = shufxor( a##l, 3 ); \
} while(0)
#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#endif
// targeted
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \

View File

@@ -1106,8 +1106,7 @@ skein256_4way_close(void *cc, void *dst)
}
// Do not use with 128 bit data
// Broken for 80 & 128 bytes, use prehash or full
void
skein512_4way_update(void *cc, const void *data, size_t len)
{

View File

@@ -112,8 +112,9 @@ void timetravel_4way_hash(void *output, const void *input)
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way_update( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen );
// skein512_4way_update( &ctx.skein, vhashA, dataLen );
// skein512_4way_close( &ctx.skein, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;

View File

@@ -118,8 +118,9 @@ void timetravel10_4way_hash(void *output, const void *input)
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way_update( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen );
// skein512_4way_update( &ctx.skein, vhashA, dataLen );
// skein512_4way_close( &ctx.skein, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;

View File

@@ -33,9 +33,10 @@ void polytimos_4way_hash( void *output, const void *input )
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
poly_4way_context_overlay ctx;
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
skein512_4way_full( &ctx.skein, vhash, input, 80 );
// skein512_4way_init( &ctx.skein );
// skein512_4way_update( &ctx.skein, input, 80 );
// skein512_4way_close( &ctx.skein, vhash );
// Need to convert from 64 bit interleaved to 32 bit interleaved.
uint32_t vhash32[16*4];

View File

@@ -38,8 +38,10 @@ void veltor_4way_hash( void *output, const void *input )
veltor_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) );
skein512_4way_update( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// skein512_4way_update( &ctx.skein, input, 80 );
// skein512_4way_close( &ctx.skein, vhash );
skein512_4way_full( &ctx.skein, vhash, input, 80 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_shavite512( &ctx.shavite, hash0, 64 );
@@ -105,7 +107,7 @@ int scanhash_veltor_4way( struct work *work, uint32_t max_nonce,
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) && ! opt_benchmark )
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );

View File

@@ -18,6 +18,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/yespower/yespower.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -31,6 +32,9 @@
// Config
#define MINOTAUR_ALGO_COUNT 16
static const yespower_params_t minotaurx_yespower_params =
{ YESPOWER_1_0, 2048, 8, "et in arcadia ego", 17 };
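// i.e. yespower 1.0 with N = 2048, r = 8, and the 17 byte personalization
// string "et in arcadia ego".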
typedef struct TortureNode TortureNode;
typedef struct TortureGarden TortureGarden;
@@ -59,20 +63,22 @@ struct TortureGarden
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;
struct TortureNode {
struct TortureNode
{
unsigned int algo;
TortureNode *child[2];
} nodes[22];
} __attribute__ ((aligned (64)));
// Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index
static void get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo )
static int get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;
switch (algo) {
switch ( algo )
{
case 0:
sph_blake512_init(&garden->blake);
sph_blake512(&garden->blake, input, 64);
@@ -97,14 +103,14 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
sph_echo512(&garden->echo, input, 64);
sph_echo512_close(&garden->echo, hash);
#endif
break;
break;
case 4:
#if defined(__AES__)
fugue512_full( &garden->fugue, hash, input, 64 );
#else
sph_fugue512_full( &garden->fugue, hash, input, 64 );
#endif
break;
break;
case 5:
#if defined(__AES__)
groestl512_full( &garden->groestl, (char*)hash, (char*)input, 512 );
@@ -113,7 +119,7 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
sph_groestl512(&garden->groestl, input, 64);
sph_groestl512_close(&garden->groestl, hash);
#endif
break;
break;
case 6:
sph_hamsi512_init(&garden->hamsi);
sph_hamsi512(&garden->hamsi, input, 64);
@@ -164,16 +170,20 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
sph_whirlpool(&garden->whirlpool, input, 64);
sph_whirlpool_close(&garden->whirlpool, hash);
break;
case 16: // minotaurx only, yespower hardcoded for last node
rc = yespower_tls( input, 64, &minotaurx_yespower_params,
(yespower_binary_t*)hash, thr_id );
}
memcpy(output, hash, 64);
return rc;
}
static __thread TortureGarden garden;
bool initialize_torture_garden()
{
// Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete).
// Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete).
garden.nodes[ 0].child[0] = &garden.nodes[ 1];
garden.nodes[ 0].child[1] = &garden.nodes[ 2];
@@ -219,7 +229,6 @@ bool initialize_torture_garden()
garden.nodes[20].child[1] = &garden.nodes[21];
garden.nodes[21].child[0] = NULL;
garden.nodes[21].child[1] = NULL;
return true;
}
@@ -227,38 +236,45 @@ bool initialize_torture_garden()
int minotaur_hash( void *output, const void *input, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;
// Find initial sha512 hash
sph_sha512_init( &garden.sha512 );
sph_sha512( &garden.sha512, input, 80 );
sph_sha512_close( &garden.sha512, hash );
// algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce
// if Hamsi is needed but only the first and last functions are
// currently known. Abort if either is Hamsi.
if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 )
|| ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) )
return 0;
if ( opt_algo != ALGO_MINOTAURX )
{
// algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce
// if Hamsi is needed but only the first and last functions are
// currently known. Abort if either is Hamsi.
if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 )
|| ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) )
return 0;
}
// Assign algos to torture garden nodes based on initial hash
for ( int i = 0; i < 22; i++ )
garden.nodes[i].algo = hash[i] % MINOTAUR_ALGO_COUNT;
// MinotaurX override algo for last node with yespower
if ( opt_algo == ALGO_MINOTAURX )
garden.nodes[21].algo = MINOTAUR_ALGO_COUNT;
// Send the initial hash through the torture garden
TortureNode *node = &garden.nodes[0];
while ( node )
while ( rc && node )
{
get_hash( hash, hash, &garden, node->algo );
rc = get_hash( hash, hash, &garden, node->algo, thr_id );
node = node->child[ hash[63] & 1 ];
}
memcpy( output, hash, 32 );
return 1;
return rc;
}
int scanhash_minotaur( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
@@ -277,7 +293,7 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce,
edata[19] = n;
if ( likely( algo_gate.hash( hash, edata, thr_id ) ) )
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash, mythr );
@@ -291,12 +307,14 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce,
return 0;
}
// hash function has hooks for minotaurx
bool register_minotaur_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_minotaur;
gate->hash = (void*)&minotaur_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_minotaur;
gate->hash = (void*)&minotaur_hash;
gate->miner_thread_init = (void*)&initialize_torture_garden;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT;
return true;
};

View File

@@ -198,7 +198,7 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
char* data;
data = (char*)malloc( 2 + strlen( denom10_str ) * 4 + 16 * 4
+ strlen( merkleroot_str ) * 3 );
+ strlen( merkleroot_str ) * 3 + 1 );
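// + 1 byte for the terminating null written by sprintf below.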
// Build the block header veildatahash in hex
sprintf( data, "%s%s%s%s%s%s%s%s%s%s%s%s",
merkleroot_str, witmerkleroot_str, "04",

File diff suppressed because it is too large

10
api.c
View File

@@ -336,7 +336,7 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
char inpkey[128] = { 0 };
char seckey[64];
uchar sha1[20];
SHA_CTX ctx;
// SHA_CTX ctx;
if (opt_protocol)
applog(LOG_DEBUG, "clientkey: %s", clientkey);
@@ -346,9 +346,11 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
// SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo="
//sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11");
SHA1_Init(&ctx);
SHA1_Update(&ctx, inpkey, strlen(inpkey));
SHA1_Final(sha1, &ctx);
SHA1( inpkey, strlen(inpkey), sha1 );
// Deprecated in openssl-3
// SHA1_Init(&ctx);
// SHA1_Update(&ctx, inpkey, strlen(inpkey));
// SHA1_Final(sha1, &ctx);
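// Note: the one-shot SHA1() wrapper is also flagged deprecated in OpenSSL 3;
// a fully supported form would be something like
//    EVP_Digest( inpkey, strlen(inpkey), sha1, NULL, EVP_sha1(), NULL );
// (sketch only, requires <openssl/evp.h>).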
base64_encode(sha1, 20, seckey, sizeof(seckey));

View File

@@ -4,7 +4,7 @@
# during development. However, the information contained may provide compilation
# tips to users.
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 > /dev/null
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake > /dev/null
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
make distclean || echo clean
@@ -17,13 +17,22 @@ make -j 8
strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes
# AVX256 SHA VAES: Intel Core Alderlake, needs gcc-12
#make clean || echo clean
#rm -f config.status
#./autogen.sh || echo done
#CFLAGS="-O3 -march=alderlake -Wall -fno-common" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-alderlake
# Zen4 AVX512 SHA VAES
make clean || echo clean
rm -f config.status
# znver3 needs gcc-11, znver4 ?
#CFLAGS="-O3 -march=znver4 -Wall -fno-common " ./configure --with-curl
#CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
#CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-zen4
@@ -31,8 +40,8 @@ mv cpuminer cpuminer-zen4
# Zen3 AVX2 SHA VAES
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
#CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
#CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-zen3
@@ -80,7 +89,7 @@ make -j 8
strip -s cpuminer
mv cpuminer cpuminer-avx
# SSE4.2 AES: Intel Westmere
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl

4343
configure vendored

File diff suppressed because it is too large

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.20.3])
AC_INIT([cpuminer-opt], [3.21.2])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -131,10 +131,9 @@ bool opt_verify = false;
static bool opt_stratum_keepalive = false;
static struct timeval stratum_keepalive_timer;
// Stratum typically times out in 5 minutes or 300 seconds
#define stratum_keepalive_timeout 180 // 3 minutes
#define stratum_keepalive_timeout 150 // 2.5 minutes
static struct timeval stratum_reset_time;
// pk_buffer_size is used as a version selector by b58 code, therefore
// it must be set correctly to work.
const int pk_buffer_size_max = 26;
@@ -899,6 +898,17 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
goto out;
}
// See git issue https://github.com/JayDDee/cpuminer-opt/issues/379
#if defined(__AVX2__)
if ( opt_debug )
{
if ( (uint64_t)target % 32 )
applog( LOG_ERR, "Misaligned target %p", target );
if ( (uint64_t)(work->target) % 32 )
applog( LOG_ERR, "Misaligned work->target %p", work->target );
}
#endif
for ( i = 0; i < 8; i++ )
work->target[7 - i] = be32dec( target + i );
net_diff = work->targetdiff = hash_to_diff( work->target );
@@ -2192,6 +2202,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
} // !quiet
} // new diff/block
/*
if ( new_job && !( opt_quiet || stratum_errors ) )
{
int mismatch = submitted_share_count - ( accepted_share_count
@@ -2202,6 +2213,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
CL_LBL "%d Submitted share pending, maybe stale" CL_N,
submitted_share_count );
}
*/
}
static void *miner_thread( void *userdata )
@@ -2446,8 +2458,8 @@ static void *miner_thread( void *userdata )
{
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
applog( LOG_INFO, "CPU #%d: %s %sh/s",
thr_id, hr, hr_units );
applog( LOG_INFO, "Thread %d, CPU %d: %s %sh/s",
thr_id, thread_affinity_map[ thr_id ], hr, hr_units );
}
}
@@ -2887,7 +2899,7 @@ static void *stratum_thread(void *userdata )
else
timeval_subtract( &et, &now, &stratum_reset_time );
if ( et.tv_sec > stratum_keepalive_timeout + 60 )
if ( et.tv_sec > stratum_keepalive_timeout + 90 )
{
applog( LOG_NOTICE, "No shares submitted, resetting stratum connection" );
stratum_need_reset = true;

View File

@@ -118,7 +118,7 @@ static inline bool is_windows(void)
static inline uint32_t swab32(uint32_t v)
{
#ifdef WANT_BUILTIN_BSWAP
return __builtin_bswap32(v);
return __builtin_bswap32(v);
#else
return bswap_32(v);
#endif
@@ -559,6 +559,7 @@ enum algos {
ALGO_LYRA2Z330,
ALGO_M7M,
ALGO_MINOTAUR,
ALGO_MINOTAURX,
ALGO_MYR_GR,
ALGO_NEOSCRYPT,
ALGO_NIST5,
@@ -652,6 +653,7 @@ static const char* const algo_names[] = {
"lyra2z330",
"m7m",
"minotaur",
"minotaurx",
"myr-gr",
"neoscrypt",
"nist5",
@@ -813,6 +815,7 @@ Options:\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
minotaur\n\
minotaurx\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\

View File

@@ -54,7 +54,7 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
#else
asm( "movq %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
return a;
return a;
}
static inline __m128i mm128_mov32_128( const uint32_t n )
@@ -65,7 +65,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#else
asm( "movd %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
return a;
return a;
}
// Inconsistent naming, prefix should reflect return value:
@@ -79,7 +79,7 @@ static inline uint64_t u64_mov128_64( const __m128i a )
#else
asm( "movq %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
return n;
return n;
}
static inline uint32_t u32_mov128_32( const __m128i a )
@@ -90,7 +90,7 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#else
asm( "movd %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
return n;
return n;
}
// Equivalent of set1, broadcast integer to all elements.
@@ -193,13 +193,23 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Basic operations without equivalent SIMD intrinsic
// Bitwise not (~v)
#if defined(__AVX512VL__)
static inline __m128i mm128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
#else
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
#endif
/*
// Unary negation of elements (-v)
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
#define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v )
*/
// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
@@ -255,20 +265,16 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#if defined(__AVX512VL__)
// a ^ b ^ c
#define mm128_xor3( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x96 )
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x78 )
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
#else
#define mm128_xor3( a, b, c ) \
_mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_xorand( a, b, c ) \
_mm_xor_si128( a, _mm_and_si128( b, c ) )
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#endif
@@ -283,64 +289,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm_movmask_32( v ) \
_mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) )
// Diagonal blend
// Blend 4 32 bit elements from 4 vectors
#if defined (__AVX2__)
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \
_mm_blend_epi32( s1, s0, 0x1 ), 0x3 )
#elif defined(__SSE4_1__)
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \
_mm_blend_epi16( s1, s0, 0x03 ), 0x0f )
#endif
/*
//
// Extended bit shift for concatenated packed elements from 2 vectors.
// Shift right returns low half, shift left return high half.
#if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
#define mm128_shl2_64( v1, v2, c ) _mm_shldi_epi64( v1, v2, c )
#define mm128_shr2_64( v1, v2, c ) _mm_shrdi_epi64( v1, v2, c )
#define mm128_shl2_32( v1, v2, c ) _mm_shldi_epi32( v1, v2, c )
#define mm128_shr2_32( v1, v2, c ) _mm_shrdi_epi32( v1, v2, c )
#define mm128_shl2_16( v1, v2, c ) _mm_shldi_epi16( v1, v2, c )
#define mm128_shr2_16( v1, v2, c ) _mm_shrdi_epi16( v1, v2, c )
#else
#define mm128_shl2_64( v1, v2, c ) \
_mm_or_si128( _mm_slli_epi64( v1, c ), _mm_srli_epi64( v2, 64 - (c) ) )
#define mm128_shr2_64( v1, v2, c ) \
_mm_or_si128( _mm_srli_epi64( v2, c ), _mm_slli_epi64( v1, 64 - (c) ) )
#define mm128_shl2_32( v1, v2, c ) \
_mm_or_si128( _mm_slli_epi32( v1, c ), _mm_srli_epi32( v2, 32 - (c) ) )
#define mm128_shr2_32( v1, v2, c ) \
_mm_or_si128( _mm_srli_epi32( v2, c ), _mm_slli_epi32( v1, 32 - (c) ) )
#define mm128_shl2_16( v1, v2, c ) \
_mm_or_si128( _mm_slli_epi16( v1, c ), _mm_srli_epi16( v2, 16 - (c) ) )
#define mm128_shr2_16( v1, v2, c ) \
_mm_or_si128( _mm_srli_epi16( v2, c ), _mm_slli_epi16( v1, 16 - (c) ) )
#endif
*/
//
// Bit rotations
@@ -439,7 +387,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
// half is always taken from v1, and the high half from v2.
#define mm128_shuffle2_64( v1, v2, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
_mm_castsi128_pd( v2 ), c ) );
@@ -600,9 +548,6 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#endif // SSSE3 else SSE2
//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.
// Swap 128 bit vectors.
// This should be avoided, it's more efficient to switch references.
#define mm128_swap256_128( v1, v2 ) \
@@ -611,61 +556,23 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v1 = _mm_xor_si128( v1, v2 );
// Two input shuffle-rotate.
// Concatenate v1 & v2 and byte rotate as a 256 bit vector.
// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, ie updated v1.
// alignr for 32 & 64 bit elements is only available with AVX512 but
// emulated here. Shift argument is not needed, it's always 1.
// Behaviour is otherwise consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
/*
#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 )
#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 )
#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 )
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 1 )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 1 )
*/
#define mm128_alignr_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
#define mm128_alignr_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#else
#define mm128_shufl2r_64( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) )
#define mm128_alignr_64( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) )
#define mm128_shufl2l_64( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) )
/*
#define mm128_shufl2r_32( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 4 ), \
_mm_slli_si128( v2, 12 ) )
#define mm128_alignr_32( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 4 ) )
#define mm128_shufl2l_32( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 12 ) )
#define mm128_shufl2r_16( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 2 ), \
_mm_slli_si128( v2, 14 ) )
#define mm128_shufl2l_16( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) )
#define mm128_shufl2r_8( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 1 ), \
_mm_slli_si128( v2, 15 ) )
#define mm128_shufl2l_8( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 1 ), \
_mm_srli_si128( v2, 15 ) )
*/
#endif
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
@@ -689,50 +596,6 @@ do { \
v1 = t; \
} while(0)
/*
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
v1 = t; \
} while(0)
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
v1 = t; \
} while(0)
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
v1 = t; \
} while(0)
*/
#else // SSE2
#define mm128_vror256_64( v1, v2 ) \
@@ -752,61 +615,7 @@ do { \
_mm_srli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
/*
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
_mm_slli_si128( v2, 12 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 4 ), \
_mm_slli_si128( v1, 12 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 12 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 4 ), \
_mm_srli_si128( v1, 12 ) ); \
v1 = t; \
} while(0)
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
_mm_slli_si128( v2, 14 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 2 ), \
_mm_slli_si128( v1, 14 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 2 ), \
_mm_srli_si128( v1, 14 ) ); \
v1 = t; \
} while(0)
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
_mm_slli_si128( v2, 15 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 1 ), \
_mm_slli_si128( v1, 15 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
_mm_srli_si128( v2, 15 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 1 ), \
_mm_srli_si128( v1, 15 ) ); \
v1 = t; \
} while(0)
*/
#endif // SSE4.1 else SSE2
#endif // __SSE2__

View File

@@ -15,14 +15,13 @@
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary.
// Some usage may have the index vector encoded as if full vector
// shuffles are supported. This has no side effects and would have the same
// results using either version.
// If the need arises and AVX512VL is available, 256 bit full vector shuffles
// can be implemented using the AVX512 zero-mask feature with a NULL mask.
// Using intrinsics it's simple: _mm256_maskz_shuffle_epi8( 0, v, c )
// With asm it's a bit more complicated with the addition of the mask register
// and zero tag: vpshufb ymm0{k0}{z}, ymm1, ymm2
// Instructions that can move data across a 128 bit lane boundary incur a
// performance penalty over those that can't.
// Some usage of index vectors may be encoded as if full vector shuffles are
// supported. This has no side effects and would have the same results using
// either version.
// If the need arises and AVX512VL is available, 256 bit full vector byte
// shuffles can be implemented using the AVX512 mask feature with a NULL mask.
#if defined(__AVX__)
@@ -66,10 +65,6 @@ typedef union
#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) )
#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )
// deprecated
//#define mm256_mov256_64 u64_mov256_64
//#define mm256_mov256_32 u32_mov256_32
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
@@ -141,7 +136,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Basic operations without SIMD equivalent
// Bitwise not ( ~v )
#if defined(__AVX512VL__)
static inline __m256i mm256_not( const __m256i v )
@@ -153,10 +147,12 @@ static inline __m256i mm256_not( const __m256i v )
#endif
/*
// Unary negation of each element ( -v )
#define mm256_negate_64( v ) _mm256_sub_epi64( m256_zero, v )
#define mm256_negate_32( v ) _mm256_sub_epi32( m256_zero, v )
#define mm256_negate_16( v ) _mm256_sub_epi16( m256_zero, v )
*/
// Add 4 values, fewer dependencies than sequential addition.
@@ -178,44 +174,34 @@ static inline __m256i mm256_not( const __m256i v )
// AVX512 has ternary logic that supports any 3 input boolean expression.
// a ^ b ^ c
#define mm256_xor3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x96 )
#define mm256_xor3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x96 )
// legacy convenience only
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( a, mm256_xor3( b, c, d ) )
#define mm256_xor4( a, b, c, d ) _mm256_xor_si256( a, mm256_xor3( b, c, d ) )
// a & b & c
#define mm256_and3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x80 )
#define mm256_and3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm256_or3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xfe )
#define mm256_or3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm256_xorand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x78 )
#define mm256_xorand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm256_andxor( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x60 )
#define mm256_andxor( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm256_xoror( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x1e )
#define mm256_xoror( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c )
#define mm256_xorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xd2 )
#define mm256_xorandnot( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm256_orand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xf8 )
#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) \
_mm256_ternarylogic_epi64( a, b, b, 0x81 )
#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#else
@@ -262,76 +248,6 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_movmask_32( v ) \
_mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) )
// Diagonal blending
// Blend 4 64 bit elements from 4 vectors
#define mm256_diagonal_64( v3, v2, v1, v0 ) \
mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \
_mm256_blend_epi32( v1, v0, 0x03 ), 0x0f )
// Blend 8 32 bit elements from 8 vectors
#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \
_mm256_blend_epi32( \
_mm256_blend_epi32( \
_mm256_blend_epi32( v7, v6, 0x40 ), \
_mm256_blend_epi32( v5, v4, 0x10 ) 0x30 ), \
_mm256_blend_epi32( \
_mm256_blend_epi32( v3, v2, 0x04) \
_mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f )
// Blend 4 32 bit elements from each 128 bit lane.
#define mm256_diagonal128_32( v3, v2, v1, v0 ) \
_mm256_blend_epi32( \
_mm256_blend_epi32( v3, v2, 0x44) \
_mm256_blend_epi32( v1, v0, 0x11 ) )
/*
//
// Extended bit shift for concatenated packed elements from 2 vectors.
// Shift right returns low half, shift left return high half.
#if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
#define mm256_shl2_64( v1, v2, c ) _mm256_shldi_epi64( v1, v2, c )
#define mm256_shr2_64( v1, v2, c ) _mm256_shrdi_epi64( v1, v2, c )
#define mm256_shl2_32( v1, v2, c ) _mm256_shldi_epi32( v1, v2, c )
#define mm256_shr2_32( v1, v2, c ) _mm256_shrdi_epi32( v1, v2, c )
#define mm256_shl2_16( v1, v2, c ) _mm256_shldi_epi16( v1, v2, c )
#define mm256_shr2_16( v1, v2, c ) _mm256_shrdi_epi16( v1, v2, c )
#else
#define mm256_shl2i_64( v1, v2, c ) \
_mm256_or_si256( _mm256_slli_epi64( v1, c ), \
_mm256_srli_epi64( v2, 64 - (c) ) )
#define mm512_shr2_64( v1, v2, c ) \
_mm256_or_si256( _mm256_srli_epi64( v2, c ), \
_mm256_slli_epi64( v1, 64 - (c) ) )
#define mm256_shl2_32( v1, v2, c ) \
_mm256_or_si256( _mm256_slli_epi32( v1, c ), \
_mm256_srli_epi32( v2, 32 - (c) ) )
#define mm256_shr2_32( v1, v2, c ) \
_mm256_or_si256( _mm256_srli_epi32( v2, c ), \
_mm256_slli_epi32( v1, 32 - (c) ) )
#define mm256_shl2_16( v1, v2, c ) \
_mm256_or_si256( _mm256_slli_epi16( v1, c ), \
_mm256_srli_epi16( v2, 16 - (c) ) )
#define mm256_shr2_16( v1, v2, c ) \
_mm256_or_si256( _mm256_srli_epi16( v2, c ), \
_mm256_slli_epi16( v1, 16 - (c) ) )
#endif
*/
//
// Bit rotations.
//
@@ -450,6 +366,16 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element.
#if defined(__AVX512VL__)
static inline __m256i mm256_shuflr_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); }
static inline __m256i mm256_shufll_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 15 ); }
#else
#define mm256_shuflr_32( v ) \
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000000000007, 0x0000000600000005, \
@@ -460,6 +386,8 @@ static inline __m256i mm256_not( const __m256i v )
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ) )
#endif
//
// Rotate elements within each 128 bit lane of 256 bit vector.

View File

@@ -37,13 +37,21 @@
// version of this specific instruction does not.
//
// New alignr instructions for epi64 and epi32 operate across the entire
// vector. "_mm512_alignr_epi8" continues to be restricted to 128 bit lanes.
// vector but slower than epi8 which continues to be restricted to 128 bit
// lanes.
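// For example (illustrative only), rotating a 512 bit vector right by one
// 64 bit element can use the full-width form,
//
//    __m512i r = _mm512_alignr_epi64( v, v, 1 );
//
// whereas _mm512_alignr_epi8( v, v, 8 ) would rotate each 128 bit lane
// independently rather than the whole vector.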
//
// "_mm512_permutexvar_epi8" and "_mm512_permutex2var_epi8" require
// AVX512-VBMI. The same instructions with larger elements don't have this
// requirement. "_mm512_permutexvar_epi8" also performs the same operation
// as "_mm512_shuffle_epi8" which only requires AVX512-BW.
//
// Two coding conventions are used to prevent macro argument side effects:
// - if a macro arg is used in an expression it must be protected by
// parentheses to ensure an expression argument is evaluated first.
// - if an argument is to be referenced multiple times, a C inline function
// should be used instead of a macro to prevent an expression argument
// from being evaluated multiple times.
//
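// A minimal illustration (SQR and sqr32 are hypothetical, not part of this
// header):
//
//    #define SQR( x ) ( (x) * (x) )              // (x) guards "a + b" etc.
//    static inline uint32_t sqr32( uint32_t x )  // safe even for sqr32( i++ )
//    { return x * x; }
//
// SQR( i++ ) would still evaluate i++ twice, which is why multi-use arguments
// get an inline function.
//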
// There are 2 areas where overhead is a major concern: constants and
// permutations.
//
@@ -177,22 +185,30 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
#define m512_one_16 m512_const1_16( 1 )
#define m512_one_8 m512_const1_8( 1 )
//#define m512_neg1 m512_const1_64( 0xffffffffffffffff )
#define m512_neg1 _mm512_movm_epi64( 0xff )
// use asm to avoid compiler warning for uninitialized local
static inline __m512i mm512_neg1_fn()
{
__m512i a;
asm( "vpternlogq $0xff, %0, %0, %0\n\t" : "=x"(a) );
return a;
}
#define m512_neg1 mm512_neg1_fn() // 1 clock
//#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) // 5 clocks
//#define m512_neg1 _mm512_movm_epi64( 0xff ) // 2 clocks
//
// Basic operations without SIMD equivalent
// Bitwise NOT: ~x
// #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
static inline __m512i mm512_not( const __m512i x )
{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
/*
// Unary negation: -x
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
*/
//
// Pointer casting
@@ -246,118 +262,43 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// expression using any number or combinations of AND, OR, XOR, NOT.
// a ^ b ^ c
#define mm512_xor3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x96 )
#define mm512_xor3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x96 )
// legacy convenience only
#define mm512_xor4( a, b, c, d ) \
_mm512_xor_si512( a, mm512_xor3( b, c, d ) )
#define mm512_xor4( a, b, c, d ) _mm512_xor_si512( a, mm512_xor3( b, c, d ) )
// a & b & c
#define mm512_and3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x80 )
#define mm512_and3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm512_or3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xfe )
#define mm512_or3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm512_xorand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x78 )
#define mm512_xorand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm512_andxor( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x60 )
#define mm512_andxor( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm512_xoror( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x1e )
#define mm512_xoror( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c ), xor( a, andnot( b, c ) )
#define mm512_xorandnot( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )
#define mm512_xorandnot( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm512_orand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xf8 )
#define mm512_orand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xf8 )
// Some 2 input operations that don't have their own instruction mnemonic.
// Use with caution, args are not expression safe.
// ~( a | b ), (~a) & (~b)
#define mm512_nor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x01 )
#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), (~a) ^ b
#define mm512_xnor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x81 )
#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
// ~( a & b )
#define mm512_nand( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0xef )
// Diagonal blending
// Blend 8 64 bit elements from 8 vectors
#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
_mm512_mask_blend_epi64( 0x0f, \
_mm512_mask_blend_epi64( 0x30, \
_mm512_mask_blend_epi64( 0x40, v7, v6 ), \
_mm512_mask_blend_epi64( 0x40, v5, v4 ) ), \
_mm512_mask_blend_epi64( 0x03, \
_mm512_mask_blend_epi64( 0x04, v3, v2 ) \
_mm512_mask_blend_epi64( 0x01, v1, v0 ) ) )
// Blend 4 32 bit elements from each 128 bit lane.
#define mm512_diagonal128_32( v3, v2, v1, v0 ) \
_mm512_mask_blend_epi32( 0x3333, \
_mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
_mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
/*
//
// Extended bit shift of concatenated packed elements from 2 vectors.
// Shift right returns low half, shift left returns high half.
#if defined(__AVX512VBMI2__)
#define mm512_shl2_64( v1, v2, c ) _mm512_shldi_epi64( v1, v2, c )
#define mm512_shr2_64( v1, v2, c ) _mm512_shrdi_epi64( v1, v2, c )
#define mm512_shl2_32( v1, v2, c ) _mm512_shldi_epi32( v1, v2, c )
#define mm512_shr2_32( v1, v2, c ) _mm512_shrdi_epi32( v1, v2, c )
#define mm512_shl2_16( v1, v2, c ) _mm512_shldi_epi16( v1, v2, c )
#define mm512_shr2_16( v1, v2, c ) _mm512_shrdi_epi16( v1, v2, c )
#else
#define mm512_shl2_64( v1, v2, c ) \
_mm512_or_si512( _mm512_slli_epi64( v1, c ), \
_mm512_srli_epi64( v2, 64 - (c) ) )
#define mm512_shr2_64( v1, v2, c ) \
_mm512_or_si512( _mm512_srli_epi64( v2, c ), \
_mm512_slli_epi64( v1, 64 - (c) ) )
#define mm512_shl2_32( v1, v2, c ) \
_mm512_or_si512( _mm512_slli_epi32( v1, c ), \
_mm512_srli_epi32( v2, 32 - (c) ) )
#define mm512_shr2_32( v1, v2, c ) \
_mm512_or_si512( _mm512_srli_epi32( v2, c ), \
_mm512_slli_epi32( v1, 32 - (c) ) )
#define mm512_shl2_16( v1, v2, c ) \
_mm512_or_si512( _mm512_slli_epi16( v1, c ), \
_mm512_srli_epi16( v2, 16 - (c) ) )
#define mm512_shr2_16( v1, v2, c ) \
_mm512_or_si512( _mm512_srli_epi16( v2, c ), \
_mm512_slli_epi16( v1, 16 - (c) ) )
#endif
*/
#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x3f )
// Bit rotations.
@@ -434,30 +375,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
} while(0)
// Cross-lane shuffles implementing rotate & shift of packed elements.
//
#define mm512_shiftr_256( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 4 )
#define mm512_shiftl_256( v ) mm512_shifr_256
#define mm512_shiftr_128( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 2 )
#define mm512_shiftl_128( v ) \
_mm512_alignr_epi64( v, _mm512_setzero, 6 )
#define mm512_shiftr_64( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 1 )
#define mm512_shiftl_64( v ) \
_mm512_alignr_epi64( v, _mm512_setzero, 7 )
#define mm512_shiftr_32( v ) \
_mm512_alignr_epi32( _mm512_setzero, v, 1 )
#define mm512_shiftl_32( v ) \
_mm512_alignr_epi32( v, _mm512_setzero, 15 )
// Shuffle-rotate elements left or right in 512 bit vector.
// Cross-lane shuffles implementing rotation of packed elements.
//
// Rotate elements across entire vector.
static inline __m512i mm512_swap_256( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 4 ); }
#define mm512_shuflr_256( v ) mm512_swap_256
@@ -516,9 +437,8 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
0x1E1D1C1B1A191817, 0x161514131211100F, \
0x0E0D0C0B0A090807, 0x060504030201003F ) )
//
// 256 bit lanes used only by lyra2, move these there
// Rotate elements within 256 bit lanes of 512 bit vector.
// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
@@ -529,6 +449,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
/*
// Rotate 256 bit lanes by one 32 bit element
#define mm512_shuflr256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
@@ -571,7 +492,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
0x2e2d2c2b2a292827, 0x262524232221203f, \
0x1e1d1c1b1a191817, 0x161514131211100f, \
0x0e0d0c0b0a090807, 0x060504030201001f ) )
*/
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
@@ -623,22 +544,5 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
/*
// 2 input, 1 output
// Concatenate { v1, v2 } then rotate right or left and return the high
// 512 bits, ie rotated v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 )
#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 )
#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 )
#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 )
#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 )
#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 )
*/
#endif // AVX512
#endif // SIMD_512_H__

View File

@@ -34,10 +34,12 @@
//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
#define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) ) )
/*
// Unary negate elements
#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v )
#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v )
#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v )
*/
// Rotate bits in packed elements of 64 bit vector
#define mm64_rol_64( a, n ) \