This commit is contained in:
Jay D Dee
2022-12-21 13:09:14 -05:00
parent bd84f199fe
commit da7030faa8
24 changed files with 997 additions and 642 deletions

View File

@@ -1,4 +1,6 @@
These instructions may be out of date, see the Wiki for the latest...
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
1. Requirements:
---------------

View File

@@ -1,6 +1,6 @@
Instructions for compiling cpuminer-opt for Windows.
Thwaw intructions nay be out of date. Please consult the wiki for
These instructions are out of date. Please consult the wiki for
the latest:
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source

View File

@@ -74,53 +74,50 @@ Supported Algorithms
argon2d250 argon2d-crds, Credits (CRDS)
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
axiom Shabal-256 MemoHash
blake Blake-256 (SFR)
blake2b Blake2b 256
blake2s Blake-2 S
blake Blake-256
blake2b Blake2-512
blake2s Blake2-256
blakecoin blake256r8
bmw BMW 256
bmw512 BMW 512
c11 Chaincoin
c11
decred
deep Deepcoin (DCN)
dmd-gr Diamond-Groestl
groestl Groestl coin
hex x16r-hex
hmq1725 Espers
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
lbry LBC, LBRY Credits
luffa Luffa
lyra2h Hppcoin
lyra2h
lyra2re lyra2
lyra2rev2 lyra2v2
lyra2rev3 lyrav2v3
lyra2z
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
m7m Magi (XMG)
minotaur Ringcoin (RNG)
lyra2z330
m7m
minotaur
minotaurx
myr-gr Myriad-Groestl
neoscrypt NeoScrypt(128, 2, 1)
nist5 Nist5
pentablake Pentablake
phi1612 phi
phi2 Luxcoin (LUX)
phi2-lux identical to phi2
pluck Pluck:128 (Supcoin)
phi2
polytimos Ninja
power2b MicroBitcoin (MBC)
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256q Quad SHA-256, Pyrite (PYE)
sha256t Triple SHA-256, Onecoin (OC)
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -136,17 +133,17 @@ Supported Algorithms
x11 Dash
x11evo Revolvercoin
x11gost sib (SibCoin)
x12 Galaxie Cash (GCH)
x13 X13
x12
x13
x13bcd bcd
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x14
x15
x16r
x16rv2
x16rt Gincoin (GIN)
x16rt-veil Veil (VEIL)
x16s Pigeoncoin (PGN)
x16rt
x16rt-veil veil
x16s
x17
x21s
x22i

View File

@@ -73,7 +73,6 @@ third party packages. They often will work and may be used instead of the
included version of the files.
If you like this software feel free to donate:
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT

View File

@@ -65,6 +65,12 @@ If not what makes it happen or not happen?
Change Log
----------
v3.21.0
Added minotaurx algo for stratum only.
Blake256 & sha256 prehash optimised to ignore zero-padded data for AVX2 & AVX512.
Other small improvements.
v3.20.3
Faster c11 algo: AVX512 6%, AVX2 4%, AVX2+VAES 15%.
@@ -98,12 +104,9 @@ v3.19.8
#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
url protocol specifier for requesting a secure stratum connection.
The full url, including the protocol, is now displayed in the stratum connect
log and the periodic summary log.
Small optimizations to Cubehash, AVX2 & AVX512.
Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
v3.19.7

View File

@@ -327,6 +327,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_LYRA2Z330: rc = register_lyra2z330_algo ( gate ); break;
case ALGO_M7M: rc = register_m7m_algo ( gate ); break;
case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break;
case ALGO_MINOTAURX: rc = register_minotaur_algo ( gate ); break;
case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break;
case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break;
case ALGO_NIST5: rc = register_nist5_algo ( gate ); break;

View File

@@ -115,7 +115,7 @@ void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
@@ -178,7 +178,7 @@ void blake256_16way_close(void *cc, void *dst);
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );

View File

@@ -668,6 +668,258 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
}
// Shortcut message expansion when the message data is known to be zero.
// M[ 5:12, 14 ] are zero padding for the second block of 80-byte data.
#define G256_8WAY_ALT( a, b, c, d, m0, m1 ) \
{ \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m0 ); \
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m1 ); \
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
}
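A scalar reference sketch of what the _ALT shortcut exploits (illustrative only, not part of this commit; rotr32 and g256_alt are hypothetical names). BLAKE-256's quarter round adds (message ^ constant) operands selected by the per-round sigma permutation, so whenever the selected message word is known to be zero the operand reduces to the bare constant, which is exactly the pre-combined form G256_8WAY_ALT receives:
#include <stdint.h>
static inline uint32_t rotr32( uint32_t x, int r )
{  return ( x >> r ) | ( x << ( 32 - r ) );  }
// m0 = m[sigma(2i)] ^ CS[sigma(2i+1)], m1 = m[sigma(2i+1)] ^ CS[sigma(2i)];
// for a message word known to be zero the caller passes just the constant.
static inline void g256_alt( uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                             uint32_t m0, uint32_t m1 )
{
   *a += *b + m0;   *d = rotr32( *d ^ *a, 16 );
   *c += *d;        *b = rotr32( *b ^ *c, 12 );
   *a += *b + m1;   *d = rotr32( *d ^ *a,  8 );
   *c += *d;        *b = rotr32( *b ^ *c,  7 );
}
The ROUND256_8WAY_r macros below (and their 16-way twins later in this file) hard-code round r's sigma permutation: only M0:M4, MD and MF can be non-zero, so every other operand collapses to a plain _mm256_set1_epi32 constant.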
// Message expansion optimized for each round.
#define ROUND256_8WAY_0 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS1 ) ), \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS0 ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS3 ) ), \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS2 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS5 ) ), \
_mm256_set1_epi32( CS4 ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CS7 ) , \
_mm256_set1_epi32( CS6 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS9 ) , \
_mm256_set1_epi32( CS8 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSB ) , \
_mm256_set1_epi32( CSA ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CSD ) , \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CSF ) , \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSE ) ) ); \
}
#define ROUND256_8WAY_1 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CSA ) , \
_mm256_set1_epi32( CSE ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS8 ) ), \
_mm256_set1_epi32( CS4 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CSF ) , \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS9 ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS6 ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CS1 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS2 ) ), \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS0 ) ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS7 ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS5 ) ) ); \
}
#define ROUND256_8WAY_2 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS8 ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS0 ) , \
_mm256_xor_si256( M0, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS5 ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSD ) ), \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSF ) ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CSA ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS6 ) ), \
_mm256_set1_epi32( CS3 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS1 ) , \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS7 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS4 ) , \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS9 ) ) ); \
}
#define ROUND256_8WAY_3 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS9 ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS1 ) ), \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS3 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS6 ) ), \
_mm256_set1_epi32( CS2 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSA ) , \
_mm256_set1_epi32( CS5 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS0 ) ), \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS4 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS8 ) ), \
_mm256_set1_epi32( CSF ) ); \
}
#define ROUND256_8WAY_4 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS0 ) , \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS9 ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS7 ) , \
_mm256_set1_epi32( CS5 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS4 ) ), \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS2 ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CSF ) , \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSA ) ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS1 ) , \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSE ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSC ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS8 ) , \
_mm256_set1_epi32( CS6 ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CSD ) ), \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS3 ) ) ); \
}
#define ROUND256_8WAY_5 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CS2 ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CSA ) , \
_mm256_set1_epi32( CS6 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CSB ) ), \
_mm256_set1_epi32( CS0 ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS8 ) ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CSD ) ), \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS4 ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CS5 ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSE ) ), \
_mm256_set1_epi32( CSF ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS9 ) ), \
_mm256_set1_epi32( CS1 ) ); \
}
#define ROUND256_8WAY_6 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS5 ) , \
_mm256_set1_epi32( CSC ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSF ) ), \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS1 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CSD ) , \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSE ) ) );\
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CSA ) ), \
_mm256_set1_epi32( CS4 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS7 ) ), \
_mm256_set1_epi32( CS0 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS6 ) ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS9 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CSB ) , \
_mm256_set1_epi32( CS8 ) ); \
}
#define ROUND256_8WAY_7 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSB ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS1 ) , \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS9 ) ), \
_mm256_set1_epi32( CS3 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS0 ) , \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS5 ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS4 ) ), \
_mm256_xor_si256( M4, _mm256_set1_epi32( CSF ) ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS6 ) , \
_mm256_set1_epi32( CS8 ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSA ) ), \
_mm256_set1_epi32( CS2 ) ); \
}
#define ROUND256_8WAY_8 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CSF ), \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS6 ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS9 ) , \
_mm256_set1_epi32( CSE ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CSB ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS8 ) ), \
_mm256_set1_epi32( CS0 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS7 ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS4 ) ), \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS1 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS5 ) , \
_mm256_set1_epi32( CSA ) ); \
}
#define ROUND256_8WAY_9 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSA ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS4 ) , \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS8 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS6 ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS5 ) ), \
_mm256_set1_epi32( CS1 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSB ) ), \
_mm256_set1_epi32( CSF ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CS9 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CS3 ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS0 ) ), \
_mm256_xor_si256( M0, _mm256_set1_epi32( CSD ) ) ); \
}
#define DECL_STATE32_8WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
sph_u32 T0, T1;
@@ -834,9 +1086,9 @@ do { \
}
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
void *data )
{
const __m256i *M = (const __m256i*)data;
__m256i *M = (__m256i*)data;
__m256i *V = (__m256i*)midstate;
const __m256i *H = (const __m256i*)midhash;
@@ -857,6 +1109,17 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
V[14] = m256_const1_32( CS6 );
V[15] = m256_const1_32( CS7 );
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
// M[ 5:12, 14 ] are always zero and not needed or used, except M[ 5] as noted.
// M[ 4], M[ 13], M[15] are constant and are initialized here.
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
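// For an 80-byte input the BLAKE-256 padding fills this block as follows:
// M[ 4] = 0x80000000 is the leading padding bit, M[13] = 1 is the closing
// marker bit, and M[15] = 80*8 is the low word of the 64-bit bit length
// (M[14], the high word, stays zero).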
M[ 4] = m256_const1_32( 0x80000000 );
M[13] = m256_one_32;
M[15] = m256_const1_32( 80*8 );
M[ 5] =_mm256_xor_si256( M[13], _mm256_set1_epi32( CSC ) );
// G0
GS_8WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
@@ -868,21 +1131,45 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G2
// GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
_mm256_xor_si256( _mm256_set1_epi32( CS5 ), M[ 4] ) );
V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 2] ) );
V[10] = _mm256_add_epi32( V[10], V[14] );
V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 12 );
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
_mm256_set1_epi32( CS4 ) );
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 2] ), 8 );
V[10] = _mm256_add_epi32( V[10], V[14] );
V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 7 );
// G3
// GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
_mm256_set1_epi32( CS7 ) );
V[15] = mm256_swap32_16( _mm256_xor_si256( V[15], V[ 3] ) );
V[11] = _mm256_add_epi32( V[11], V[15] );
V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 12 );
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
_mm256_set1_epi32( CS6 ) );
V[15] = mm256_ror_32( _mm256_xor_si256( V[15], V[ 3] ), 8 );
V[11] = _mm256_add_epi32( V[11], V[15] );
V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 7 );
// G4
V[ 0] = _mm256_add_epi32( V[ 0],
_mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) );
V[ 0] = _mm256_add_epi32( V[ 0], _mm256_set1_epi32( CS9 ) );
// G5
// GS_8WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
// G6
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 7] ),
_mm256_xor_si256( _mm256_set1_epi32( CSD ), M[12] ) );
_mm256_set1_epi32( CSD ) );
// G7
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
_mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
_mm256_set1_epi32( CSF ) );
V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) );
V[ 3] = _mm256_add_epi32( V[ 3],
_mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
@@ -893,47 +1180,40 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
{
__m256i *H = (__m256i*)final_hash;
const __m256i *h = (const __m256i*)midhash;
const __m256i *v= (const __m256i*)midstate;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
__m256i M0, M1, M2, M3, M4, MD, MF;
__m256i MDxorCSC;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
V0 = _mm256_load_si256( (__m256i*)midstate + 0 );
V1 = _mm256_load_si256( (__m256i*)midstate + 1 );
V2 = _mm256_load_si256( (__m256i*)midstate + 2 );
V3 = _mm256_load_si256( (__m256i*)midstate + 3 );
V4 = _mm256_load_si256( (__m256i*)midstate + 4 );
V5 = _mm256_load_si256( (__m256i*)midstate + 5 );
V6 = _mm256_load_si256( (__m256i*)midstate + 6 );
V7 = _mm256_load_si256( (__m256i*)midstate + 7 );
V8 = _mm256_load_si256( (__m256i*)midstate + 8 );
V9 = _mm256_load_si256( (__m256i*)midstate + 9 );
VA = _mm256_load_si256( (__m256i*)midstate + 10 );
VB = _mm256_load_si256( (__m256i*)midstate + 11 );
VC = _mm256_load_si256( (__m256i*)midstate + 12 );
VD = _mm256_load_si256( (__m256i*)midstate + 13 );
VE = _mm256_load_si256( (__m256i*)midstate + 14 );
VF = _mm256_load_si256( (__m256i*)midstate + 15 );
M0 = casti_m256i( data, 0 );
M1 = casti_m256i( data, 1 );
M2 = casti_m256i( data, 2 );
M3 = casti_m256i( data, 3 );
M4 = casti_m256i( data, 4 );
M5 = casti_m256i( data, 5 );
M6 = casti_m256i( data, 6 );
M7 = casti_m256i( data, 7 );
M8 = casti_m256i( data, 8 );
M9 = casti_m256i( data, 9 );
MA = casti_m256i( data, 10 );
MB = casti_m256i( data, 11 );
MC = casti_m256i( data, 12 );
MD = casti_m256i( data, 13 );
ME = casti_m256i( data, 14 );
MF = casti_m256i( data, 15 );
M0 = _mm256_load_si256( (__m256i*)data + 0 );
M1 = _mm256_load_si256( (__m256i*)data + 1 );
M2 = _mm256_load_si256( (__m256i*)data + 2 );
M3 = _mm256_load_si256( (__m256i*)data + 3 );
M4 = _mm256_load_si256( (__m256i*)data + 4 );
// M5 to MC & ME are zero padding and optimised out.
MD = _mm256_load_si256( (__m256i*)data + 13 );
MF = _mm256_load_si256( (__m256i*)data + 15 );
// precalculated MD^CSC, used in round0 G6.
MDxorCSC = _mm256_load_si256( (__m256i*)data + 5 );
// Finish round 0
// Finish round 0 with nonce in M3
// G1
V1 = _mm256_add_epi32( V1,
_mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
@@ -947,20 +1227,29 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
_mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
_mm256_set1_epi32( CS8 ) ) );
VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
// G5
GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
_mm256_set1_epi32( CSB ) );
VC = mm256_swap32_16( _mm256_xor_si256( VC, V1 ) );
VB = _mm256_add_epi32( VB, VC );
V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 12 );
V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
_mm256_set1_epi32( CSA ) );
VC = mm256_ror_32( _mm256_xor_si256( VC, V1 ), 8 );
VB = _mm256_add_epi32( VB, VC );
V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 7 );
// G6
VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
_mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
V2 = _mm256_add_epi32( V2, _mm256_add_epi32( V7, MDxorCSC ) );
VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
@@ -974,19 +1263,19 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
ROUND_S_8WAY( 4 );
ROUND_S_8WAY( 5 );
ROUND_S_8WAY( 6 );
ROUND_S_8WAY( 7 );
ROUND_S_8WAY( 8 );
ROUND_S_8WAY( 9 );
ROUND_S_8WAY( 0 );
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
ROUND256_8WAY_1;
ROUND256_8WAY_2;
ROUND256_8WAY_3;
ROUND256_8WAY_4;
ROUND256_8WAY_5;
ROUND256_8WAY_6;
ROUND256_8WAY_7;
ROUND256_8WAY_8;
ROUND256_8WAY_9;
ROUND256_8WAY_0;
ROUND256_8WAY_1;
ROUND256_8WAY_2;
ROUND256_8WAY_3;
const __m256i shuf_bswap32 =
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
@@ -1010,6 +1299,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
//
// Blake-256 16 way AVX512
// Generic with full inline message expansion
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
@@ -1036,6 +1326,257 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
}
// Shortcut message expansion when the message data is known to be zero.
// M[ 5:12, 14 ] are zero padding for the second block of 80-byte data.
#define G256_16WAY_ALT( a, b, c, d, m0, m1 ) \
{ \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m0 ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi32( c, d ); \
b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m1 ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
c = _mm512_add_epi32( c, d ); \
b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
}
// Message expansion optimized for each round.
#define ROUND256_16WAY_0 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS1 ) ), \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS0 ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS3 ) ), \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS2 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS5 ) ), \
_mm512_set1_epi32( CS4 ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CS7 ) , \
_mm512_set1_epi32( CS6 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS9 ) , \
_mm512_set1_epi32( CS8 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSB ) , \
_mm512_set1_epi32( CSA ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CSD ) , \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CSF ) , \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSE ) ) ); \
}
#define ROUND256_16WAY_1 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CSA ) , \
_mm512_set1_epi32( CSE ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS8 ) ), \
_mm512_set1_epi32( CS4 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CSF ) , \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS9 ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS6 ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CS1 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS2 ) ), \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS0 ) ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS7 ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS5 ) ) ); \
}
#define ROUND256_16WAY_2 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS8 ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS0 ) , \
_mm512_xor_si512( M0, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS5 ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSD ) ), \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSF ) ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CSA ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS6 ) ), \
_mm512_set1_epi32( CS3 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS1 ) , \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS7 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS4 ) , \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS9 ) ) ); \
}
#define ROUND256_16WAY_3 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS9 ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS1 ) ), \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS3 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS6 ) ), \
_mm512_set1_epi32( CS2 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSA ) , \
_mm512_set1_epi32( CS5 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS0 ) ), \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS4 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS8 ) ), \
_mm512_set1_epi32( CSF ) ); \
}
#define ROUND256_16WAY_4 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS0 ) , \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS9 ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS7 ) , \
_mm512_set1_epi32( CS5 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS4 ) ), \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS2 ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CSF ) , \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSA ) ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS1 ) , \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSE ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSC ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS8 ) , \
_mm512_set1_epi32( CS6 ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CSD ) ), \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS3 ) ) ); \
}
#define ROUND256_16WAY_5 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CS2 ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CSA ) , \
_mm512_set1_epi32( CS6 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CSB ) ), \
_mm512_set1_epi32( CS0 ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS8 ) ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CSD ) ), \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS4 ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CS5 ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSE ) ), \
_mm512_set1_epi32( CSF ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS9 ) ), \
_mm512_set1_epi32( CS1 ) ); \
}
#define ROUND256_16WAY_6 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS5 ) , \
_mm512_set1_epi32( CSC ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSF ) ), \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS1 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CSD ) , \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSE ) ) );\
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CSA ) ), \
_mm512_set1_epi32( CS4 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS7 ) ), \
_mm512_set1_epi32( CS0 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS6 ) ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS9 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CSB ) , \
_mm512_set1_epi32( CS8 ) ); \
}
#define ROUND256_16WAY_7 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSB ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS1 ) , \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS9 ) ), \
_mm512_set1_epi32( CS3 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS0 ) , \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS5 ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS4 ) ), \
_mm512_xor_si512( M4, _mm512_set1_epi32( CSF ) ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS6 ) , \
_mm512_set1_epi32( CS8 ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSA ) ), \
_mm512_set1_epi32( CS2 ) ); \
}
#define ROUND256_16WAY_8 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CSF ), \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS6 ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS9 ) , \
_mm512_set1_epi32( CSE ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CSB ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS8 ) ), \
_mm512_set1_epi32( CS0 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS7 ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS4 ) ), \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS1 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS5 ) , \
_mm512_set1_epi32( CSA ) ); \
}
#define ROUND256_16WAY_9 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSA ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS4 ) , \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS8 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS6 ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS5 ) ), \
_mm512_set1_epi32( CS1 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSB ) ), \
_mm512_set1_epi32( CSF ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CS9 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CS3 ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS0 ) ), \
_mm512_xor_si512( M0, _mm512_set1_epi32( CSD ) ) ); \
}
#define DECL_STATE32_16WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
sph_u32 T0, T1;
@@ -1208,9 +1749,9 @@ do { \
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
void *data )
{
const __m512i *M = (const __m512i*)data;
__m512i *M = (__m512i*)data;
__m512i *V = (__m512i*)midstate;
const __m512i *H = (const __m512i*)midhash;
@@ -1231,10 +1772,21 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
V[14] = m512_const1_32( CS6 );
V[15] = m512_const1_32( CS7 );
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
// M[ 5:12, 14 ] are always zero and not needed or used, except M[5] as noted.
// M[ 4], M[ 13], M[15] are constant and are initialized here.
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
M[ 4] = m512_const1_32( 0x80000000 );
M[13] = m512_one_32;
M[15] = m512_const1_32( 80*8 );
M[ 5] =_mm512_xor_si512( M[13], _mm512_set1_epi32( CSC ) );
// G0
GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
// G1, nonce is in M[3]
// G1
// GS_16WAY( M[ 2], M[ 3], CS2, CS3, V1, V5, V9, VD );
V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ),
_mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) );
@@ -1243,14 +1795,35 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G2
// GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 6] ),
_mm512_xor_si512( _mm512_set1_epi32( CS5 ), M[ 4] ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 2] ), 16 );
V[10] = _mm512_add_epi32( V[10], V[14] );
V[ 6] = mm512_ror_32( _mm512_xor_si512( V[ 6], V[10] ), 12 );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 6] ),
_mm512_set1_epi32( CS4 ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 2] ), 8 );
V[10] = _mm512_add_epi32( V[10], V[14] );
V[ 6] = mm512_ror_32( _mm512_xor_si512( V[ 6], V[10] ), 7 );
// G3
// GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 7] ),
_mm512_set1_epi32( CS7 ) );
V[15] = mm512_ror_32( _mm512_xor_si512( V[15], V[ 3] ), 16 );
V[11] = _mm512_add_epi32( V[11], V[15] );
V[ 7] = mm512_ror_32( _mm512_xor_si512( V[ 7], V[11] ), 12 );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 7] ),
_mm512_set1_epi32( CS6 ) );
V[15] = mm512_ror_32( _mm512_xor_si512( V[15], V[ 3] ), 8 );
V[11] = _mm512_add_epi32( V[11], V[15] );
V[ 7] = mm512_ror_32( _mm512_xor_si512( V[ 7], V[11] ), 7 );
// G4
// GS_16WAY( M[ 8], M[ 9], CS8, CS9, V0, V5, VA, VF );
V[ 0] = _mm512_add_epi32( V[ 0],
_mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) );
V[ 0] = _mm512_add_epi32( V[ 0], _mm512_set1_epi32( CS9 ) );
// G5
// GS_16WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
@@ -1258,11 +1831,11 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
// G6
// GS_16WAY( M[12], M[13], CSC, CSD, V2, V7, V8, VD );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 7] ),
_mm512_xor_si512( _mm512_set1_epi32( CSD ), M[12] ) );
_mm512_set1_epi32( CSD ) );
// G7
// GS_16WAY( M[14], M[15], CSE, CSF, V3, V4, V9, VE );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 4] ),
_mm512_xor_si512( _mm512_set1_epi32( CSF ), M[14] ) );
_mm512_set1_epi32( CSF ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 3] ), 16 );
V[ 3] = _mm512_add_epi32( V[ 3],
_mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
@@ -1273,45 +1846,38 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
{
__m512i *H = (__m512i*)final_hash;
const __m512i *h = (const __m512i*)midhash;
const __m512i *v= (const __m512i*)midstate;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i M0, M1, M2, M3, M4, MD, MF;
__m512i MDxorCSC;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
V0 = _mm512_load_si512( (__m512i*)midstate + 0 );
V1 = _mm512_load_si512( (__m512i*)midstate + 1 );
V2 = _mm512_load_si512( (__m512i*)midstate + 2 );
V3 = _mm512_load_si512( (__m512i*)midstate + 3 );
V4 = _mm512_load_si512( (__m512i*)midstate + 4 );
V5 = _mm512_load_si512( (__m512i*)midstate + 5 );
V6 = _mm512_load_si512( (__m512i*)midstate + 6 );
V7 = _mm512_load_si512( (__m512i*)midstate + 7 );
V8 = _mm512_load_si512( (__m512i*)midstate + 8 );
V9 = _mm512_load_si512( (__m512i*)midstate + 9 );
VA = _mm512_load_si512( (__m512i*)midstate + 10 );
VB = _mm512_load_si512( (__m512i*)midstate + 11 );
VC = _mm512_load_si512( (__m512i*)midstate + 12 );
VD = _mm512_load_si512( (__m512i*)midstate + 13 );
VE = _mm512_load_si512( (__m512i*)midstate + 14 );
VF = _mm512_load_si512( (__m512i*)midstate + 15 );
M0 = casti_m512i( data, 0 );
M1 = casti_m512i( data, 1 );
M2 = casti_m512i( data, 2 );
M3 = casti_m512i( data, 3 );
M4 = casti_m512i( data, 4 );
M5 = casti_m512i( data, 5 );
M6 = casti_m512i( data, 6 );
M7 = casti_m512i( data, 7 );
M8 = casti_m512i( data, 8 );
M9 = casti_m512i( data, 9 );
MA = casti_m512i( data, 10 );
MB = casti_m512i( data, 11 );
MC = casti_m512i( data, 12 );
MD = casti_m512i( data, 13 );
ME = casti_m512i( data, 14 );
MF = casti_m512i( data, 15 );
M0 = _mm512_load_si512( (__m512i*)data + 0 );
M1 = _mm512_load_si512( (__m512i*)data + 1 );
M2 = _mm512_load_si512( (__m512i*)data + 2 );
M3 = _mm512_load_si512( (__m512i*)data + 3 );
M4 = _mm512_load_si512( (__m512i*)data + 4 );
// M5 to MC & ME are zero padding and optimised out
MD = _mm512_load_si512( (__m512i*)data + 13 );
MF = _mm512_load_si512( (__m512i*)data + 15 );
// cache for precalculated MD^CSC, used in round0 G6.
MDxorCSC = _mm512_load_si512( (__m512i*)data + 5 );
// Finish round 0 with the nonce (M3) now available
// G0
@@ -1336,21 +1902,30 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 12 );
V0 = _mm512_add_epi32( V0, _mm512_add_epi32( V5,
_mm512_xor_si512( _mm512_set1_epi32( CS8 ), M9 ) ) );
_mm512_set1_epi32( CS8 ) ) );
VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 8 );
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 );
// G5
GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
V1 = _mm512_add_epi32( _mm512_add_epi32( V1, V6 ),
_mm512_set1_epi32( CSB ) );
VC = mm512_ror_32( _mm512_xor_si512( VC, V1 ), 16 );
VB = _mm512_add_epi32( VB, VC );
V6 = mm512_ror_32( _mm512_xor_si512( V6, VB ), 12 );
V1 = _mm512_add_epi32( _mm512_add_epi32( V1, V6 ),
_mm512_set1_epi32( CSA ) );
VC = mm512_ror_32( _mm512_xor_si512( VC, V1 ), 8 );
VB = _mm512_add_epi32( VB, VC );
V6 = mm512_ror_32( _mm512_xor_si512( V6, VB ), 7 );
// G6
// GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 16 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 12 );
V2 = _mm512_add_epi32( _mm512_add_epi32( V2, V7 ),
_mm512_xor_si512( _mm512_set1_epi32( CSC ), MD ) );
V2 = _mm512_add_epi32( V2, _mm512_add_epi32( V7, MDxorCSC ) );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 8 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 7 );
@@ -1364,20 +1939,20 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
V9 = _mm512_add_epi32( V9, VE );
V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
ROUND_S_16WAY( 4 );
ROUND_S_16WAY( 5 );
ROUND_S_16WAY( 6 );
ROUND_S_16WAY( 7 );
ROUND_S_16WAY( 8 );
ROUND_S_16WAY( 9 );
ROUND_S_16WAY( 0 );
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
// Remaining rounds, optimised
ROUND256_16WAY_1;
ROUND256_16WAY_2;
ROUND256_16WAY_3;
ROUND256_16WAY_4;
ROUND256_16WAY_5;
ROUND256_16WAY_6;
ROUND256_16WAY_7;
ROUND256_16WAY_8;
ROUND256_16WAY_9;
ROUND256_16WAY_0;
ROUND256_16WAY_1;
ROUND256_16WAY_2;
ROUND256_16WAY_3;
// Byte swap final hash
const __m512i shuf_bswap32 =

View File

@@ -103,16 +103,16 @@
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_shufl2r_64( V[2], V[3] ); \
V3 = mm128_shufl2r_64( V[3], V[2] ); \
V6 = mm128_shufl2l_64( V[6], V[7] ); \
V7 = mm128_shufl2l_64( V[7], V[6] ); \
V2 = mm128_alignr_64( V[3], V[2] ); \
V3 = mm128_alignr_64( V[2], V[3] ); \
V6 = mm128_alignr_64( V[6], V[7] ); \
V7 = mm128_alignr_64( V[7], V[6] ); \
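/* rotate the V[2]/V[3] and V[6]/V[7] register pairs so the next two G calls mix the diagonals */ \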
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_shufl2l_64( V2, V3 ); \
V[3] = mm128_shufl2l_64( V3, V2 ); \
V[6] = mm128_shufl2r_64( V6, V7 ); \
V[7] = mm128_shufl2r_64( V7, V6 ); \
V[2] = mm128_alignr_64( V2, V3 ); \
V[3] = mm128_alignr_64( V3, V2 ); \
V[6] = mm128_alignr_64( V7, V6 ); \
V[7] = mm128_alignr_64( V6, V7 ); \
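/* rotate the pairs back to column order for the next round */ \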
}
#else

View File

@@ -49,12 +49,11 @@ extern "C"{
#define Sb_8W(x0, x1, x2, x3, c) \
do { \
__m512i cc = _mm512_set1_epi64( c ); \
x3 = mm512_not( x3 ); \
const __m512i cc = _mm512_set1_epi64( c ); \
x0 = mm512_xorandnot( x0, x2, cc ); \
tmp = mm512_xorand( cc, x0, x1 ); \
x0 = mm512_xorand( x0, x2, x3 ); \
x3 = mm512_xorandnot( x3, x1, x2 ); \
x0 = mm512_xorandnot( x0, x3, x2 ); \
x3 = _mm512_ternarylogic_epi64( x3, x1, x2, 0x2d ); /* ~x3 ^ (~x1 & x2) */\
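/* 0x2d is the truth table of ~a ^ (~b & c), indexed by (a<<2)|(b<<1)|c, for (a,b,c) = (x3,x1,x2) */ \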
x1 = mm512_xorand( x1, x0, x2 ); \
x2 = mm512_xorandnot( x2, x3, x0 ); \
x0 = mm512_xoror( x0, x1, x3 ); \
@@ -79,7 +78,7 @@ do { \
#define Sb(x0, x1, x2, x3, c) \
do { \
__m256i cc = _mm256_set1_epi64x( c ); \
const __m256i cc = _mm256_set1_epi64x( c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \

View File

@@ -23,13 +23,26 @@
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#if defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#else
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
@@ -60,13 +73,13 @@
t = _mm_load_si128(&a0);\
a0 = _mm_or_si128(a0,a1);\
a2 = _mm_xor_si128(a2,a3);\
a1 = _mm_andnot_si128(a1,ALLONE);\
a1 = mm128_not( a1 );\
a0 = _mm_xor_si128(a0,a3);\
a3 = _mm_and_si128(a3,t);\
a1 = _mm_xor_si128(a1,a3);\
a3 = _mm_xor_si128(a3,a2);\
a2 = _mm_and_si128(a2,a0);\
a0 = _mm_andnot_si128(a0,ALLONE);\
a0 = mm128_not( a0 );\
a2 = _mm_xor_si128(a2,a1);\
a1 = _mm_or_si128(a1,a3);\
t = _mm_xor_si128(t,a1);\
@@ -242,17 +255,18 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
__m128i CNS128[32];
__m128i ALLONE;
#if !defined(__SSE4_1__)
__m128i MASK;
#endif
HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
{
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
@@ -352,10 +366,10 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
// Optimized for integral multiples of 16 bytes, good for 64 and 80 byte len
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );

View File

@@ -230,25 +230,13 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces, add padding.
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
@@ -425,24 +413,12 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

View File

@@ -120,25 +120,13 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
@@ -240,24 +228,12 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

View File

@@ -711,8 +711,11 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
{
__m256i A, B, C, D, E, F, G, H;
X[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
// W[9:14] are zero, therefore X[9:13] are also zero and not needed.
// Except X[ 9], which is reused to pre-expand part of W[ 0] for the third group.
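// Standard expansion: W[t] = SSG2_1x( W[t-2] ) + W[t-7] + SSG2_0x( W[t-15] ) + W[t-16];
// terms involving W[9:14] == 0 simply drop out of the X[] precalculations below.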
X[ 0] = _mm256_add_epi32( SSG2_0x( W[ 1] ), W[ 0] );
X[ 1] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( W[15] ),
SSG2_0x( W[ 2] ) ), W[ 1] );
X[ 2] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 0] ), W[11] ),
W[ 2] );
X[ 3] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 1] ), W[12] ),
@@ -725,16 +728,12 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
W[ 6] );
X[ 7] = _mm256_add_epi32( _mm256_add_epi32( X[ 0], SSG2_0x( W[ 8] ) ),
W[ 7] );
X[ 8] = _mm256_add_epi32( _mm256_add_epi32( X[ 1], SSG2_0x( W[ 9] ) ),
W[ 8] );
X[ 9] = _mm256_add_epi32( SSG2_0x( W[10] ), W[ 9] );
X[10] = _mm256_add_epi32( SSG2_0x( W[11] ), W[10] );
X[11] = _mm256_add_epi32( SSG2_0x( W[12] ), W[11] );
X[12] = _mm256_add_epi32( SSG2_0x( W[13] ), W[12] );
X[13] = _mm256_add_epi32( SSG2_0x( W[14] ), W[13] );
X[14] = _mm256_add_epi32( SSG2_0x( W[15] ), W[14] );
X[ 8] = _mm256_add_epi32( X[ 1], W[ 8] );
X[14] = SSG2_0x( W[15] );
X[15] = _mm256_add_epi32( SSG2_0x( X[ 0] ), W[15] );
X[ 9] = _mm256_add_epi32( SSG2_0x( X[ 1] ), X[ 0] );
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in + 1 );
C = _mm256_load_si256( state_in + 2 );
@@ -779,10 +778,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H );
#endif
@@ -810,23 +805,36 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
W[ 6] = _mm256_add_epi32( X[ 6], SSG2_1x( W[ 4] ) );
W[ 7] = _mm256_add_epi32( X[ 7], SSG2_1x( W[ 5] ) );
W[ 8] = _mm256_add_epi32( X[ 8], SSG2_1x( W[ 6] ) );
W[ 9] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[ 7] ),
W[ 2] ) );
W[10] = _mm256_add_epi32( X[10], _mm256_add_epi32( SSG2_1x( W[ 8] ),
W[ 3] ) );
W[11] = _mm256_add_epi32( X[11], _mm256_add_epi32( SSG2_1x( W[ 9] ),
W[ 4] ) );
W[12] = _mm256_add_epi32( X[12], _mm256_add_epi32( SSG2_1x( W[10] ),
W[ 5] ) );
W[13] = _mm256_add_epi32( X[13], _mm256_add_epi32( SSG2_1x( W[11] ),
W[ 6] ) );
W[ 9] = _mm256_add_epi32( SSG2_1x( W[ 7] ), W[ 2] );
W[10] = _mm256_add_epi32( SSG2_1x( W[ 8] ), W[ 3] );
W[11] = _mm256_add_epi32( SSG2_1x( W[ 9] ), W[ 4] );
W[12] = _mm256_add_epi32( SSG2_1x( W[10] ), W[ 5] );
W[13] = _mm256_add_epi32( SSG2_1x( W[11] ), W[ 6] );
W[14] = _mm256_add_epi32( X[14], _mm256_add_epi32( SSG2_1x( W[12] ),
W[ 7] ) );
W[15] = _mm256_add_epi32( X[15], _mm256_add_epi32( SSG2_1x( W[13] ),
W[ 8] ) );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x8_MSG_EXPANSION( W );
W[ 0] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[14] ),
W[ 9] ) );
W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] );
W[13] = SHA2x_MEXP( W[11], W[ 6], W[14], W[13] );
W[14] = SHA2x_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA2x_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x8_MSG_EXPANSION( W );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -1201,9 +1209,13 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
{
__m512i A, B, C, D, E, F, G, H;
// precalculate constant part msg expansion for second iteration.
X[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
// X is pre-expanded constant part of msg for second group, rounds 16 to 31.
// W[9:14] are zero, therefore X[9:13] are also zero and not needed.
// Except X[ 9] which is used to pre-expand part of W[ 0] from the third
// group, rounds 32 to 47.
X[ 0] = _mm512_add_epi32( SSG2_0x16( W[ 1] ), W[ 0] );
X[ 1] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( W[15] ),
SSG2_0x16( W[ 2] ) ), W[ 1] );
X[ 2] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 0] ), W[11] ),
W[ 2] );
X[ 3] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 1] ), W[12] ),
@@ -1216,16 +1228,12 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
W[ 6] );
X[ 7] = _mm512_add_epi32( _mm512_add_epi32( X[ 0], SSG2_0x16( W[ 8] ) ),
W[ 7] );
X[ 8] = _mm512_add_epi32( _mm512_add_epi32( X[ 1], SSG2_0x16( W[ 9] ) ),
W[ 8] );
X[ 9] = _mm512_add_epi32( SSG2_0x16( W[10] ), W[ 9] );
X[10] = _mm512_add_epi32( SSG2_0x16( W[11] ), W[10] );
X[11] = _mm512_add_epi32( SSG2_0x16( W[12] ), W[11] );
X[12] = _mm512_add_epi32( SSG2_0x16( W[13] ), W[12] );
X[13] = _mm512_add_epi32( SSG2_0x16( W[14] ), W[13] );
X[14] = _mm512_add_epi32( SSG2_0x16( W[15] ), W[14] );
X[ 8] = _mm512_add_epi32( X[ 1], W[ 8] );
X[14] = SSG2_0x16( W[15] );
X[15] = _mm512_add_epi32( SSG2_0x16( X[ 0] ), W[15] );
X[ 9] = _mm512_add_epi32( SSG2_0x16( X[ 1] ), X[ 0] );
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in + 1 );
C = _mm512_load_si512( state_in + 2 );
@@ -1280,7 +1288,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
// update precalculated msg expansion with new nonce: W[3].
// inject nonce, W[3], to complete msg expansion.
W[ 0] = X[ 0];
W[ 1] = X[ 1];
W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
@@ -1290,23 +1298,36 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
W[ 6] = _mm512_add_epi32( X[ 6], SSG2_1x16( W[ 4] ) );
W[ 7] = _mm512_add_epi32( X[ 7], SSG2_1x16( W[ 5] ) );
W[ 8] = _mm512_add_epi32( X[ 8], SSG2_1x16( W[ 6] ) );
W[ 9] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[ 7] ),
W[ 2] ) );
W[10] = _mm512_add_epi32( X[10], _mm512_add_epi32( SSG2_1x16( W[ 8] ),
W[ 3] ) );
W[11] = _mm512_add_epi32( X[11], _mm512_add_epi32( SSG2_1x16( W[ 9] ),
W[ 4] ) );
W[12] = _mm512_add_epi32( X[12], _mm512_add_epi32( SSG2_1x16( W[10] ),
W[ 5] ) );
W[13] = _mm512_add_epi32( X[13], _mm512_add_epi32( SSG2_1x16( W[11] ),
W[ 6] ) );
W[ 9] = _mm512_add_epi32( SSG2_1x16( W[ 7] ), W[ 2] );
W[10] = _mm512_add_epi32( SSG2_1x16( W[ 8] ), W[ 3] );
W[11] = _mm512_add_epi32( SSG2_1x16( W[ 9] ), W[ 4] );
W[12] = _mm512_add_epi32( SSG2_1x16( W[10] ), W[ 5] );
W[13] = _mm512_add_epi32( SSG2_1x16( W[11] ), W[ 6] );
W[14] = _mm512_add_epi32( X[14], _mm512_add_epi32( SSG2_1x16( W[12] ),
W[ 7] ) );
W[15] = _mm512_add_epi32( X[15], _mm512_add_epi32( SSG2_1x16( W[13] ),
W[ 8] ) );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x16_MSG_EXPANSION( W );
W[ 0] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[14] ),
W[ 9] ) );
W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] );
W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] );
W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x16_MSG_EXPANSION( W );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -1336,8 +1357,8 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16]; memcpy_512( W, data, 16 );
// Value for H at round 60, before adding K, to produce valid final hash
//where H == 0.
// Value for H at round 60, before adding K, needed to produce valid final
// hash where H == 0.
// H_ = -( H256[7] + K256[60] );
const __m512i H_ = m512_const1_32( 0x136032ED );
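// Worked check of the constant above, assuming the standard SHA-256 tables:
//    H256[7] = 0x5be0cd19,  K256[60] = 0x90befffa
//    H256[7] + K256[60]     = 0xec9fcd13
//    -(0xec9fcd13) mod 2^32 = 0x136032ed
// which matches the value loaded into H_, consistent with the formula
// H_ = -( H256[7] + K256[60] ) in the comment.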

View File

@@ -383,11 +383,17 @@ static const m512_v16 FFT256_Twiddle4w[] =
#define shufxor4w(x,s) _mm512_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
_mm512_srai_epi16( x, 8 ) )
/*
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \
0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) )
*/
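// Scalar model (illustrative, not from this file) of what both REDUCE4w
// forms compute per signed 16-bit lane: (x & 0xff) - (x >> 8), which is
// congruent to x mod 257 because 2^8 == -1 (mod 257), the field assumed by
// the SIMD hash NTT. The maskz_mov_epi8 with mask 0x5555... keeps only the
// even (low) bytes, i.e. the same effect as the AND with 0x00ff above.
#include <stdint.h>
static inline int16_t reduce257_sketch( int16_t x )
{  // arithmetic shift assumed, matching _mm512_srai_epi16
   return (int16_t)( ( x & 0xff ) - ( x >> 8 ) );
}
// e.g. reduce257_sketch( 257 ) == 0 and reduce257_sketch( -1 ) == 256,
// both congruent to the input mod 257.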
#define EXTRA_REDUCE_S4w(x)\
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
m512_const1_64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
@@ -431,10 +437,6 @@ void fft64_4way( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
// targeted
#define BUTTERFLY_0( i,j ) \
do { \
@@ -443,25 +445,25 @@ do { \
X(i) = _mm512_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i v = X(j); \
X(j) = _mm512_add_epi16( X(i), X(j) ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w[n] ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
} while(0)
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 1 );
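// Scalar model (illustrative) of the butterflies above: a DIF radix-2
// butterfly maps ( x, y ) -> ( x + y, (x - y) * w ). The twiddle multiply
// is a left shift because, in the Z/257 field assumed by the SIMD hash,
// the twiddles here are powers of two (2 has order 16 mod 257); the third
// macro argument is now the shift amount itself (2, 4, 6) instead of an
// index into the w[] table, which is why the call sites changed.
static inline void dif_butterfly_sketch( int *x, int *y, int w_shift )
{
   int v = *y;
   *y = *x + v;                       // later reduced mod 257
   *x = ( *x - v ) << w_shift;        // (x - y) * 2^w, also reduced later
}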
@@ -501,12 +503,11 @@ do { \
// Transpose the FFT state with a revbin order permutation
// on the rows and the column.
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
#define INTERLEAVE( i, j ) \
do { \
__m512i t1= X(i); \
__m512i t2= X(j); \
X(i) = _mm512_unpacklo_epi16( t1, t2 ); \
X(j) = _mm512_unpackhi_epi16( t1, t2 ); \
__m512i u = X(j); \
X(j) = _mm512_unpackhi_epi16( X(i), X(j) ); \
X(i) = _mm512_unpacklo_epi16( X(i), u ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -534,10 +535,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i u = X(j); \
X(i) = _mm512_slli_epi16( X(i), w[n] ); \
X(i) = _mm512_slli_epi16( X(i), w ); \
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
X(i) = _mm512_add_epi16( u, X(i) ); \
} while(0)
@@ -558,15 +559,15 @@ do { \
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE_FULL_S4w( 0 );
DO_REDUCE_FULL_S4w( 1 );
@@ -599,7 +600,6 @@ void fft128_4way( void *a )
// Temp space to help for interleaving in the end
__m512i B[8];
__m512i *A = (__m512i*) a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
/* Size-2 butterflies */
for ( i = 0; i<8; i++ )
@@ -633,7 +633,6 @@ void fft128_4way_msg( uint16_t *a, const uint8_t *x, int final )
__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
#define UNPACK( i ) \
do { \
@@ -686,7 +685,6 @@ void fft256_4way_msg( uint16_t *a, const uint8_t *x, int final )
__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT256_Twiddle;
#define UNPACK( i ) \
do { \
@@ -776,109 +774,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
// We split the round function in two halves
// so as to insert some independent computations in between.
// generic
#if 0
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = shufxor( a##l, 1 ); \
d##h = shufxor( a##h, 1 ); \
} while(0)
#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = shufxor( a##h, 2 ); \
d##h = shufxor( a##l, 2 ); \
} while(0)
#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = shufxor( a##l, 2 ); \
d##h = shufxor( a##h, 2 ); \
} while(0)
#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = shufxor( a##l, 3 ); \
d##h = shufxor( a##h, 3 ); \
} while(0)
#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = shufxor( a##h, 1 ); \
d##h = shufxor( a##l, 1 ); \
} while(0)
#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = shufxor( a##h, 3 ); \
d##h = shufxor( a##l, 3 ); \
} while(0)
#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#endif
// targeted
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \

View File

@@ -18,6 +18,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/yespower/yespower.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -31,6 +32,9 @@
// Config
#define MINOTAUR_ALGO_COUNT 16
static const yespower_params_t minotaurx_yespower_params =
{ YESPOWER_1_0, 2048, 8, "et in arcadia ego", 17 };
typedef struct TortureNode TortureNode;
typedef struct TortureGarden TortureGarden;
@@ -59,20 +63,22 @@ struct TortureGarden
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;
struct TortureNode {
struct TortureNode
{
unsigned int algo;
TortureNode *child[2];
} nodes[22];
} __attribute__ ((aligned (64)));
// Get a 64-byte hash for the given 64-byte input, using the given TortureGarden context and algo index.
static void get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo )
static int get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;
switch (algo) {
switch ( algo )
{
case 0:
sph_blake512_init(&garden->blake);
sph_blake512(&garden->blake, input, 64);
@@ -164,9 +170,13 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
sph_whirlpool(&garden->whirlpool, input, 64);
sph_whirlpool_close(&garden->whirlpool, hash);
break;
case 16: // minotaurx only: yespower is hardcoded for the last node
rc = yespower_tls( input, 64, &minotaurx_yespower_params,
(yespower_binary_t*)hash, thr_id );
}
memcpy(output, hash, 64);
return rc;
}
static __thread TortureGarden garden;
@@ -219,7 +229,6 @@ bool initialize_torture_garden()
garden.nodes[20].child[1] = &garden.nodes[21];
garden.nodes[21].child[0] = NULL;
garden.nodes[21].child[1] = NULL;
return true;
}
@@ -227,34 +236,41 @@ bool initialize_torture_garden()
int minotaur_hash( void *output, const void *input, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;
// Find initial sha512 hash
sph_sha512_init( &garden.sha512 );
sph_sha512( &garden.sha512, input, 80 );
sph_sha512_close( &garden.sha512, hash );
if ( opt_algo != ALGO_MINOTAURX )
{
// Algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce
// if Hamsi is needed, but only the first and last functions are known
// at this point. Abort if either of them is Hamsi.
if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 )
|| ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) )
return 0;
}
// Assign algos to torture garden nodes based on initial hash
for ( int i = 0; i < 22; i++ )
garden.nodes[i].algo = hash[i] % MINOTAUR_ALGO_COUNT;
// MinotaurX: override the algo for the last node with yespower
if ( opt_algo == ALGO_MINOTAURX )
garden.nodes[21].algo = MINOTAUR_ALGO_COUNT;
// Send the initial hash through the torture garden
TortureNode *node = &garden.nodes[0];
while ( node )
while ( rc && node )
{
get_hash( hash, hash, &garden, node->algo );
rc = get_hash( hash, hash, &garden, node->algo, thr_id );
node = node->child[ hash[63] & 1 ];
}
memcpy( output, hash, 32 );
return 1;
return rc;
}
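// Outline (in comment form) of the per-nonce flow implemented above:
//   1. hash = SHA-512 of the 80-byte header (64 bytes of output).
//   2. nodes[i].algo = hash[i] % MINOTAUR_ALGO_COUNT for i = 0..21;
//      for minotaurx the last node is forced to the extra yespower entry.
//   3. Walk the garden from nodes[0]; after each hash the next node is
//      child[ hash[63] & 1 ], so the path is data dependent.
//   4. The first 32 bytes of the final 64-byte hash are the result.
// Only the first and last algos are known before the walk (as noted above),
// which is why the Hamsi early-abort can test only hash[0] and hash[21].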
int scanhash_minotaur( struct work *work, uint32_t max_nonce,
@@ -291,12 +307,14 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce,
return 0;
}
// The hash function has hooks for minotaurx.
bool register_minotaur_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_minotaur;
gate->hash = (void*)&minotaur_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&initialize_torture_garden;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT;
return true;
};

View File

@@ -1136,10 +1136,14 @@ int yespower(yespower_local_t *local,
ctx.S0 = S;
ctx.S1 = S + Swidth_to_Sbytes1( Swidth );
// Copy the prehash context, then hash the tail.
if ( srclen == 80 ) // assume 64 byte prehash was done
{
memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx );
sha256_update( &sha256_ctx, src+64, srclen-64 );
sha256_final( &sha256_ctx, sha256 );
}
else
sha256_full( sha256, src, srclen );
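// For the 80-byte block-header case the first 64 bytes are constant per
// work item, so a SHA-256 context assumed to have already absorbed that
// first full block (sha256_prehash_ctx) is copied and only the remaining
// 16 bytes, which include the nonce, are hashed here. Other input lengths
// fall back to hashing the whole buffer.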
if ( version == YESPOWER_0_5 )
{

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.3.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.21.0.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.20.3'
PACKAGE_STRING='cpuminer-opt 3.20.3'
PACKAGE_VERSION='3.21.0'
PACKAGE_STRING='cpuminer-opt 3.21.0'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.20.3 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.21.0 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.20.3:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.21.0:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.20.3
cpuminer-opt configure 3.21.0
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.20.3, which was
It was created by cpuminer-opt $as_me 3.21.0, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.20.3'
VERSION='3.21.0'
cat >>confdefs.h <<_ACEOF
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.20.3, which was
This file was extended by cpuminer-opt $as_me 3.21.0, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6784,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.20.3
cpuminer-opt config.status 3.21.0
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.20.3])
AC_INIT([cpuminer-opt], [3.21.0])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -131,10 +131,9 @@ bool opt_verify = false;
static bool opt_stratum_keepalive = false;
static struct timeval stratum_keepalive_timer;
// Stratum servers typically time out after 5 minutes (300 seconds).
#define stratum_keepalive_timeout 180 // 3 minutes
#define stratum_keepalive_timeout 150 // 2.5 minutes
static struct timeval stratum_reset_time;
// pk_buffer_size is used as a version selector by b58 code, therefore
// it must be set correctly to work.
const int pk_buffer_size_max = 26;
@@ -2192,6 +2191,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
} // !quiet
} // new diff/block
/*
if ( new_job && !( opt_quiet || stratum_errors ) )
{
int mismatch = submitted_share_count - ( accepted_share_count
@@ -2202,6 +2202,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
CL_LBL "%d Submitted share pending, maybe stale" CL_N,
submitted_share_count );
}
*/
}
static void *miner_thread( void *userdata )
@@ -2446,8 +2447,8 @@ static void *miner_thread( void *userdata )
{
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
applog( LOG_INFO, "CPU #%d: %s %sh/s",
thr_id, hr, hr_units );
applog( LOG_INFO, "Thread %d, CPU %d: %s %sh/s",
thr_id, thread_affinity_map[ thr_id ], hr, hr_units );
}
}
@@ -2887,7 +2888,7 @@ static void *stratum_thread(void *userdata )
else
timeval_subtract( &et, &now, &stratum_reset_time );
if ( et.tv_sec > stratum_keepalive_timeout + 60 )
if ( et.tv_sec > stratum_keepalive_timeout + 90 )
{
applog( LOG_NOTICE, "No shares submitted, resetting stratum connection" );
stratum_need_reset = true;

View File

@@ -559,6 +559,7 @@ enum algos {
ALGO_LYRA2Z330,
ALGO_M7M,
ALGO_MINOTAUR,
ALGO_MINOTAURX,
ALGO_MYR_GR,
ALGO_NEOSCRYPT,
ALGO_NIST5,
@@ -652,6 +653,7 @@ static const char* const algo_names[] = {
"lyra2z330",
"m7m",
"minotaur",
"minotaurx",
"myr-gr",
"neoscrypt",
"nist5",
@@ -813,6 +815,7 @@ Options:\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
minotaur\n\
minotaurx\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\

View File

@@ -193,8 +193,17 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Basic operations without equivalent SIMD intrinsic
// Bitwise not (~v)
#if defined(__AVX512VL__)
static inline __m128i mm128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
#else
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
#endif
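// vpternlog evaluates, for each bit position, imm8 bit (a<<2 | b<<1 | c).
// With all three operands equal to v the selector is either 0b000 or 0b111,
// so only imm8 bits 0 and 7 matter; imm8 = 0x01 maps 0 -> 1 and 1 -> 0,
// i.e. bitwise NOT, with no all-ones constant needed. Scalar model
// (illustrative only; selector bit order does not matter when the three
// operands are identical):
#include <stdint.h>
static inline uint64_t ternlog64_model( uint64_t a, uint64_t b, uint64_t c,
                                        uint8_t imm )
{
   uint64_t r = 0;
   for ( int i = 0; i < 64; i++ )
   {
      unsigned idx = (unsigned)( ( ( a >> i ) & 1 ) << 2
                               | ( ( b >> i ) & 1 ) << 1
                               |   ( ( c >> i ) & 1 ) );
      r |= (uint64_t)( ( imm >> idx ) & 1 ) << i;
   }
   return r;   // ternlog64_model( v, v, v, 0x01 ) == ~v
}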
// Unary negation of elements (-v)
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
@@ -439,7 +448,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
// half is always taken from v1, and the high half from v2.
#define mm128_shuffle2_64( v1, v2, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
_mm_castsi128_pd( v2 ), c ) );
@@ -600,9 +609,6 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#endif // SSSE3 else SSE2
//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.
// Swap 128 bit vectors.
// This should be avoided; it's more efficient to switch references.
#define mm128_swap256_128( v1, v2 ) \
@@ -611,61 +617,23 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v1 = _mm_xor_si128( v1, v2 );
// Two input shuffle-rotate.
// Concatenate v1 & v2 and byte rotate as a 256 bit vector.
// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, i.e. the updated v1.
// alignr for 32 & 64 bit elements is only available with AVX512 but
// emulated here. Shift argument is not needed, it's always 1.
// Behaviour is otherwise consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
/*
#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 )
#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 )
#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 )
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 1 )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 1 )
*/
#define mm128_alignr_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
#define mm128_alignr_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#else
#define mm128_shufl2r_64( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) )
#define mm128_shufl2l_64( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 8 ), \
#define mm128_alignr_64( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) )
/*
#define mm128_shufl2r_32( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 4 ), \
_mm_slli_si128( v2, 12 ) )
#define mm128_shufl2l_32( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 12 ) )
#define mm128_alignr_32( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 4 ) )
#define mm128_shufl2r_16( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 2 ), \
_mm_slli_si128( v2, 14 ) )
#define mm128_shufl2l_16( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) )
#define mm128_shufl2r_8( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 1 ), \
_mm_slli_si128( v2, 15 ) )
#define mm128_shufl2l_8( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 1 ), \
_mm_srli_si128( v2, 15 ) )
*/
#endif
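// Sanity sketch (illustrative) for the 64-bit alignr above: with
// v1 = {a1,a0} and v2 = {b1,b0} (high,low 64-bit lanes), both the SSSE3
// _mm_alignr_epi8( v1, v2, 8 ) form and the SSE2 or/slli/srli emulation
// return {a0,b1}, the middle 128 bits of the concatenation v1:v2.
#include <emmintrin.h>
static inline __m128i alignr64_sse2_sketch( __m128i v1, __m128i v2 )
{
   return _mm_or_si128( _mm_slli_si128( v1, 8 ),    // moves a0 to the high lane
                        _mm_srli_si128( v2, 8 ) );  // moves b1 to the low lane
}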
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
@@ -689,50 +657,6 @@ do { \
v1 = t; \
} while(0)
/*
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
v1 = t; \
} while(0)
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
v1 = t; \
} while(0)
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
v1 = t; \
} while(0)
*/
#else // SSE2
#define mm128_vror256_64( v1, v2 ) \
@@ -752,61 +676,7 @@ do { \
_mm_srli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
/*
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
_mm_slli_si128( v2, 12 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 4 ), \
_mm_slli_si128( v1, 12 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 12 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 4 ), \
_mm_srli_si128( v1, 12 ) ); \
v1 = t; \
} while(0)
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
_mm_slli_si128( v2, 14 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 2 ), \
_mm_slli_si128( v1, 14 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 2 ), \
_mm_srli_si128( v1, 14 ) ); \
v1 = t; \
} while(0)
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
_mm_slli_si128( v2, 15 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 1 ), \
_mm_slli_si128( v1, 15 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
_mm_srli_si128( v2, 15 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 1 ), \
_mm_srli_si128( v1, 15 ) ); \
v1 = t; \
} while(0)
*/
#endif // SSE4.1 else SSE2
#endif // __SSE2__

View File

@@ -15,14 +15,13 @@
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary.
// Some usage may have the index vector encoded as if full vector
// shuffles are supported. This has no side effects and would have the same
// results using either version.
// If the need arises and AVX512VL is available, 256 bit full vector shuffles
// can be implemented using the AVX512 zero-mask feature with a NULL mask.
// Using intrinsics it's simple: _mm256_maskz_shuffle_epi8( 0, v, c )
// With asm it's a bit more complicated with the addition of the mask register
// and zero tag: vpshufb ymm0{k0}{z}, ymm1, ymm2
// Instructions that can move data across the 128 bit lane boundary incur a
// performance penalty compared to those that can't.
// Some usage of index vectors may be encoded as if full vector shuffles are
// supported. This has no side effects and would have the same results using
// either version.
// If the need arises and AVX512VL is available, 256 bit full vector byte
// shuffles can be implemented using the AVX512 mask feature with a NULL mask.
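// Example of the lane restriction described above (illustrative helper,
// not a definition from this header): the index pattern 0..15 is repeated
// for both 128-bit lanes because _mm256_shuffle_epi8 cannot move a byte
// across the lane boundary.
#include <immintrin.h>
static inline __m256i bswap64_in_lanes_sketch( const __m256i v )
{
   const __m256i idx = _mm256_set_epi8(
         8, 9,10,11,12,13,14,15,  0, 1, 2, 3, 4, 5, 6, 7,
         8, 9,10,11,12,13,14,15,  0, 1, 2, 3, 4, 5, 6, 7 );
   return _mm256_shuffle_epi8( v, idx );   // byte-swaps each 64-bit element
}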
#if defined(__AVX__)
@@ -141,7 +140,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Basic operations without SIMD equivalent
// Bitwise not ( ~v )
#if defined(__AVX512VL__)
static inline __m256i mm256_not( const __m256i v )

View File

@@ -37,13 +37,21 @@
// version of this specific instruction does not.
//
// New alignr instructions for epi64 and epi32 operate across the entire
// vector. "_mm512_alignr_epi8" continues to be restricted to 128 bit lanes.
// vector, but are slower than epi8, which continues to be restricted to
// 128 bit lanes.
//
// "_mm512_permutexvar_epi8" and "_mm512_permutex2var_epi8" require
// AVX512-VBMI. The same instructions with larger elements don't have this
// requirement. "_mm512_permutexvar_epi8" also performs the same operation
// as "_mm512_shuffle_epi8" which only requires AVX512-BW.
//
// Two coding conventions are used to prevent macro argument side effects:
// - if a macro arg is used in an expression it must be protected by
// parentheses to ensure an expression argument is evaluated first.
// - if an argument is to be referenced multiple times a C inline function
// should be used instead of a macro to prevent an expression argument
// from being evaluated multiple times (see the sketch after this comment
// block).
//
// There are 2 areas where overhead is a major concern: constants and
// permutations.
//
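// Illustration of the second convention (hypothetical names, not from this
// header): a macro that names its argument twice evaluates an expression
// argument twice, while a static inline function evaluates it once.
#include <stdint.h>
#define BAD_ROR32( x, c )  ( ( (x) >> (c) ) | ( (x) << ( 32 - (c) ) ) )
static inline uint32_t good_ror32( uint32_t x, int c )
{  return ( x >> c ) | ( x << ( 32 - c ) );  }
// BAD_ROR32( *p++, 7 ) advances p twice; good_ror32( *p++, 7 ) advances it once.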
@@ -184,7 +192,6 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
// Basic operations without SIMD equivalent
// Bitwise NOT: ~x
// #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
static inline __m512i mm512_not( const __m512i x )
{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
@@ -295,7 +302,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_nand( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0xef )
/*
// Diagonal blending
// Blend 8 64 bit elements from 8 vectors
#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
@@ -313,6 +320,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
_mm512_mask_blend_epi32( 0x3333, \
_mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
_mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
*/
/*
//
@@ -374,6 +382,19 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
/*
#if defined(__AVX512VBMI2__)
// Use C inline function in case arg is coded as an expression.
static inline __m512i mm512_ror_16( __m512i v, int c )
{ return _mm512_shrdi_epi16( v, v, c ); }
static inline __m512i mm512_rol_16( __m512i v, int c )
{ return _mm512_shldi_epi16( v, v, c ); }
#endif
*/
//
// Reverse byte order of packed elements, vectorized endian conversion.
@@ -518,7 +539,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Rotate elements within 256 bit lanes of 512 bit vector.
// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
@@ -623,22 +643,5 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
/*
// 2 input, 1 output
// Concatenate { v1, v2 } then rotate right or left and return the high
// 512 bits, ie rotated v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 )
#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 )
#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 )
#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 )
#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 )
#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 )
*/
#endif // AVX512
#endif // SIMD_512_H__