Compare commits


12 Commits

Author      SHA1        Message             Date
Jay D Dee   de564ccbde  v3.22.2             2023-04-06 13:38:37 -04:00
Jay D Dee   fcd7727b0d  v3.22.1             2023-03-24 18:29:42 -04:00
Jay D Dee   3dd6787531  v3.22.0             2023-03-21 17:12:51 -04:00
Jay D Dee   cae1ce2ab7  v3.21.5             2023-03-15 12:27:04 -04:00
Jay D Dee   7a91c41d74  v3.21.4             2023-03-13 14:54:38 -04:00
Jay D Dee   c6bc9d67fb  v3.21.3 Unreleased  2023-03-13 03:20:13 -04:00
Jay D Dee   b339450898  v3.21.3             2023-03-11 14:54:49 -05:00
Jay D Dee   fb93160641  v3.21.2             2023-03-03 12:38:31 -05:00
Jay D Dee   520d4d5384  v3.21.1             2023-02-08 22:11:05 -05:00
Jay D Dee   da7030faa8  v3.21.0             2022-12-21 13:09:14 -05:00
Jay D Dee   bd84f199fe  v3.20.3             2022-10-21 23:12:18 -04:00
Jay D Dee   58030e2788  v3.20.2             2022-08-01 20:21:05 -04:00
100 changed files with 4809 additions and 14411 deletions

View File

@@ -1,4 +1,6 @@
These instructions may be out of date; see the Wiki for the latest...
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
1. Requirements:
---------------
@@ -35,7 +37,7 @@ SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
openssl 1.1.0e or higher.
znver1 and znver2 should be recognized on most recent version of GCC and
znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
znver3 is available with GCC 11. GCC 11 also includes rocketlake support.
In the meantime here are some suggestions to compile with new CPUs:
"-march=native" is usually the best choice, used by build.sh.

View File

@@ -1,6 +1,6 @@
Instructions for compiling cpuminer-opt for Windows.
Thwaw intructions nay be out of date. Please consult the wiki for
These instructions are out of date. Please consult the wiki for
the latest:
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source

View File

@@ -55,9 +55,6 @@ cpuminer_SOURCES = \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \
algo/blake/blakecoin-4way.c \
algo/blake/decred-gate.c \
algo/blake/decred.c \
algo/blake/decred-4way.c \
algo/blake/pentablake-gate.c \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
@@ -178,6 +175,8 @@ cpuminer_SOURCES = \
algo/sha/sha256t.c \
algo/sha/sha256q-4way.c \
algo/sha/sha256q.c \
algo/sha/sha512256d-4way.c \
algo/sha/sha256dt.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \
@@ -205,7 +204,6 @@ cpuminer_SOURCES = \
algo/verthash/tiny_sha3/sha3.c \
algo/verthash/tiny_sha3/sha3-4way.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
algo/whirlpool/whirlpool.c \
algo/whirlpool/whirlpoolx.c \
@@ -285,8 +283,6 @@ cpuminer_SOURCES = \
algo/x22/x22i-gate.c \
algo/x22/x25x.c \
algo/x22/x25x-4way.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/hmac-blake2b.c \

View File

@@ -40,17 +40,25 @@ Requirements
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algorithms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
are not supported.
ARM and Aarch64 CPUs are not supported.
Mobile CPUs, such as those in laptop computers, are not recommended because
they aren't designed for the extreme heat of operating at full load for
extended periods of time.
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and Centos, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as Centos 6 don't work due to missing features.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. WindowsXP 64 bit is YMMV.
FreeBSD is not actively tested but should work, YMMV.
MacOS, OSx and Android are not supported.
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
@@ -66,53 +74,50 @@ Supported Algorithms
argon2d250 argon2d-crds, Credits (CRDS)
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
axiom Shabal-256 MemoHash
blake Blake-256 (SFR)
blake2b Blake2b 256
blake2s Blake-2 S
blake Blake-256
blake2b Blake2-512
blake2s Blake2-256
blakecoin blake256r8
bmw BMW 256
bmw512 BMW 512
c11 Chaincoin
c11
decred
deep Deepcoin (DCN)
dmd-gr Diamond-Groestl
groestl Groestl coin
hex x16r-hex
hmq1725 Espers
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
lbry LBC, LBRY Credits
luffa Luffa
lyra2h Hppcoin
lyra2h
lyra2re lyra2
lyra2rev2 lyra2v2
lyra2rev3 lyrav2v3
lyra2z
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
m7m Magi (XMG)
minotaur Ringcoin (RNG)
lyra2z330
m7m
minotaur
minotaurx
myr-gr Myriad-Groestl
neoscrypt NeoScrypt(128, 2, 1)
nist5 Nist5
pentablake Pentablake
phi1612 phi
phi2 Luxcoin (LUX)
phi2-lux identical to phi2
pluck Pluck:128 (Supcoin)
phi2
polytimos Ninja
power2b MicroBitcoin (MBC)
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256q Quad SHA-256, Pyrite (PYE)
sha256t Triple SHA-256, Onecoin (OC)
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -128,17 +133,17 @@ Supported Algorithms
x11 Dash
x11evo Revolvercoin
x11gost sib (SibCoin)
x12 Galaxie Cash (GCH)
x13 X13
x12
x13
x13bcd bcd
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x14
x15
x16r
x16rv2
x16rt Gincoin (GIN)
x16rt-veil Veil (VEIL)
x16s Pigeoncoin (PGN)
x16rt
x16rt-veil veil
x16s
x17
x21s
x22i

View File

@@ -1,12 +1,22 @@
This file is included in the Windows binary package. Compile instructions
for Linux and Windows can be found in RELEASE_NOTES.
This package is officially avalable only from:
cpuminer-opt is open source and free of any fees. Many forks exist that are
closed source and charge usage fees. Support open source, free software.
This package is officially available only from:
https://github.com/JayDDee/cpuminer-opt
No other sources should be trusted.
cpuminer is a console program that is executed from a DOS or Powershell
prompt. There is no GUI and no mouse support.
command prompt. There is no GUI and no mouse support.
New users are encouraged to consult the cpuminer-opt Wiki for detailed
information on usage:
https://github.com/JayDDee/cpuminer-opt/wiki
Miner programs are often flagged as malware by antivirus programs. This is
a false positive, they are flagged simply because they are cryptocurrency
@@ -43,12 +53,11 @@ cpuminer-avx2.exe Haswell, Skylake, Kabylake, Coffeelake, Cometlake
cpuminer-avx2-sha.exe AMD Zen1, Zen2
cpuminer-avx2-sha-vaes.exe Intel Alderlake*, AMD Zen3
cpuminer-avx512.exe Intel HEDT Skylake-X, Cascadelake
cpuminer-avx512-sha-vaes.exe Icelake, Tigerlake, Rocketlake
cpuminer-avx512-sha-vaes.exe AMD Zen4, Intel Rocketlake, Icelake
* Alderlake is a hybrid architecture. With the E-cores disabled it may be
possible to enable AVX512 on the the P-cores and use the avx512-sha-vaes
build. This is not officially supported by Intel at time of writing.
Check for current information.
* Alderlake is a hybrid architecture with a mix of E-cores & P-cores. Although
the P-cores can support AVX512, the E-cores can't, so Intel decided to disable
AVX512 on the P-cores.
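
As a rough, hedged illustration of how to check which of the above builds a
given CPU can run (not part of the package; it uses GCC's
__builtin_cpu_supports, and SHA/VAES detection is omitted because those
feature names vary by compiler version):

    /* check_build.c -- hypothetical helper, compile with: gcc check_build.c */
    #include <stdio.h>
    int main( void )
    {
       __builtin_cpu_init();
       if      ( __builtin_cpu_supports( "avx512f" ) ) puts( "an avx512 build should run" );
       else if ( __builtin_cpu_supports( "avx2" )    ) puts( "an avx2 build should run" );
       else if ( __builtin_cpu_supports( "sse4.2" )  ) puts( "fall back to an sse4.2-level build" );
       else                                            puts( "fall back to the baseline sse2 build" );
       return 0;
    }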
Notes about included DLL files:
@@ -59,9 +68,10 @@ source code obtained from the author's official repository. The exact
procedure is documented in the build instructions for Windows:
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
Some DLL filess may already be installed on the system by Windows or third
party packages. They often will work and may be used instead of the included
file.
Some included DLL files may already be installed on the system by Windows or
third party packages. They often will work and may be used instead of the
included version of the files.
If you like this software feel free to donate:

View File

@@ -65,12 +65,102 @@ If not what makes it happen or not happen?
Change Log
----------
v3.22.2
Added sha512256d & sha256dt algos.
Fixed intermittent invalid shares with lyra2v2 AVX512.
Removed application limits on the number of CPUs and threads; HW and OS limits still apply.
Added a log warning if more threads are defined than active CPUs in affinity mask.
Improved merkle tree memory management for stratum.
Added transaction count to New Work log.
Other small improvements.
v3.22.1
#393 fixed segfault in GBT, regression from v3.22.0.
More efficient 32 bit data interleaving.
v3.22.0
Stratum: faster netdiff calculation.
Merged a few updates from Pooler/cpuminer:
Use CURLOPT_POSTFIELDS in json_rpc_call,
Use CURLINFO_ACTIVESOCKET when supported,
JSONRPC speedup,
Speed up hex2bin function.
Small log improvements, notably more frequent hash rate reports.
Removed decred algo.
v3.21.5
All issues with v3.21.3 & v3.21.4 should be resolved.
Changes since v3.21.2:
#392 #379 #389 Fixed misaligned address segfault solo mining.
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
#392 Fixed conditional mining.
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
Windows binaries no longer support CPU groups,
Windows binaries support CPUs with up to 64 threads.
Small optimizations to serialized vectoring.
v3.21.4 CANCELLED
Reapply selected changes from v3.21.3.
#392 #379 #389 Fixed misaligned address segfault solo mining.
#392 Fixed conditional mining.
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
Windows binaries no longer support CPU groups,
Windows binaries support CPUs with up to 64 threads.
v3.21.3.1 UNRELEASED
Revert to 3.21.2
v3.21.3 CANCELLED
#392 #379 #389 Fixed misaligned address segfault solo mining.
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
#392 Fixed conditional mining.
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
Windows binaries no longer support CPU groups,
Windows binaries support CPUs with up to 64 threads.
Midstate prehash is now centralized, done only once instead of by every thread
for selected algos.
Small optimizations to serialized vectoring.
v3.21.2
Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
Fixed a couple of compiler warnings with gcc-12.
v3.21.1
Fixed a segfault in some obsolete algos.
Small optimizations to Hamsi & Shabal AVX2 & AVX512.
v3.21.0
Added minotaurx algo for stratum only.
Blake256 & sha256 prehash optimized to ignore zero-padded data for AVX2 & AVX512.
Other small improvements.
v3.20.3
Faster c11 algo: AVX512 6%, AVX2 4%, AVX2+VAES 15%.
Faster AVX2+VAES for anime 14%, hmq1725 6%.
Small optimizations to Luffa AVX2 & AVX512.
v3.20.2
Bit rotation optimizations to Blake256, Blake512, Blake2b, Blake2s & Lyra2-blake2b for SSE2 & AVX2.
Removed old unused yescrypt library and other unused code.
v3.20.1
sph_blake2b optimized 1-way SSSE3 & AVX2.
Removed duplicate Blake2b used by Power2b algo, will now use optimized sph_blake2b.
Removed imprecise hash & target display from rejected share log.
Share and target difficulty is now displayed only for low diificulty shares.
Share and target difficulty is now displayed only for low difficulty shares.
Updated configure.ac to check for AVX512 asm support.
Small optimization to Lyra2 SSE2.
@@ -87,12 +177,9 @@ v3.19.8
#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
url protocol specifier for requesting a secure stratum connection.
The full url, including the protocol, is now displayed in the stratum connect
log and the periodic summary log.
Small optimizations to Cubehash, AVX2 & AVX512.
Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
v3.19.7

View File

@@ -67,7 +67,6 @@ void do_nothing () {}
bool return_true () { return true; }
bool return_false () { return false; }
void *return_null () { return NULL; }
void call_error () { printf("ERR: Uninitialized function pointer\n"); }
void algo_not_tested()
{
@@ -95,7 +94,8 @@ int null_scanhash()
return 0;
}
// Default generic scanhash can be used in many cases.
// Default generic scanhash can be used in many cases. Not to be used when
// prehashing can be done or when byte swapping the data can be avoided.
int scanhash_generic( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -152,6 +152,9 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
// Overwrite the byte-swapped nonce with the original byte order for proper
// incrementing. The nonce only needs to be byte swapped if it is to be
// submitted.
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
@@ -260,8 +263,6 @@ void init_algo_gate( algo_gate_t* gate )
gate->build_block_header = (void*)&std_build_block_header;
gate->build_extraheader = (void*)&std_build_extraheader;
gate->set_work_data_endian = (void*)&do_nothing;
gate->calc_network_diff = (void*)&std_calc_network_diff;
gate->ready_to_mine = (void*)&std_ready_to_mine;
gate->resync_threads = (void*)&do_nothing;
gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
@@ -305,7 +306,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_BLAKECOIN: rc = register_blakecoin_algo ( gate ); break;
case ALGO_BMW512: rc = register_bmw512_algo ( gate ); break;
case ALGO_C11: rc = register_c11_algo ( gate ); break;
case ALGO_DECRED: rc = register_decred_algo ( gate ); break;
case ALGO_DEEP: rc = register_deep_algo ( gate ); break;
case ALGO_DMD_GR: rc = register_dmd_gr_algo ( gate ); break;
case ALGO_GROESTL: rc = register_groestl_algo ( gate ); break;
@@ -324,6 +324,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_LYRA2Z330: rc = register_lyra2z330_algo ( gate ); break;
case ALGO_M7M: rc = register_m7m_algo ( gate ); break;
case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break;
case ALGO_MINOTAURX: rc = register_minotaur_algo ( gate ); break;
case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break;
case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break;
case ALGO_NIST5: rc = register_nist5_algo ( gate ); break;
@@ -336,9 +337,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_QUBIT: rc = register_qubit_algo ( gate ); break;
case ALGO_SCRYPT: rc = register_scrypt_algo ( gate ); break;
case ALGO_SHA256D: rc = register_sha256d_algo ( gate ); break;
case ALGO_SHA256DT: rc = register_sha256dt_algo ( gate ); break;
case ALGO_SHA256Q: rc = register_sha256q_algo ( gate ); break;
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
@@ -371,15 +374,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;
case ALGO_XEVAN: rc = register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: rc = register_yescrypt_05_algo ( gate ); break;
// case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: rc = register_yescryptr8_05_algo ( gate ); break;
// case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPT: rc = register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: rc = register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR8G: rc = register_yescryptr8g_algo ( gate ); break;
case ALGO_YESCRYPTR16: rc = register_yescryptr16_05_algo( gate ); break;
// case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: rc = register_yescryptr32_05_algo( gate ); break;
// case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_YESCRYPTR16: rc = register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: rc = register_yescryptr32_algo ( gate ); break;
case ALGO_YESPOWER: rc = register_yespower_algo ( gate ); break;
case ALGO_YESPOWERR16: rc = register_yespowerr16_algo ( gate ); break;
case ALGO_YESPOWER_B2B: rc = register_yespower_b2b_algo ( gate ); break;
@@ -427,7 +426,6 @@ const char* const algo_alias_map[][2] =
{ "blake256r8", "blakecoin" },
{ "blake256r8vnl", "vanilla" },
{ "blake256r14", "blake" },
{ "blake256r14dcr", "decred" },
{ "diamond", "dmd-gr" },
{ "espers", "hmq1725" },
{ "flax", "c11" },

View File

@@ -144,7 +144,7 @@ void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t,
uint32_t*, uint32_t, uint32_t,
unsigned char* );
// Build mining.submit message
@@ -155,19 +155,13 @@ char* ( *malloc_txs_request ) ( struct work* );
// Big endian or little endian
void ( *set_work_data_endian ) ( struct work* );
double ( *calc_network_diff ) ( struct work* );
// Wait for first work
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( int, struct work* );
// No longer needed
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
set_t optimizations;
int ( *get_work_data_size ) ();
@@ -286,8 +280,6 @@ char* std_malloc_txs_request( struct work *work );
// Default is do_nothing, little endian is assumed
void set_work_data_big_endian( struct work *work );
double std_calc_network_diff( struct work *work );
void std_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits,
@@ -297,9 +289,6 @@ void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );
bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id );
int std_get_work_data_size();
// Gate admin functions

View File

@@ -115,7 +115,7 @@ void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
@@ -178,7 +178,7 @@ void blake256_16way_close(void *cc, void *dst);
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );

File diff suppressed because it is too large.

View File

@@ -388,11 +388,11 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
#define B2B_G(a, b, c, d, x, y) \
{ \
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
v[d] = mm256_swap64_32( _mm256_xor_si256( v[d], v[a] ) ); \
v[c] = _mm256_add_epi64( v[c], v[d] ); \
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
v[b] = mm256_shuflr64_24( _mm256_xor_si256( v[b], v[c] ) ); \
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
v[d] = mm256_shuflr64_16( _mm256_xor_si256( v[d], v[a] ) ); \
v[c] = _mm256_add_epi64( v[c], v[d] ); \
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
}
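
These are the bit-rotation optimizations noted in the change log: fixed
rotates of 64-bit lanes by 32, 24 and 16 bits are replaced with shuffle-based
equivalents. A hedged sketch of what such helpers could look like on AVX2
(the names mirror the diff, but these definitions are assumptions, not the
project's actual code):

    #include <immintrin.h>
    // Rotate each 64-bit lane right by 32 == swap its two 32-bit halves.
    static inline __m256i swap64_32_sketch( __m256i v )
    {
       return _mm256_shuffle_epi32( v, 0xb1 );   // dword order 1,0,3,2 per 128-bit lane
    }
    // Rotate each 64-bit lane right by 16 with a single byte shuffle.
    static inline __m256i shuflr64_16_sketch( __m256i v )
    {
       const __m256i ctl = _mm256_set_epi8( 9, 8,15,14,13,12,11,10, 1, 0, 7, 6, 5, 4, 3, 2,
                                            9, 8,15,14,13,12,11,10, 1, 0, 7, 6, 5, 4, 3, 2 );
       return _mm256_shuffle_epi8( v, ctl );
    }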

View File

@@ -108,11 +108,11 @@ do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
} while(0)
@@ -320,11 +320,11 @@ do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while(0)

View File

@@ -314,10 +314,11 @@ static const sph_u64 CB[16] = {
// Blake-512 8 way AVX512
#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \
#define GB_8WAY( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
_mm512_set1_epi64( c1 ), m0 ), b ), a ); \
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
d = mm512_swap64_32( _mm512_xor_si512( d, a ) ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
@@ -325,9 +326,10 @@ static const sph_u64 CB[16] = {
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
} while (0)
}
#define ROUND_B_8WAY(r) do { \
#define ROUND_B_8WAY( r ) \
{ \
GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
@@ -336,13 +338,13 @@ static const sph_u64 CB[16] = {
GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
}
#define DECL_STATE64_8WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
uint64_t T0, T1;
#define COMPRESS64_8WAY( buf ) do \
#define COMPRESS64_8WAY( buf ) \
{ \
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -409,7 +411,7 @@ static const sph_u64 CB[16] = {
H5 = mm512_xor3( VD, V5, H5 ); \
H6 = mm512_xor3( VE, V6, H6 ); \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
}
void blake512_8way_compress( blake_8way_big_context *sc )
{
@@ -610,7 +612,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 );
VF = mm512_swap64_32( _mm512_xor_si512( VF, V0 ) );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
V0 = _mm512_add_epi64( V0, V5 );
@@ -714,7 +716,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
V1 = _mm512_add_epi64( V1, V5 );
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 );
VD = mm512_swap64_32( _mm512_xor_si512( VD, V1 ) );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
@@ -728,7 +730,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// V2 = _mm512_add_epi64( V2, V6 );
V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
_mm512_set1_epi64( CBF ), M9 ) );
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 );
VE = mm512_swap64_32( _mm512_xor_si512( VE, V2 ) );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
@@ -742,7 +744,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
// _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 );
VF = mm512_swap64_32( _mm512_xor_si512( VF, V3 ) );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
@@ -1054,20 +1056,22 @@ blake512_8way_close(void *cc, void *dst)
// Blake-512 4 way
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) \
{ \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
} while (0)
}
#define ROUND_B_4WAY(r) do { \
#define ROUND_B_4WAY(r) \
{ \
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
@@ -1076,13 +1080,13 @@ blake512_8way_close(void *cc, void *dst)
GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
}
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
uint64_t T0, T1;
#define COMPRESS64_4WAY do \
#define COMPRESS64_4WAY \
{ \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -1147,7 +1151,7 @@ blake512_8way_close(void *cc, void *dst)
H5 = mm256_xor3( VD, V5, H5 ); \
H6 = mm256_xor3( VE, V6, H6 ); \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
}
void blake512_4way_compress( blake_4way_big_context *sc )
@@ -1277,7 +1281,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
// G4 skip nonce
V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 );
VF = mm256_swap64_32( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
V0 = _mm256_add_epi64( V0, V5 );
@@ -1364,7 +1368,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
// finish round 0, with the nonce now available
V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
_mm256_set1_epi64x( CB8 ), M9 ) );
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 );
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
@@ -1374,34 +1378,34 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
// G1
V1 = _mm256_add_epi64( V1, V5 );
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 );
VD = mm256_swap64_32( _mm256_xor_si256( VD, V1 ) );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 );
VD = mm256_shuflr64_16( _mm256_xor_si256( VD, V1 ) );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
// G2
V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
_mm256_set1_epi64x( CBF ), M9 ) );
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 );
VE = mm256_swap64_32( _mm256_xor_si256( VE, V2 ) );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), MF ), V6 ) );
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 );
VE = mm256_shuflr64_16( _mm256_xor_si256( VE, V2 ) );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
// G3
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 );
VF = mm256_swap64_32( _mm256_xor_si256( VF, V3 ) );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 );
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V3 ) );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );

View File

@@ -1,74 +0,0 @@
#include "decred-gate.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#include <unistd.h>
#if defined (DECRED_4WAY)
static __thread blake256_4way_context blake_mid;
void decred_hash_4way( void *state, const void *input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
// uint32_t hash0[8] __attribute__ ((aligned (32)));
// uint32_t hash1[8] __attribute__ ((aligned (32)));
// uint32_t hash2[8] __attribute__ ((aligned (32)));
// uint32_t hash3[8] __attribute__ ((aligned (32)));
const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
int tail_len = 180 - DECRED_MIDSTATE_LEN;
blake256_4way_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way_update( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
// copy to buffer guaranteed to be aligned.
memcpy( edata, pdata, 180 );
// use the old way until new way updated for size.
mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {
* noncep = n;
*(noncep+1) = n+1;
*(noncep+2) = n+2;
*(noncep+3) = n+3;
decred_hash_4way( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= HTarget )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[DECRED_NONCE_INDEX] = n+i;
submit_solution( work, hash+(i<<3), mythr );
}
n += 4;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -1,171 +0,0 @@
#include "decred-gate.h"
#include <unistd.h>
#include <memory.h>
#include <string.h>
uint32_t *decred_get_nonceptr( uint32_t *work_data )
{
return &work_data[ DECRED_NONCE_INDEX ];
}
long double decred_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
int m;
long double d = (long double)0x0000ffff / (long double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
for ( m = 29; m < shift; m++ )
d /= 256.0;
if ( shift == 28 )
d *= 256.0; // testnet
if ( opt_debug_diff )
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", (double)d,
shift, bits );
return net_diff;
}
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
{
// some random extradata to make the work unique
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
work->height = work->data[32];
if (!have_longpoll && work->height > *net_blocks + 1)
{
char netinfo[64] = { 0 };
if ( net_diff > 0. )
{
if (net_diff != work->targetdiff)
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
work->targetdiff);
else
sprintf(netinfo, ", diff %.3f", net_diff);
}
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
netinfo);
*net_blocks = work->height - 1;
}
}
void decred_be_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
sctx->xnonce1_size );
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
#if !defined(min)
#define min(a,b) (a>b ? (b) :(a))
#endif
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
uchar merkle_root[64] = { 0 };
uint32_t extraheader[32] = { 0 };
int headersize = 0;
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
int i;
// getwork over stratum, getwork merkle + header passed in coinb1
memcpy(merkle_root, sctx->job.coinbase, 32);
headersize = min((int)sctx->job.coinbase_size - 32,
sizeof(extraheader) );
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = swab32(
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
// for ( i = 0; i < 8; i++ ) // prevhash
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
// for ( i = 0; i < 8; i++ ) // merkle
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
for ( i = 0; i < headersize/4; i++ ) // header
g_work->data[17 + i] = extraheader[i];
// extradata
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
g_work->data[i] = 0;
g_work->data[37] = (rand()*4) << 8;
// block header suffix from coinb2 (stake version)
memcpy( &g_work->data[44],
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
sctx->block_height = g_work->data[32];
//applog_hex(work->data, 180);
//applog_hex(&work->data[36], 36);
}
#undef min
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
// need to regen g_work..
return false;
if ( have_stratum && !work->data[0] && !opt_benchmark )
{
sleep(1);
return false;
}
// extradata: prevent duplicates
work->data[ DECRED_XNONCE_INDEX ] += 1;
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
return true;
}
int decred_get_work_data_size() { return DECRED_DATA_SIZE; }
bool register_decred_algo( algo_gate_t* gate )
{
#if defined(DECRED_4WAY)
four_way_not_tested();
gate->scanhash = (void*)&scanhash_decred_4way;
gate->hash = (void*)&decred_hash_4way;
#else
gate->scanhash = (void*)&scanhash_decred;
gate->hash = (void*)&decred_hash;
#endif
gate->optimizations = AVX2_OPT;
// gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->decode_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->build_extraheader = (void*)&decred_build_extraheader;
gate->ready_to_mine = (void*)&decred_ready_to_mine;
gate->nbits_index = DECRED_NBITS_INDEX;
gate->ntime_index = DECRED_NTIME_INDEX;
gate->nonce_index = DECRED_NONCE_INDEX;
gate->get_work_data_size = (void*)&decred_get_work_data_size;
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
allow_mininginfo = false;
have_gbt = false;
return true;
}

View File

@@ -1,36 +0,0 @@
#ifndef __DECRED_GATE_H__
#define __DECRED_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#define DECRED_NBITS_INDEX 29
#define DECRED_NTIME_INDEX 34
#define DECRED_NONCE_INDEX 35
#define DECRED_XNONCE_INDEX 36
#define DECRED_DATA_SIZE 192
#define DECRED_WORK_COMPARE_SIZE 140
#define DECRED_MIDSTATE_LEN 128
#if defined (__AVX2__)
//void blakehash_84way(void *state, const void *input);
//int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
#endif
#if defined(__SSE4_2__)
#define DECRED_4WAY
#endif
#if defined (DECRED_4WAY)
void decred_hash_4way(void *state, const void *input);
int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void decred_hash( void *state, const void *input );
int scanhash_decred( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -1,282 +0,0 @@
#include "decred-gate.h"
#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
#include "sph_blake.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#include <unistd.h>
/*
#ifndef min
#define min(a,b) (a>b ? b : a)
#endif
#ifndef max
#define max(a,b) (a<b ? b : a)
#endif
*/
/*
#define DECRED_NBITS_INDEX 29
#define DECRED_NTIME_INDEX 34
#define DECRED_NONCE_INDEX 35
#define DECRED_XNONCE_INDEX 36
#define DECRED_DATA_SIZE 192
#define DECRED_WORK_COMPARE_SIZE 140
*/
static __thread sph_blake256_context blake_mid;
static __thread bool ctx_midstate_done = false;
void decred_hash(void *state, const void *input)
{
// #define MIDSTATE_LEN 128
sph_blake256_context ctx __attribute__ ((aligned (64)));
uint8_t *ending = (uint8_t*) input;
ending += DECRED_MIDSTATE_LEN;
if (!ctx_midstate_done) {
sph_blake256_init(&blake_mid);
sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
ctx_midstate_done = true;
}
memcpy(&ctx, &blake_mid, sizeof(blake_mid));
sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
sph_blake256_close(&ctx, state);
}
void decred_hash_simple(void *state, const void *input)
{
sph_blake256_context ctx;
sph_blake256_init(&ctx);
sph_blake256(&ctx, input, 180);
sph_blake256_close(&ctx, state);
}
int scanhash_decred( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) endiandata[48];
uint32_t _ALIGN(64) hash32[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
// #define DCR_NONCE_OFT32 35
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t n = first_nonce;
ctx_midstate_done = false;
#if 1
memcpy(endiandata, pdata, 180);
#else
for (int k=0; k < (180/4); k++)
be32enc(&endiandata[k], pdata[k]);
#endif
do {
//be32enc(&endiandata[DCR_NONCE_OFT32], n);
endiandata[DECRED_NONCE_INDEX] = n;
decred_hash(hash32, endiandata);
if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
{
pdata[DECRED_NONCE_INDEX] = n;
submit_solution( work, hash32, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[DECRED_NONCE_INDEX] = n;
return 0;
}
/*
uint32_t *decred_get_nonceptr( uint32_t *work_data )
{
return &work_data[ DECRED_NONCE_INDEX ];
}
double decred_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
int m;
double d = (double)0x0000ffff / (double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
for ( m = 29; m < shift; m++ )
d /= 256.0;
if ( shift == 28 )
d *= 256.0; // testnet
if ( opt_debug_diff )
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
shift, bits );
return net_diff;
}
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
{
// some random extradata to make the work unique
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
work->height = work->data[32];
if (!have_longpoll && work->height > *net_blocks + 1)
{
char netinfo[64] = { 0 };
if (net_diff > 0.)
{
if (net_diff != work->targetdiff)
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
work->targetdiff);
else
sprintf(netinfo, ", diff %.3f", net_diff);
}
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
netinfo);
*net_blocks = work->height - 1;
}
}
void decred_be_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
sctx->xnonce1_size );
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
*/
/*
// data shared between gen_merkle_root and build_extraheader.
__thread uint32_t decred_extraheader[32] = { 0 };
__thread int decred_headersize = 0;
void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
{
// getwork over stratum, getwork merkle + header passed in coinb1
memcpy(merkle_root, sctx->job.coinbase, 32);
decred_headersize = min((int)sctx->job.coinbase_size - 32,
sizeof(decred_extraheader) );
memcpy( decred_extraheader, &sctx->job.coinbase[32], decred_headersize);
}
*/
/*
#define min(a,b) (a>b ? (b) :(a))
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
uchar merkle_root[64] = { 0 };
uint32_t extraheader[32] = { 0 };
int headersize = 0;
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
size_t t;
int i;
// getwork over stratum, getwork merkle + header passed in coinb1
memcpy(merkle_root, sctx->job.coinbase, 32);
headersize = min((int)sctx->job.coinbase_size - 32,
sizeof(extraheader) );
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = swab32(
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
// for ( i = 0; i < 8; i++ ) // prevhash
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
// for ( i = 0; i < 8; i++ ) // merkle
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
for ( i = 0; i < headersize/4; i++ ) // header
g_work->data[17 + i] = extraheader[i];
// extradata
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
g_work->data[i] = 0;
g_work->data[37] = (rand()*4) << 8;
// block header suffix from coinb2 (stake version)
memcpy( &g_work->data[44],
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
sctx->bloc_height = g_work->data[32];
//applog_hex(work->data, 180);
//applog_hex(&work->data[36], 36);
}
#undef min
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
// need to regen g_work..
return false;
if ( have_stratum && !work->data[0] && !opt_benchmark )
{
sleep(1);
return false;
}
// extradata: prevent duplicates
work->data[ DECRED_XNONCE_INDEX ] += 1;
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
return true;
}
bool register_decred_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_decred;
gate->hash = (void*)&decred_hash;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->decode_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->build_extraheader = (void*)&decred_build_extraheader;
gate->ready_to_mine = (void*)&decred_ready_to_mine;
gate->nbits_index = DECRED_NBITS_INDEX;
gate->ntime_index = DECRED_NTIME_INDEX;
gate->nonce_index = DECRED_NONCE_INDEX;
gate->work_data_size = DECRED_DATA_SIZE;
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
allow_mininginfo = false;
have_gbt = false;
return true;
}
*/
#endif

View File

@@ -1,6 +1,6 @@
#include "pentablake-gate.h"
#if defined (__AVX2__)
#if defined(PENTABLAKE_4WAY)
#include <stdlib.h>
#include <stdint.h>

View File

@@ -4,9 +4,10 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#define PENTABLAKE_4WAY
#endif
// 4way is broken
//#if defined(__AVX2__)
// #define PENTABLAKE_4WAY
//#endif
#if defined(PENTABLAKE_4WAY)
void pentablakehash_4way( void *state, const void *input );

View File

@@ -35,7 +35,6 @@
#include "sph_blake2b.h"
// Little-endian byte access.
#define B2B_GET64(p) \
(((uint64_t) ((uint8_t *) (p))[0]) ^ \
(((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
@@ -46,69 +45,78 @@
(((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
(((uint64_t) ((uint8_t *) (p))[7]) << 56))
// G Mixing function.
#if defined(__AVX2__)
#define BLAKE2B_G( R, Sa, Sb, Sc, Sd, Na, Nb ) \
#define BLAKE2B_G( Sa, Sb, Sc, Sd, Se, Sf, Sg, Sh ) \
{ \
V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
_mm256_set_epi64x( m[ sigma[R][Sd] ], m[ sigma[R][Sc] ], \
m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), Na ); \
_mm256_set_epi64x( m[ sigmaR[ Sg ] ], m[ sigmaR[ Se ] ], \
m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
V[3] = mm256_swap64_32( _mm256_xor_si256( V[3], V[0] ) ); \
V[2] = _mm256_add_epi64( V[2], V[3] ); \
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), Nb ); \
V[1] = mm256_shuflr64_24( _mm256_xor_si256( V[1], V[2] ) ); \
\
V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
_mm256_set_epi64x( m[ sigmaR[ Sh ] ], m[ sigmaR[ Sf ] ], \
m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
V[3] = mm256_shuflr64_16( _mm256_xor_si256( V[3], V[0] ) ); \
V[2] = _mm256_add_epi64( V[2], V[3] ); \
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m256i *V = (__m256i*)v; \
BLAKE2B_G( R, 0, 2, 4, 6, 32, 24 ); \
BLAKE2B_G( R, 1, 3, 5, 7, 16, 63 ); \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \
V[3] = mm256_shufll_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shuflr_64( V[1] ); \
BLAKE2B_G( R, 8, 10, 12, 14, 32, 24 ); \
BLAKE2B_G( R, 9, 11, 13, 15, 16, 63 ); \
BLAKE2B_G( 8, 9, 10, 11, 12, 13, 14, 15 ); \
V[3] = mm256_shuflr_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shufll_64( V[1] ); \
}
#elif defined(__SSSE3__)
#elif defined(__SSE2__)
// always true
#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb, Na, Nb ) \
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
Vd = mm128_ror_64( _mm_xor_si128( Vd, Va ), Na ); \
_mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), Nb ); \
Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m128i *V = (__m128i*)v; \
__m128i V2, V3, V6, V7; \
BLAKE2B_G( R, V[0], V[2], V[4], V[6], 0, 2, 32, 24 ); \
BLAKE2B_G( R, V[0], V[2], V[4], V[6], 1, 3, 16, 63 ); \
BLAKE2B_G( R, V[1], V[3], V[5], V[7], 4, 6, 32, 24 ); \
BLAKE2B_G( R, V[1], V[3], V[5], V[7], 5, 7, 16, 63 ); \
V2 = mm128_shufl2r_64( V[2], V[3] ); \
V3 = mm128_shufl2r_64( V[3], V[2] ); \
V6 = mm128_shufl2l_64( V[6], V[7] ); \
V7 = mm128_shufl2l_64( V[7], V[6] ); \
BLAKE2B_G( R, V[0], V2, V[5], V6, 8, 10, 32, 24 ); \
BLAKE2B_G( R, V[0], V2, V[5], V6, 9, 11, 16, 63 ); \
BLAKE2B_G( R, V[1], V3, V[4], V7, 12, 14, 32, 24 ); \
BLAKE2B_G( R, V[1], V3, V[4], V7, 13, 15, 16, 63 ); \
V[2] = mm128_shufl2l_64( V2, V3 ); \
V[3] = mm128_shufl2l_64( V3, V2 ); \
V[6] = mm128_shufl2r_64( V6, V7 ); \
V[7] = mm128_shufl2r_64( V7, V6 ); \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3, 1 ); \
V[3] = mm128_alignr_64( V3, V2, 1 ); \
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
#else
// never used, SSE2 is always available
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
@@ -120,6 +128,7 @@
Vd = ROTR64( Vd ^ Va, 32 ); \
Vc = Vc + Vd; \
Vb = ROTR64( Vb ^ Vc, 24 ); \
\
Va = Va + Vb + m[ sigma[R][Sb] ]; \
Vd = ROTR64( Vd ^ Va, 16 ); \
Vc = Vc + Vd; \

View File

@@ -747,38 +747,40 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
mj[14] = mm256_rol_64( M[14], 15 );
mj[15] = mm256_rol_64( M[15], 16 );
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
(const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8],
(const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) );
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9],
(const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) );
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10],
(const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) );
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11],
(const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) );
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12],
(const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) );
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13],
(const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) );
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14],
(const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) );
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15],
(const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) );
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0],
(const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) );
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1],
(const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) );
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2],
(const __m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) );
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3],
(const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) );
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4],
(const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) );
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5],
(const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) );
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6],
(const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) );
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm256_add_epi64( K, Kincr );
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], K );
K = _mm256_add_epi64( K, Kincr );
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], K );
K = _mm256_add_epi64( K, Kincr );
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], K );
K = _mm256_add_epi64( K, Kincr );
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], K );
K = _mm256_add_epi64( K, Kincr );
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], K );
K = _mm256_add_epi64( K, Kincr );
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], K );
K = _mm256_add_epi64( K, Kincr );
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], K );
K = _mm256_add_epi64( K, Kincr );
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], K );
K = _mm256_add_epi64( K, Kincr );
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], K );
K = _mm256_add_epi64( K, Kincr );
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], K );
K = _mm256_add_epi64( K, Kincr );
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], K );
K = _mm256_add_epi64( K, Kincr );
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], K );
K = _mm256_add_epi64( K, Kincr );
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], K );
K = _mm256_add_epi64( K, Kincr );
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], K );
K = _mm256_add_epi64( K, Kincr );
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], K );
qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) );
qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) );
@@ -1180,7 +1182,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
__m512i mj[16];
uint64_t K = 16 * 0x0555555555555555ULL;
mj[ 0] = mm512_rol_64( M[ 0], 1 );
mj[ 1] = mm512_rol_64( M[ 1], 2 );
@@ -1199,54 +1200,40 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
mj[14] = mm512_rol_64( M[14], 15 );
mj[15] = mm512_rol_64( M[15], 16 );
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
(const __m512i)_mm512_set1_epi64( K ) );
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm512_add_epi64( K, Kincr );
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8], K );
K = _mm512_add_epi64( K, Kincr );
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9], K );
K = _mm512_add_epi64( K, Kincr );
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10], K );
K = _mm512_add_epi64( K, Kincr );
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11], K );
K = _mm512_add_epi64( K, Kincr );
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12], K );
K = _mm512_add_epi64( K, Kincr );
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13], K );
K = _mm512_add_epi64( K, Kincr );
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14], K );
K = _mm512_add_epi64( K, Kincr );
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15], K );
K = _mm512_add_epi64( K, Kincr );
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0], K );
K = _mm512_add_epi64( K, Kincr );
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1], K );
K = _mm512_add_epi64( K, Kincr );
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2], K );
K = _mm512_add_epi64( K, Kincr );
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3], K );
K = _mm512_add_epi64( K, Kincr );
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4], K );
K = _mm512_add_epi64( K, Kincr );
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5], K );
K = _mm512_add_epi64( K, Kincr );
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6], K );
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
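The hunks above replace a scalar running constant, re-broadcast with _mm512_set1_epi64 at every step, with a vector register K that is bumped once per step using _mm512_add_epi64. A minimal scalar sketch of the invariant being preserved, assuming the BMW per-step constant is K_j = j * 0x0555555555555555 for j = 16..31 as the replaced scalar code implies:

#include <stdint.h>
#include <assert.h>

int main()
{
    const uint64_t kincr = 0x0555555555555555ULL;
    uint64_t k = 16 * kincr;                  /* seed value, as in the vector code */
    for ( int j = 16; j < 32; j++ )
    {
        assert( k == (uint64_t)j * kincr );   /* matches the per-step broadcast it replaces */
        k += kincr;                           /* one add per step, like _mm512_add_epi64( K, Kincr ) */
    }
    return 0;
}

Keeping the constant in a register trades one broadcast per step for one vector add; that is the apparent intent, not a measured claim.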

View File

@@ -24,9 +24,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -46,9 +43,6 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
{
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();

View File

@@ -22,9 +22,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -43,9 +40,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
{
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -54,8 +48,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
// INIT256(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;

View File

@@ -26,9 +26,6 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = m512_zero;
@@ -54,8 +51,8 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
__m512i* in = (__m512i*)input;
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
// if (ctx->chaining == NULL || ctx->buffer == NULL)
// return 1;
for ( i = 0; i < SIZE256; i++ )
{
@@ -179,8 +176,8 @@ int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
ctx->hashlen = hashlen;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
// if (ctx->chaining == NULL || ctx->buffer == NULL)
// return 1;
for ( i = 0; i < SIZE256; i++ )
{
@@ -207,9 +204,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
__m256i* in = (__m256i*)input;
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = m256_zero;

View File

@@ -21,9 +21,6 @@
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
memset_zero_512( ctx->chaining, SIZE512 );
memset_zero_512( ctx->buffer, SIZE512 );
@@ -142,9 +139,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
{
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
memset_zero_256( ctx->chaining, SIZE512 );
memset_zero_256( ctx->buffer, SIZE512 );

View File

@@ -73,11 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
be32enc(&endiandata[19], nonce);
myriad_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget))
if (hash[7] <= Htarg )
if ( fulltest(hash, ptarget) && !opt_benchmark )
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
submit_solution( work, hash, mythr );
}
nonce++;

View File

@@ -585,9 +585,8 @@ do { \
t = _mm512_xor_si512( t, c ); \
d = mm512_xoror( a, b, t ); \
t = mm512_xorand( t, a, b ); \
b = mm512_xor3( b, d, t ); \
a = c; \
c = b; \
c = mm512_xor3( b, d, t ); \
b = d; \
d = mm512_not( t ); \
} while (0)
@@ -635,7 +634,7 @@ do { \
#define ROUND_BIG8( alpha ) \
do { \
__m512i t0, t1, t2, t3; \
__m512i t0, t1, t2, t3, t4, t5; \
s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
@@ -662,43 +661,35 @@ do { \
s5 = mm512_swap64_32( s5 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t1, s9, t3 ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t0, s9, t1 ); \
\
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
L8( s1, t1, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
L8( s1, t2, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t1, sB, t3 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t4, sB, t5 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
L8( s3, t1, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \
L8( s3, t2, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
sC = _mm512_mask_blend_epi32( 0x5555, t5, t3 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 ); \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\
@@ -924,10 +915,9 @@ do { \
d = _mm256_xor_si256( d, a ); \
a = _mm256_and_si256( a, b ); \
t = _mm256_xor_si256( t, a ); \
b = _mm256_xor_si256( b, d ); \
b = _mm256_xor_si256( b, t ); \
a = c; \
c = b; \
c = _mm256_xor_si256( b, d ); \
c = _mm256_xor_si256( c, t ); \
b = d; \
d = mm256_not( t ); \
} while (0)
@@ -977,7 +967,7 @@ do { \
#define ROUND_BIG( alpha ) \
do { \
__m256i t0, t1, t2, t3; \
__m256i t0, t1, t2, t3, t4, t5; \
s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
@@ -1004,43 +994,35 @@ do { \
s5 = mm256_swap64_32( s5 ); \
sD = mm256_swap64_32( sD ); \
sE = mm256_swap64_32( sE ); \
t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \
t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \
L( s0, t1, s9, t3 ); \
s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \
s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \
sD = _mm256_blend_epi32( sD, t3, 0x55 ); \
sE = _mm256_blend_epi32( sE, t3, 0xaa ); \
t0 = _mm256_blend_epi32( s4, s5, 0xaa ); \
t1 = _mm256_blend_epi32( sD, sE, 0xaa ); \
L( s0, t0, s9, t1 ); \
\
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \
t2 = _mm256_blend_epi32( s5, s6, 0xaa ); \
t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
L( s1, t1, sA, t3 ); \
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \
sE = _mm256_blend_epi32( sE, t3, 0x55 ); \
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
L( s1, t2, sA, t3 ); \
s5 = _mm256_blend_epi32( t0, t2, 0x55 ); \
sE = _mm256_blend_epi32( t1, t3, 0x55 ); \
\
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \
t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \
L( s2, t1, sB, t3 ); \
s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \
s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \
sF = _mm256_blend_epi32( sF, t3, 0x55 ); \
sC = _mm256_blend_epi32( sC, t3, 0xaa ); \
t4 = _mm256_blend_epi32( s6, s7, 0xaa ); \
t5 = _mm256_blend_epi32( sF, sC, 0xaa ); \
L( s2, t4, sB, t5 ); \
s6 = _mm256_blend_epi32( t2, t4, 0x55 ); \
sF = _mm256_blend_epi32( t3, t5, 0x55 ); \
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
\
t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \
t2 = _mm256_blend_epi32( s7, s4, 0xaa ); \
t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
L( s3, t1, s8, t3 ); \
s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \
s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \
sC = _mm256_blend_epi32( sC, t3, 0x55 ); \
sD = _mm256_blend_epi32( sD, t3, 0xaa ); \
L( s3, t2, s8, t3 ); \
s7 = _mm256_blend_epi32( t4, t2, 0x55 ); \
s4 = _mm256_blend_epi32( t0, t2, 0xaa ); \
sC = _mm256_blend_epi32( t5, t3, 0x55 ); \
sD = _mm256_blend_epi32( t1, t3, 0xaa ); \
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
\
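ROUND_BIG interleaves even and odd 32-bit lanes of neighbouring state words before each L() call, and the blend masks do the lane selection: each set mask bit takes the lane from the second source. A minimal sketch of that selection with _mm256_blend_epi32 and the 0xaa mask used above (values are arbitrary, AVX2 assumed):

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

int main()
{
    __m256i a = _mm256_set_epi32( 70, 60, 50, 40, 30, 20, 10,  0 );  /* "even" source */
    __m256i b = _mm256_set_epi32( 71, 61, 51, 41, 31, 21, 11,  1 );  /* "odd" source  */
    __m256i c = _mm256_blend_epi32( a, b, 0xaa );   /* mask bit set -> take lane from b */
    uint32_t out[8];
    _mm256_storeu_si256( (__m256i*)out, c );
    for ( int i = 0; i < 8; i++ )
        printf( "%u ", (unsigned)out[i] );          /* prints: 0 11 20 31 40 51 60 71 */
    printf( "\n" );
    return 0;
}

The rewritten round keeps the blended temporaries t0..t5 alive across the L() calls instead of scattering them straight back into s4..sF, which appears to trim the number of blends needed to rebuild the state.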

View File

@@ -141,6 +141,13 @@ do { \
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
} while (0)
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), w ); \
} while (0)
/*
* PASSy(n, in) computes pass number "y", for a total of "n", using the
* one-argument macro "in" to access input words. Current state is assumed
@@ -152,22 +159,22 @@ do { \
#define PASS1(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
STEP1(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0) ); \
STEP1(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1) ); \
STEP1(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2) ); \
STEP1(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3) ); \
STEP1(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4) ); \
STEP1(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5) ); \
STEP1(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6) ); \
STEP1(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7) ); \
} \
} while (0)
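PASS1 uses an all-zero round constant, so the new STEP1/STEP1_8W drop the add of a broadcast zero that the generic STEP performs. A scalar sketch of the same simplification, assuming the usual HAVAL step x7 = ror(f,7) + ror(x7,11) + w + K with K = 0 in pass 1, which is what the macros above express in vector form:

#include <stdint.h>

static inline uint32_t ror32( uint32_t x, unsigned n )
{
    return ( x >> n ) | ( x << ( 32 - n ) );
}

/* generic step: rotate the boolean mix and the accumulator, add word and constant */
static inline uint32_t haval_step( uint32_t f, uint32_t x7, uint32_t w, uint32_t k )
{
    return ror32( f, 7 ) + ror32( x7, 11 ) + w + k;
}

/* pass-1 step: the constant is always zero, so one add is saved per step */
static inline uint32_t haval_step1( uint32_t f, uint32_t x7, uint32_t w )
{
    return ror32( f, 7 ) + ror32( x7, 11 ) + w;
}

int main()
{
    uint32_t f = 0x13579bdf, x7 = 0x2468ace0, w = 0xdeadbeef;
    return haval_step( f, x7, w, 0 ) == haval_step1( f, x7, w ) ? 0 : 1;
}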
@@ -605,25 +612,32 @@ do { \
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
} while (0)
#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
mm256_ror_32( x7, 11 ) ), w ); \
} while (0)
#define PASS1_8W(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
STEP1_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0) ); \
STEP1_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1) ); \
STEP1_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2) ); \
STEP1_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3) ); \
STEP1_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4) ); \
STEP1_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5) ); \
STEP1_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6) ); \
STEP1_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7) ); \
} \
} while (0)

View File

@@ -1,382 +0,0 @@
/*
* HEFTY1 cryptographic hash function
*
* Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the FreeBSD Project.
*/
#include <assert.h>
#include <string.h>
#ifdef _MSC_VER
#define inline __inline
#endif
#include "sph_hefty1.h"
#define Min(A, B) (A <= B ? A : B)
#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \
{ \
/* To thwart parallelism, Br modifies itself each time it's \
* called. This also means that calling it in different \
* orders yields different results. In C the order of \
* evaluation of function arguments and + operands are \
* unspecified (and depends on the compiler), so we must make \
* the order of Br calls explicit. \
*/ \
uint32_t brG = Br(ctx, G); \
uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \
uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \
uint32_t brC = Br(ctx, C); \
uint32_t brB = Br(ctx, B); \
uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \
uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \
H = G; \
G = F; \
F = E; \
E = D + Br(ctx, tmp2); \
D = C; \
C = B; \
B = A; \
A = tmp2 + tmp4; \
} \
/* Nothing up my sleeve constants */
const static uint32_t K[64] = {
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
};
/* Initial hash values */
const static uint32_t H[HEFTY1_STATE_WORDS] = {
0x6a09e667UL,
0xbb67ae85UL,
0x3c6ef372UL,
0xa54ff53aUL,
0x510e527fUL,
0x9b05688cUL,
0x1f83d9abUL,
0x5be0cd19UL
};
static inline uint32_t Rr(uint32_t X, uint8_t n)
{
return (X >> n) | (X << (32 - n));
}
static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G)
{
return (E & F) ^ (~E & G);
}
static inline uint32_t Sigma1(uint32_t E)
{
return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25);
}
static inline uint32_t sigma1(uint32_t X)
{
return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10);
}
static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C)
{
return (A & B) ^ (A & C) ^ (B & C);
}
static inline uint32_t Sigma0(uint32_t A)
{
return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22);
}
static inline uint32_t sigma0(uint32_t X)
{
return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3);
}
static inline uint32_t Reverse32(uint32_t n)
{
#if BYTE_ORDER == LITTLE_ENDIAN
return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24;
#else
return n;
#endif
}
static inline uint64_t Reverse64(uint64_t n)
{
#if BYTE_ORDER == LITTLE_ENDIAN
uint32_t a = n >> 32;
uint32_t b = (n << 32) >> 32;
return (uint64_t)Reverse32(b) << 32 | Reverse32(a);
#else
return n;
#endif
}
/* Smoosh byte into nibble */
static inline uint8_t Smoosh4(uint8_t X)
{
return (X >> 4) ^ (X & 0xf);
}
/* Smoosh 32-bit word into 2-bits */
static inline uint8_t Smoosh2(uint32_t X)
{
uint16_t w = (X >> 16) ^ (X & 0xffff);
uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff));
return (n >> 2) ^ (n & 0x3);
}
static void Mangle(uint32_t *S)
{
uint32_t *R = S;
uint32_t *C = &S[1];
uint8_t r0 = Smoosh4(R[0] >> 24);
uint8_t r1 = Smoosh4(R[0] >> 16);
uint8_t r2 = Smoosh4(R[0] >> 8);
uint8_t r3 = Smoosh4(R[0] & 0xff);
int i;
/* Diffuse */
uint32_t tmp = 0;
for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) {
uint8_t r = Smoosh2(tmp);
switch (r) {
case 0:
C[i] ^= Rr(R[0], i + r0);
break;
case 1:
C[i] += Rr(~R[0], i + r1);
break;
case 2:
C[i] &= Rr(~R[0], i + r2);
break;
case 3:
C[i] ^= Rr(R[0], i + r3);
break;
}
tmp ^= C[i];
}
/* Compress */
tmp = 0;
for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++)
if (i % 2)
tmp ^= C[i];
else
tmp += C[i];
R[0] ^= tmp;
}
static void Absorb(uint32_t *S, uint32_t X)
{
uint32_t *R = S;
R[0] ^= X;
Mangle(S);
}
static uint32_t Squeeze(uint32_t *S)
{
uint32_t Y = S[0];
Mangle(S);
return Y;
}
/* Branch, compress and serialize function */
static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X)
{
uint32_t R = Squeeze(ctx->sponge);
uint8_t r0 = R >> 8;
uint8_t r1 = R & 0xff;
uint32_t Y = 1 << (r0 % 32);
switch (r1 % 4)
{
case 0:
/* Do nothing */
break;
case 1:
return X & ~Y;
case 2:
return X | Y;
case 3:
return X ^ Y;
}
return X;
}
static void HashBlock(HEFTY1_CTX *ctx)
{
uint32_t A, B, C, D, E, F, G, H;
uint32_t W[HEFTY1_BLOCK_BYTES];
assert(ctx);
A = ctx->h[0];
B = ctx->h[1];
C = ctx->h[2];
D = ctx->h[3];
E = ctx->h[4];
F = ctx->h[5];
G = ctx->h[6];
H = ctx->h[7];
int t = 0;
for (; t < 16; t++) {
W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */
Absorb(ctx->sponge, W[t] ^ K[t]);
}
for (t = 0; t < 16; t++) {
Absorb(ctx->sponge, D ^ H);
RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
}
for (t = 16; t < 64; t++) {
Absorb(ctx->sponge, H + D);
W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
}
ctx->h[0] += A;
ctx->h[1] += B;
ctx->h[2] += C;
ctx->h[3] += D;
ctx->h[4] += E;
ctx->h[5] += F;
ctx->h[6] += G;
ctx->h[7] += H;
A = 0;
B = 0;
C = 0;
D = 0;
E = 0;
F = 0;
G = 0;
H = 0;
memset(W, 0, sizeof(W));
}
/* Public interface */
void HEFTY1_Init(HEFTY1_CTX *ctx)
{
assert(ctx);
memcpy(ctx->h, H, sizeof(ctx->h));
memset(ctx->block, 0, sizeof(ctx->block));
ctx->written = 0;
memset(ctx->sponge, 0, sizeof(ctx->sponge));
}
void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len)
{
assert(ctx);
uint64_t read = 0;
while (len) {
size_t end = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
size_t count = Min(len, HEFTY1_BLOCK_BYTES - end);
memcpy(&ctx->block[end], &((unsigned char *)buf)[read], count);
len -= count;
read += count;
ctx->written += count;
if (!(ctx->written % HEFTY1_BLOCK_BYTES))
HashBlock(ctx);
}
}
void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx)
{
assert(digest);
assert(ctx);
/* Pad message (FIPS 180 Section 5.1.1) */
size_t used = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
ctx->block[used++] = 0x80; /* Append 1 to end of message */
if (used > HEFTY1_BLOCK_BYTES - 8) {
/* We have already written into the last 64bits, so
* we must continue into the next block. */
memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - used);
HashBlock(ctx);
used = 0; /* Create a new block (below) */
}
/* All remaining bits to zero */
memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - used);
/* The last 64bits encode the length (in network byte order) */
uint64_t *len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8];
*len = Reverse64(ctx->written*8);
HashBlock(ctx);
/* Convert back to network byte order */
int i = 0;
for (; i < HEFTY1_STATE_WORDS; i++)
ctx->h[i] = Reverse32(ctx->h[i]);
memcpy(digest, ctx->h, sizeof(ctx->h));
memset(ctx, 0, sizeof(HEFTY1_CTX));
}
unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest)
{
HEFTY1_CTX ctx;
static unsigned char m[HEFTY1_DIGEST_BYTES];
if (!digest)
digest = m;
HEFTY1_Init(&ctx);
HEFTY1_Update(&ctx, buf, len);
HEFTY1_Final(digest, &ctx);
return digest;
}

View File

@@ -1,66 +0,0 @@
/*
* HEFTY1 cryptographic hash function
*
* Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the FreeBSD Project.
*/
#ifndef __HEFTY1_H__
#define __HEFTY1_H__
#ifdef __cplusplus
extern "C" {
#endif
#ifndef WIN32
#include <sys/types.h>
#endif
#include <inttypes.h>
#define HEFTY1_DIGEST_BYTES 32
#define HEFTY1_BLOCK_BYTES 64
#define HEFTY1_STATE_WORDS 8
#define HEFTY1_SPONGE_WORDS 4
typedef struct HEFTY1_CTX {
uint32_t h[HEFTY1_STATE_WORDS];
uint8_t block[HEFTY1_BLOCK_BYTES];
uint64_t written;
uint32_t sponge[HEFTY1_SPONGE_WORDS];
} HEFTY1_CTX;
void HEFTY1_Init(HEFTY1_CTX *cxt);
void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len);
void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt);
unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest);
#ifdef __cplusplus
}
#endif
#endif /* __HEFTY1_H__ */

View File

@@ -49,12 +49,11 @@ extern "C"{
#define Sb_8W(x0, x1, x2, x3, c) \
do { \
__m512i cc = _mm512_set1_epi64( c ); \
x3 = mm512_not( x3 ); \
const __m512i cc = _mm512_set1_epi64( c ); \
x0 = mm512_xorandnot( x0, x2, cc ); \
tmp = mm512_xorand( cc, x0, x1 ); \
x0 = mm512_xorand( x0, x2, x3 ); \
x3 = mm512_xorandnot( x3, x1, x2 ); \
x0 = mm512_xorandnot( x0, x3, x2 ); \
x3 = _mm512_ternarylogic_epi64( x3, x1, x2, 0x2d ); /* ~x3 ^ (~x1 & x2) */\
x1 = mm512_xorand( x1, x0, x2 ); \
x2 = mm512_xorandnot( x2, x3, x0 ); \
x0 = mm512_xoror( x0, x1, x3 ); \
@@ -79,7 +78,7 @@ do { \
#define Sb(x0, x1, x2, x3, c) \
do { \
__m256i cc = _mm256_set1_epi64x( c ); \
const __m256i cc = _mm256_set1_epi64x( c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
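The AVX-512 form of Sb_8W folds the explicit mm512_not( x3 ) into a single vpternlog with immediate 0x2d, annotated above as ~x3 ^ (~x1 & x2). Such immediates can be derived mechanically: the imm8 is the truth table of the desired three-input function, indexed by (a<<2)|(b<<1)|c where a, b, c are the first, second and third operands. A small sketch of that derivation; it reproduces the constant used here and is not part of the source:

#include <stdint.h>
#include <stdio.h>

int main()
{
    uint8_t imm = 0;
    for ( int i = 0; i < 8; i++ )
    {
        int a = ( i >> 2 ) & 1;               /* first operand bit  (x3) */
        int b = ( i >> 1 ) & 1;               /* second operand bit (x1) */
        int c =   i        & 1;               /* third operand bit  (x2) */
        int f = ( !a ) ^ ( ( !b ) & c );      /* ~x3 ^ (~x1 & x2) */
        imm |= (uint8_t)( f << i );
    }
    printf( "0x%02x\n", imm );                /* prints 0x2d */
    return 0;
}

The same procedure gives 0x96 for a three-way XOR (mm512_xor3) and can be used to check any of the xorand/xoror style helpers against their intended boolean expressions.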

View File

@@ -72,11 +72,11 @@ static const uint64_t RC[] = {
// Targeted macros, keccak-macros.h is included for each target.
#define DECL64(x) __m512i x
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR64 XOR
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define NOT64(d, s) (d = mm512_not( s ) )
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
@@ -257,14 +257,14 @@ keccak512_8way_close(void *cc, void *dst)
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
} while (0)
#define DECL64(x) __m256i x
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define DECL64(x) __m256i x
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = mm256_not( s ) )
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))

View File

@@ -62,186 +62,66 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
#define cns4w(i) m512_const1_128( ( (__m128i*)CNS_INIT)[i] )
#define ADD_CONSTANT4W(a,b,c0,c1)\
a = _mm512_xor_si512(a,c0);\
b = _mm512_xor_si512(b,c1);
#define ADD_CONSTANT4W( a, b, c0, c1 ) \
a = _mm512_xor_si512( a, c0 ); \
b = _mm512_xor_si512( b, c1 );
#define MULT24W( a0, a1 ) \
do { \
{ \
__m512i b = _mm512_xor_si512( a0, \
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \
a0 = _mm512_or_si512( _mm512_bsrli_epi128( b, 4 ), \
_mm512_bslli_epi128( a1,12 ) ); \
a1 = _mm512_or_si512( _mm512_bsrli_epi128( a1, 4 ), \
_mm512_bslli_epi128( b,12 ) ); \
} while(0)
a0 = _mm512_alignr_epi8( a1, b, 4 ); \
a1 = _mm512_alignr_epi8( b, a1, 4 ); \
}
/*
#define MULT24W( a0, a1, mask ) \
do { \
__m512i b = _mm512_xor_si512( a0, \
_mm512_shuffle_epi32( _mm512_and_si512(a1,mask), 16 ) ); \
a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\
a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\
} while(0)
*/
// confirm pointer arithmetic
// ok but use array indexes
#define STEP_PART4W(x,c0,c1,t)\
SUBCRUMB4W(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB4W(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD4W(*x,*(x+4),*t,*(t+1));\
MIXWORD4W(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD4W(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT4W(*x, *(x+4), c0, c1);
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
t = a0;\
#define SUBCRUMB4W( a0, a1, a2, a3 ) \
{ \
__m512i t = a0; \
a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512(a2,a3);\
a2 = _mm512_xor_si512( a2, a3 ); \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0);\
a1 = _mm512_or_si512(a1,a3);\
a3 = _mm512_xor_si512(a3,a2);\
t = _mm512_xor_si512(t,a1);\
a2 = _mm512_and_si512(a2,a1);\
a1 = mm512_xnor(a1,a0);\
a0 = t;
a2 = mm512_xorand( a1, a2, a0); \
a1 = _mm512_or_si512( a1, a3 ); \
a3 = _mm512_xor_si512( a3, a2 ); \
t = _mm512_xor_si512( t, a1 ); \
a2 = _mm512_and_si512( a2, a1 ); \
a1 = mm512_xnor( a1, a0 ); \
a0 = t; \
}
/*
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
t = _mm512_load_si512(&a0);\
a0 = _mm512_or_si512(a0,a1);\
a2 = _mm512_xor_si512(a2,a3);\
a1 = _mm512_andnot_si512(a1, m512_neg1 );\
a0 = _mm512_xor_si512(a0,a3);\
a3 = _mm512_and_si512(a3,t);\
a1 = _mm512_xor_si512(a1,a3);\
a3 = _mm512_xor_si512(a3,a2);\
a2 = _mm512_and_si512(a2,a0);\
a0 = _mm512_andnot_si512(a0, m512_neg1 );\
a2 = _mm512_xor_si512(a2,a1);\
a1 = _mm512_or_si512(a1,a3);\
t = _mm512_xor_si512(t,a1);\
a3 = _mm512_xor_si512(a3,a2);\
a2 = _mm512_and_si512(a2,a1);\
a1 = _mm512_xor_si512(a1,a0);\
a0 = _mm512_load_si512(&t);
*/
#define MIXWORD4W( a, b ) \
b = _mm512_xor_si512( a, b ); \
a = _mm512_xor_si512( b, _mm512_rol_epi32( a, 2 ) ); \
b = _mm512_xor_si512( a, _mm512_rol_epi32( b, 14 ) ); \
a = _mm512_xor_si512( b, _mm512_rol_epi32( a, 10 ) ); \
b = _mm512_rol_epi32( b, 1 );
#define MIXWORD4W(a,b,t1,t2)\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,2);\
t2 = _mm512_srli_epi32(a,30);\
a = mm512_xoror( b, t1, t2 ); \
t1 = _mm512_slli_epi32(b,14);\
t2 = _mm512_srli_epi32(b,18);\
b = _mm512_or_si512(t1,t2);\
b = mm512_xoror( a, t1, t2 ); \
t1 = _mm512_slli_epi32(a,10);\
t2 = _mm512_srli_epi32(a,22);\
a = mm512_xoror( b, t1, t2 ); \
t1 = _mm512_slli_epi32(b,1);\
t2 = _mm512_srli_epi32(b,31);\
b = _mm512_or_si512(t1,t2);
#define STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
SUBCRUMB4W( x0, x1, x2, x3 ); \
SUBCRUMB4W( x5, x6, x7, x4 ); \
MIXWORD4W( x0, x4 ); \
MIXWORD4W( x1, x5 ); \
MIXWORD4W( x2, x6 ); \
MIXWORD4W( x3, x7 ); \
ADD_CONSTANT4W( x0, x4, c0, c1 );
/*
#define MIXWORD4W(a,b,t1,t2)\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,2);\
t2 = _mm512_srli_epi32(a,30);\
a = _mm512_or_si512(t1,t2);\
a = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(b,14);\
t2 = _mm512_srli_epi32(b,18);\
b = _mm512_or_si512(t1,t2);\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,10);\
t2 = _mm512_srli_epi32(a,22);\
a = _mm512_or_si512(t1,t2);\
a = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(b,1);\
t2 = _mm512_srli_epi32(b,31);\
b = _mm512_or_si512(t1,t2);
*/
#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm512_shuffle_epi32(a1,147);\
t0 = _mm512_load_si512(&a1);\
a1 = _mm512_unpacklo_epi32(a1,a0);\
t0 = _mm512_unpackhi_epi32(t0,a0);\
t1 = _mm512_shuffle_epi32(t0,78);\
a0 = _mm512_shuffle_epi32(a1,78);\
SUBCRUMB4W(t1,t0,a0,a1,tmp0);\
t0 = _mm512_unpacklo_epi32(t0,t1);\
a1 = _mm512_unpacklo_epi32(a1,a0);\
a0 = _mm512_load_si512(&a1);\
a0 = _mm512_unpackhi_epi64(a0,t0);\
a1 = _mm512_unpacklo_epi64(a1,t0);\
a1 = _mm512_shuffle_epi32(a1,57);\
MIXWORD4W(a0,a1,tmp0,tmp1);\
ADD_CONSTANT4W(a0,a1,c0,c1);
#define NMLTOM7684W(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm512_load_si512(&r1);\
q2 = _mm512_load_si512(&p1);\
r2 = _mm512_shuffle_epi32(r2,216);\
p2 = _mm512_shuffle_epi32(p2,216);\
r1 = _mm512_unpacklo_epi32(r1,r0);\
p1 = _mm512_unpacklo_epi32(p1,p0);\
s2 = _mm512_unpackhi_epi32(s2,r0);\
q2 = _mm512_unpackhi_epi32(q2,p0);\
s0 = _mm512_load_si512(&r2);\
q0 = _mm512_load_si512(&p2);\
r2 = _mm512_unpacklo_epi64(r2,r1);\
p2 = _mm512_unpacklo_epi64(p2,p1);\
s1 = _mm512_load_si512(&s0);\
q1 = _mm512_load_si512(&q0);\
s0 = _mm512_unpackhi_epi64(s0,r1);\
q0 = _mm512_unpackhi_epi64(q0,p1);\
r2 = _mm512_shuffle_epi32(r2,225);\
p2 = _mm512_shuffle_epi32(p2,225);\
r0 = _mm512_load_si512(&s1);\
p0 = _mm512_load_si512(&q1);\
s0 = _mm512_shuffle_epi32(s0,225);\
q0 = _mm512_shuffle_epi32(q0,225);\
s1 = _mm512_unpacklo_epi64(s1,s2);\
q1 = _mm512_unpacklo_epi64(q1,q2);\
r0 = _mm512_unpackhi_epi64(r0,s2);\
p0 = _mm512_unpackhi_epi64(p0,q2);\
s2 = _mm512_load_si512(&r0);\
q2 = _mm512_load_si512(&p0);\
s3 = _mm512_load_si512(&r2);\
q3 = _mm512_load_si512(&p2);
#define MIXTON7684W(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm512_load_si512(&r0);\
q0 = _mm512_load_si512(&p0);\
s1 = _mm512_load_si512(&r2);\
q1 = _mm512_load_si512(&p2);\
r0 = _mm512_unpackhi_epi32(r0,r1);\
p0 = _mm512_unpackhi_epi32(p0,p1);\
r2 = _mm512_unpackhi_epi32(r2,r3);\
p2 = _mm512_unpackhi_epi32(p2,p3);\
s0 = _mm512_unpacklo_epi32(s0,r1);\
q0 = _mm512_unpacklo_epi32(q0,p1);\
s1 = _mm512_unpacklo_epi32(s1,r3);\
q1 = _mm512_unpacklo_epi32(q1,p3);\
r1 = _mm512_load_si512(&r0);\
p1 = _mm512_load_si512(&p0);\
r0 = _mm512_unpackhi_epi64(r0,r2);\
p0 = _mm512_unpackhi_epi64(p0,p2);\
s0 = _mm512_unpackhi_epi64(s0,s1);\
q0 = _mm512_unpackhi_epi64(q0,q1);\
r1 = _mm512_unpacklo_epi64(r1,r2);\
p1 = _mm512_unpacklo_epi64(p1,p2);\
s2 = _mm512_load_si512(&r0);\
q2 = _mm512_load_si512(&p0);\
s1 = _mm512_load_si512(&r1);\
q1 = _mm512_load_si512(&p1);
#define STEP_PART24W( a0, a1, t0, t1, c0, c1 ) \
a1 = _mm512_shuffle_epi32( a1, 147 ); \
t0 = _mm512_load_si512( &a1 ); \
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
t0 = _mm512_unpackhi_epi32( t0, a0 ); \
t1 = _mm512_shuffle_epi32( t0, 78 ); \
a0 = _mm512_shuffle_epi32( a1, 78 ); \
SUBCRUMB4W( t1, t0, a0, a1 ); \
t0 = _mm512_unpacklo_epi32( t0, t1 ); \
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
a0 = _mm512_load_si512( &a1 ); \
a0 = _mm512_unpackhi_epi64( a0, t0 ); \
a1 = _mm512_unpacklo_epi64( a1, t0 ); \
a1 = _mm512_shuffle_epi32( a1, 57 ); \
MIXWORD4W( a0, a1 ); \
ADD_CONSTANT4W( a0, a1, c0, c1 );
#define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm512_load_si512(&r3);\
@@ -279,8 +159,7 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
__m512i t0, t1;
__m512i *chainv = state->chainv;
__m512i msg0, msg1;
__m512i tmp[2];
__m512i x[8];
__m512i x0, x1, x2, x3, x4, x5, x6, x7;
t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] );
@@ -372,42 +251,30 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
chainv[9] = _mm512_rol_epi32( chainv[9], 4 );
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
x[0], x[1], x[2], x[3],
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
STEP_PART4W( &x[0], cns4w( 0), cns4w( 1), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 2), cns4w( 3), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 4), cns4w( 5), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 6), cns4w( 7), &tmp[0] );
STEP_PART4W( &x[0], cns4w( 8), cns4w( 9), &tmp[0] );
STEP_PART4W( &x[0], cns4w(10), cns4w(11), &tmp[0] );
STEP_PART4W( &x[0], cns4w(12), cns4w(13), &tmp[0] );
STEP_PART4W( &x[0], cns4w(14), cns4w(15), &tmp[0] );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 0), cns4w( 1) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 2), cns4w( 3) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 4), cns4w( 5) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 6), cns4w( 7) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 8), cns4w( 9) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(10), cns4w(11) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(12), cns4w(13) );
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(14), cns4w(15) );
MIXTON10244W( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
x[4], x[5], x[6], x[7],
chainv[1],chainv[3],chainv[5],chainv[7]);
MIXTON10244W( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7] );
/* Process last 256-bit block */
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31),
tmp[0], tmp[1] );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29) );
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31) );
}
void finalization512_4way( luffa_4way_context *state, uint32 *b )
@@ -683,138 +550,85 @@ int luffa_4way_update_close( luffa_4way_context *state,
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
#define ADD_CONSTANT(a,b,c0,c1)\
a = _mm256_xor_si256(a,c0);\
b = _mm256_xor_si256(b,c1);
#define ADD_CONSTANT( a, b, c0, c1 ) \
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
#define MULT2( a0, a1, mask ) \
do { \
__m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
#define MULT2( a0, a1 ) \
{ \
__m256i b = _mm256_xor_si256( a0, _mm256_shuffle_epi32( \
_mm256_blend_epi32( a1, m256_zero, 0xee ), 16 ) ); \
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
}
#define STEP_PART(x,c0,c1,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD(*x,*(x+4),*t,*(t+1));\
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT(*x, *(x+4), c0, c1);
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m256i t = a0; \
a0 = _mm256_or_si256( a0, a1 ); \
a2 = _mm256_xor_si256( a2, a3 ); \
a1 = mm256_not( a1 ); \
a0 = _mm256_xor_si256( a0, a3 ); \
a3 = _mm256_and_si256( a3, t ); \
a1 = _mm256_xor_si256( a1, a3 ); \
a3 = _mm256_xor_si256( a3, a2 ); \
a2 = _mm256_and_si256( a2, a0 ); \
a0 = mm256_not( a0 ); \
a2 = _mm256_xor_si256( a2, a1 ); \
a1 = _mm256_or_si256( a1, a3 ); \
t = _mm256_xor_si256( t, a1 ); \
a3 = _mm256_xor_si256( a3, a2 ); \
a2 = _mm256_and_si256( a2, a1 ); \
a1 = _mm256_xor_si256( a1, a0 ); \
a0 = t; \
}
#define SUBCRUMB(a0,a1,a2,a3,t)\
t = a0;\
a0 = _mm256_or_si256(a0,a1);\
a2 = _mm256_xor_si256(a2,a3);\
a1 = mm256_not( a1 );\
a0 = _mm256_xor_si256(a0,a3);\
a3 = _mm256_and_si256(a3,t);\
a1 = _mm256_xor_si256(a1,a3);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a0);\
a0 = mm256_not( a0 );\
a2 = _mm256_xor_si256(a2,a1);\
a1 = _mm256_or_si256(a1,a3);\
t = _mm256_xor_si256(t,a1);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a1);\
a1 = _mm256_xor_si256(a1,a0);\
a0 = t;\
#define MIXWORD( a, b ) \
{ \
__m256i t1, t2; \
b = _mm256_xor_si256( a,b ); \
t1 = _mm256_slli_epi32( a, 2 ); \
t2 = _mm256_srli_epi32( a, 30 ); \
a = _mm256_or_si256( t1, t2 ); \
a = _mm256_xor_si256( a, b ); \
t1 = _mm256_slli_epi32( b, 14 ); \
t2 = _mm256_srli_epi32( b, 18 ); \
b = _mm256_or_si256( t1, t2 ); \
b = _mm256_xor_si256( a, b ); \
t1 = _mm256_slli_epi32( a, 10 ); \
t2 = _mm256_srli_epi32( a, 22 ); \
a = _mm256_or_si256( t1,t2 ); \
a = _mm256_xor_si256( a,b ); \
t1 = _mm256_slli_epi32( b,1 ); \
t2 = _mm256_srli_epi32( b,31 ); \
b = _mm256_or_si256( t1, t2 ); \
}
#define MIXWORD(a,b,t1,t2)\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,2);\
t2 = _mm256_srli_epi32(a,30);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,14);\
t2 = _mm256_srli_epi32(b,18);\
b = _mm256_or_si256(t1,t2);\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,10);\
t2 = _mm256_srli_epi32(a,22);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,1);\
t2 = _mm256_srli_epi32(b,31);\
b = _mm256_or_si256(t1,t2);
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
SUBCRUMB( x0, x1, x2, x3 ); \
SUBCRUMB( x5, x6, x7, x4 ); \
MIXWORD( x0, x4 ); \
MIXWORD( x1, x5 ); \
MIXWORD( x2, x6 ); \
MIXWORD( x3, x7 ); \
ADD_CONSTANT( x0, x4, c0, c1 );
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm256_shuffle_epi32(a1,147);\
t0 = _mm256_load_si256(&a1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
t0 = _mm256_unpackhi_epi32(t0,a0);\
t1 = _mm256_shuffle_epi32(t0,78);\
a0 = _mm256_shuffle_epi32(a1,78);\
SUBCRUMB(t1,t0,a0,a1,tmp0);\
t0 = _mm256_unpacklo_epi32(t0,t1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
a0 = _mm256_load_si256(&a1);\
a0 = _mm256_unpackhi_epi64(a0,t0);\
a1 = _mm256_unpacklo_epi64(a1,t0);\
a1 = _mm256_shuffle_epi32(a1,57);\
MIXWORD(a0,a1,tmp0,tmp1);\
ADD_CONSTANT(a0,a1,c0,c1);
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm256_load_si256(&r1);\
q2 = _mm256_load_si256(&p1);\
r2 = _mm256_shuffle_epi32(r2,216);\
p2 = _mm256_shuffle_epi32(p2,216);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
s2 = _mm256_unpackhi_epi32(s2,r0);\
q2 = _mm256_unpackhi_epi32(q2,p0);\
s0 = _mm256_load_si256(&r2);\
q0 = _mm256_load_si256(&p2);\
r2 = _mm256_unpacklo_epi64(r2,r1);\
p2 = _mm256_unpacklo_epi64(p2,p1);\
s1 = _mm256_load_si256(&s0);\
q1 = _mm256_load_si256(&q0);\
s0 = _mm256_unpackhi_epi64(s0,r1);\
q0 = _mm256_unpackhi_epi64(q0,p1);\
r2 = _mm256_shuffle_epi32(r2,225);\
p2 = _mm256_shuffle_epi32(p2,225);\
r0 = _mm256_load_si256(&s1);\
p0 = _mm256_load_si256(&q1);\
s0 = _mm256_shuffle_epi32(s0,225);\
q0 = _mm256_shuffle_epi32(q0,225);\
s1 = _mm256_unpacklo_epi64(s1,s2);\
q1 = _mm256_unpacklo_epi64(q1,q2);\
r0 = _mm256_unpackhi_epi64(r0,s2);\
p0 = _mm256_unpackhi_epi64(p0,q2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s3 = _mm256_load_si256(&r2);\
q3 = _mm256_load_si256(&p2);
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm256_load_si256(&r0);\
q0 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r2);\
q1 = _mm256_load_si256(&p2);\
r0 = _mm256_unpackhi_epi32(r0,r1);\
p0 = _mm256_unpackhi_epi32(p0,p1);\
r2 = _mm256_unpackhi_epi32(r2,r3);\
p2 = _mm256_unpackhi_epi32(p2,p3);\
s0 = _mm256_unpacklo_epi32(s0,r1);\
q0 = _mm256_unpacklo_epi32(q0,p1);\
s1 = _mm256_unpacklo_epi32(s1,r3);\
q1 = _mm256_unpacklo_epi32(q1,p3);\
r1 = _mm256_load_si256(&r0);\
p1 = _mm256_load_si256(&p0);\
r0 = _mm256_unpackhi_epi64(r0,r2);\
p0 = _mm256_unpackhi_epi64(p0,p2);\
s0 = _mm256_unpackhi_epi64(s0,s1);\
q0 = _mm256_unpackhi_epi64(q0,q1);\
r1 = _mm256_unpacklo_epi64(r1,r2);\
p1 = _mm256_unpacklo_epi64(p1,p2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r1);\
q1 = _mm256_load_si256(&p1);\
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
a1 = _mm256_shuffle_epi32( a1, 147); \
t0 = _mm256_load_si256( &a1 ); \
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
t0 = _mm256_unpackhi_epi32( t0, a0 ); \
t1 = _mm256_shuffle_epi32( t0, 78 ); \
a0 = _mm256_shuffle_epi32( a1, 78 ); \
SUBCRUMB( t1, t0, a0, a1 );\
t0 = _mm256_unpacklo_epi32( t0, t1 ); \
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
a0 = _mm256_load_si256( &a1 ); \
a0 = _mm256_unpackhi_epi64( a0, t0 ); \
a1 = _mm256_unpacklo_epi64( a1, t0 ); \
a1 = _mm256_shuffle_epi32( a1, 57 ); \
MIXWORD( a0, a1 ); \
ADD_CONSTANT( a0, a1, c0, c1 );
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm256_load_si256(&r3);\
@@ -857,9 +671,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
__m256i t0, t1;
__m256i *chainv = state->chainv;
__m256i msg0, msg1;
__m256i tmp[2];
__m256i x[8];
const __m256i MASK = m256_const1_i128( 0x00000000ffffffff );
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
t0 = chainv[0];
t1 = chainv[1];
@@ -873,7 +685,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] );
MULT2( t0, t1, MASK );
MULT2( t0, t1 );
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
@@ -892,108 +704,96 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1], MASK );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3], MASK );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5], MASK );
MULT2( chainv[4], chainv[5] );
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7], MASK );
MULT2( chainv[6], chainv[7] );
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9], MASK );
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
MULT2( chainv[8], chainv[9], MASK );
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7], MASK );
MULT2( chainv[6], chainv[7] );
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5], MASK );
MULT2( chainv[4], chainv[5] );
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3], MASK );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1], MASK );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[3] = mm256_rol_32( chainv[3], 1 );
chainv[5] = mm256_rol_32( chainv[5], 2 );
chainv[7] = mm256_rol_32( chainv[7], 3 );
chainv[9] = mm256_rol_32( chainv[9], 4 );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
x[0], x[1], x[2], x[3],
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 0), cns( 1) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 2), cns( 3) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 4), cns( 5) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 6), cns( 7) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 8), cns( 9) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(10), cns(11) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(12), cns(13) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(14), cns(15) );
MIXTON1024( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
x[4], x[5], x[6], x[7],
chainv[1],chainv[3],chainv[5],chainv[7]);
MIXTON1024( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7]);
/* Process last 256-bit block */
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31) );
}
/***************************************************/

View File

@@ -19,29 +19,37 @@
*/
#include <string.h>
#include <emmintrin.h>
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#if defined(__AVX512VL__)
#define MULT2( a0, a1 ) \
{ \
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi32( a1, b, 1 ); \
a1 = _mm_alignr_epi32( b, a1, 1 ); \
}
#elif defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0)
/*
static inline __m256i mult2_avx2( a )
{
__m128 a0, a0, b;
a0 = mm128_extractlo_256( a );
a1 = mm128_extracthi_256( a );
b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) );
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) );
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );
return mm256_concat_128( a1, a0 );
}
*/
#else
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif
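MULT2 is, as its name suggests, Luffa's multiplication-by-2 step in the message injection; mechanically it builds a feedback word from a1's low lane and then moves the two 128-bit registers along by one 32-bit word. The SSE4.1 and AVX-512 variants now express that word shift with alignr instead of an or of two byte shifts. A minimal check of the equivalence at the 128-bit level (values arbitrary, SSSE3 assumed for _mm_alignr_epi8):

#include <stdio.h>
#include <tmmintrin.h>

int main()
{
    __m128i lo = _mm_set_epi32( 4, 3, 2, 1 );
    __m128i hi = _mm_set_epi32( 8, 7, 6, 5 );
    /* bytes 4..19 of the concatenation (hi:lo) ... */
    __m128i a  = _mm_alignr_epi8( hi, lo, 4 );
    /* ... are exactly srli(lo,4) | slli(hi,12), the form being replaced */
    __m128i b  = _mm_or_si128( _mm_srli_si128( lo, 4 ), _mm_slli_si128( hi, 12 ) );
    printf( "%s\n", _mm_movemask_epi8( _mm_cmpeq_epi8( a, b ) ) == 0xffff ? "equal" : "differ" );
    return 0;
}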
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
@@ -73,13 +81,13 @@ static inline __m256i mult2_avx2( a )
t = _mm_load_si128(&a0);\
a0 = _mm_or_si128(a0,a1);\
a2 = _mm_xor_si128(a2,a3);\
a1 = _mm_andnot_si128(a1,ALLONE);\
a1 = mm128_not( a1 );\
a0 = _mm_xor_si128(a0,a3);\
a3 = _mm_and_si128(a3,t);\
a1 = _mm_xor_si128(a1,a3);\
a3 = _mm_xor_si128(a3,a2);\
a2 = _mm_and_si128(a2,a0);\
a0 = _mm_andnot_si128(a0,ALLONE);\
a0 = mm128_not( a0 );\
a2 = _mm_xor_si128(a2,a1);\
a1 = _mm_or_si128(a1,a3);\
t = _mm_xor_si128(t,a1);\
@@ -255,17 +263,18 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
__m128i CNS128[32];
__m128i ALLONE;
#if !defined(__SSE4_1__)
__m128i MASK;
#endif
HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
{
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
@@ -365,10 +374,10 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
// Optimized for integral multiples of 16 bytes, good for 64 and 80 byte lengths
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );

View File

@@ -230,25 +230,13 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces, add padding.
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
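block_buf[3] is the only word that differs between lanes: lane i carries nonce n+i, so a single 16-way Blake-256 pass covers sixteen candidate headers. A small sketch of that lane layout (AVX-512F assumed; the helper names above are from the source, the snippet itself is illustrative only):

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

int main()
{
    uint32_t n = 0x1000;   /* arbitrary starting nonce */
    __m512i v = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                                  n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
    uint32_t lane[16];
    _mm512_storeu_si512( lane, v );
    for ( int i = 0; i < 16; i++ )
        printf( "lane %2d -> nonce 0x%08x\n", i, lane[i] );   /* lane i gets n + i */
    return 0;
}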
@@ -425,24 +413,12 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

View File

@@ -75,7 +75,7 @@ void lyra2rev2_16way_hash( void *state, const void *input )
keccak256_8way_close( &ctx.keccak, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash5, vhash, 256 );
hash12, hash13, hash14, hash15, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );

View File

@@ -120,25 +120,13 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
@@ -240,24 +228,12 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

View File

@@ -3,7 +3,7 @@
#include "lyra2.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z330_wholeMatrix;
static __thread uint64_t* lyra2z330_wholeMatrix;
void lyra2z330_hash(void *state, const void *input, uint32_t height)
{

View File

@@ -97,11 +97,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );
@@ -137,23 +137,34 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \
d = mm128_ror_64( _mm_xor_si128( d, a), 32 ); \
d = mm128_swap64_32( _mm_xor_si128( d, a) ); \
c = _mm_add_epi64( c, d ); \
b = mm128_ror_64( _mm_xor_si128( b, c ), 24 ); \
b = mm128_shuflr64_24( _mm_xor_si128( b, c ) ); \
a = _mm_add_epi64( a, b ); \
d = mm128_ror_64( _mm_xor_si128( d, a ), 16 ); \
d = mm128_shuflr64_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi64( c, d ); \
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
__m128i t; \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_vrol256_64( s6, s7 ); \
mm128_vror256_64( s2, s3 ); \
t = mm128_alignr_64( s7, s6, 1 ); \
s6 = mm128_alignr_64( s6, s7, 1 ); \
s7 = t; \
t = mm128_alignr_64( s2, s3, 1 ); \
s2 = mm128_alignr_64( s3, s2, 1 ); \
s3 = t; \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
mm128_vror256_64( s6, s7 ); \
mm128_vrol256_64( s2, s3 );
t = mm128_alignr_64( s6, s7, 1 ); \
s6 = mm128_alignr_64( s7, s6, 1 ); \
s7 = t; \
t = mm128_alignr_64( s3, s2, 1 ); \
s2 = mm128_alignr_64( s2, s3, 1 ); \
s3 = t; \
}
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

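The rewrite above replaces generic 64-bit rotates by 32, 24 and 16 with dedicated shuffle macros (mm128_swap64_32, mm128_shuflr64_24, mm128_shuflr64_16); those rotate amounts move whole 32-bit words or whole bytes, so a single pshufd or pshufb suffices. A stand-alone sketch of the equivalence using plain SSE/SSSE3 intrinsics (the project's macro implementations are assumed to be equivalent to these); compile with -mssse3.
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
/* rotate each 64-bit lane right by 32 bits == swap its two 32-bit halves */
static inline __m128i ror64x2_32( __m128i x )
{ return _mm_shuffle_epi32( x, _MM_SHUFFLE(2,3,0,1) ); }
/* rotate each 64-bit lane right by 16 bits == rotate its bytes right by 2 */
static inline __m128i ror64x2_16( __m128i x )
{
   const __m128i m = _mm_set_epi8(  9,  8, 15, 14, 13, 12, 11, 10,
                                    1,  0,  7,  6,  5,  4,  3,  2 );
   return _mm_shuffle_epi8( x, m );
}
/* rotate each 64-bit lane right by 24 bits == rotate its bytes right by 3 */
static inline __m128i ror64x2_24( __m128i x )
{
   const __m128i m = _mm_set_epi8( 10,  9,  8, 15, 14, 13, 12, 11,
                                    2,  1,  0,  7,  6,  5,  4,  3 );
   return _mm_shuffle_epi8( x, m );
}
static uint64_t rotr64( uint64_t w, unsigned c )
{ return ( w >> c ) | ( w << ( 64 - c ) ); }
int main(void)
{
   uint64_t v[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
   uint64_t r[2];
   _mm_storeu_si128( (__m128i*)r, ror64x2_24( _mm_loadu_si128( (__m128i*)v ) ) );
   printf( "%d\n", r[0] == rotr64( v[0], 24 ) && r[1] == rotr64( v[1], 24 ) );
   return 0;
}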
View File

@@ -15,7 +15,8 @@
#if defined (ANIME_8WAY)
typedef struct {
union _anime_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
#if defined(__VAES__)
@@ -26,23 +27,9 @@ typedef struct {
jh512_8way_context jh;
skein512_8way_context skein;
keccak512_8way_context keccak;
} anime_8way_ctx_holder;
} __attribute__ ((aligned (64)));
anime_8way_ctx_holder anime_8way_ctx __attribute__ ((aligned (64)));
void init_anime_8way_ctx()
{
blake512_8way_init( &anime_8way_ctx.blake );
bmw512_8way_init( &anime_8way_ctx.bmw );
#if defined(__VAES__)
groestl512_4way_init( &anime_8way_ctx.groestl, 64 );
#else
init_groestl( &anime_8way_ctx.groestl, 64 );
#endif
skein512_8way_init( &anime_8way_ctx.skein );
jh512_8way_init( &anime_8way_ctx.jh );
keccak512_8way_init( &anime_8way_ctx.keccak );
}
typedef union _anime_8way_context_overlay anime_8way_context_overlay;
void anime_8way_hash( void *state, const void *input )
{
@@ -65,17 +52,14 @@ void anime_8way_hash( void *state, const void *input )
__m512i* vhB = (__m512i*)vhashB;
__m512i* vhC = (__m512i*)vhashC;
const __m512i bit3_mask = m512_const1_64( 8 );
const __m512i zero = _mm512_setzero_si512();
__mmask8 vh_mask;
anime_8way_ctx_holder ctx;
memcpy( &ctx, &anime_8way_ctx, sizeof(anime_8way_ctx) );
anime_8way_context_overlay ctx __attribute__ ((aligned (64)));
bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
#if defined(__VAES__)
@@ -152,8 +136,7 @@ void anime_8way_hash( void *state, const void *input )
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
if ( ( vh_mask & 0xff ) != 0xff )
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
@@ -168,8 +151,7 @@ void anime_8way_hash( void *state, const void *input )
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
if ( ( vh_mask & 0xff ) != 0xff )
{
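The recurring substitution in these hunks replaces the and-then-compare-with-zero idiom with _mm512_testn_epi64_mask, which directly returns a lane mask whose bit i is set when (v[i] & m[i]) == 0; that mask then selects, per lane, which hash runs next. A short stand-alone check of the equivalence (requires AVX-512F to build and run).
#include <immintrin.h>
#include <stdio.h>
int main(void)
{
   const __m512i bit3 = _mm512_set1_epi64( 8 );
   const __m512i v    = _mm512_set_epi64( 8, 0, 9, 1, 15, 7, 24, 16 );
   __mmask8 old_way = _mm512_cmpeq_epi64_mask(
                 _mm512_and_si512( v, bit3 ), _mm512_setzero_si512() );
   __mmask8 new_way = _mm512_testn_epi64_mask( v, bit3 );
   /* both masks flag the lanes whose bit 3 is clear; the values should match */
   printf( "%02x %02x\n", (unsigned)old_way, (unsigned)new_way );
   return 0;
}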
@@ -237,14 +219,20 @@ int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
#elif defined (ANIME_4WAY)
typedef struct {
union _anime_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
jh512_4way_context jh;
skein512_4way_context skein;
keccak512_4way_context keccak;
} anime_4way_ctx_holder;
#if defined(__VAES__)
groestl512_2way_context groestl2;
#endif
} __attribute__ ((aligned (64)));
typedef union _anime_4way_context_overlay anime_4way_context_overlay;
void anime_4way_hash( void *state, const void *input )
{
@@ -262,7 +250,7 @@ void anime_4way_hash( void *state, const void *input )
int h_mask;
const __m256i bit3_mask = m256_const1_64( 8 );
const __m256i zero = _mm256_setzero_si256();
anime_4way_ctx_holder ctx;
anime_4way_context_overlay ctx __attribute__ ((aligned (64)));
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, input, 80 );
@@ -293,7 +281,18 @@ void anime_4way_hash( void *state, const void *input )
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
@@ -302,6 +301,8 @@ void anime_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
#endif
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );

View File

@@ -13,6 +13,7 @@
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -98,8 +99,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
// A
#if defined(__VAES__)
@@ -154,8 +154,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
// A
if ( ( vh_mask & 0xff ) != 0xff )
@@ -174,8 +173,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
if ( likely( ( vh_mask & 0xff ) != 0xff ) )
{
@@ -223,8 +221,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
// 4x32 for haval
@@ -302,8 +299,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
// A
#if defined(__VAES__)
@@ -374,8 +370,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
// A
#if defined(__VAES__)
@@ -455,8 +450,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
if ( hash0[0] & mask )
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
@@ -520,8 +514,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
m512_zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -625,6 +618,7 @@ union _hmq1725_4way_context_overlay
cube_2way_context cube2;
sph_shavite512_context shavite;
hashState_sd sd;
shavite512_2way_context shavite2;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
@@ -633,6 +627,10 @@ union _hmq1725_4way_context_overlay
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
#if defined(__VAES__)
groestl512_2way_context groestl2;
echo_2way_context echo2;
#endif
} __attribute__ ((aligned (64)));
typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
@@ -750,15 +748,10 @@ extern void hmq1725_4way_hash(void *state, const void *input)
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
shavite512_full( &ctx.shavite, hash0, hash0, 64 );
shavite512_full( &ctx.shavite, hash1, hash1, 64 );
shavite512_full( &ctx.shavite, hash2, hash2, 64 );
shavite512_full( &ctx.shavite, hash3, hash3, 64 );
intrlv_2x128_512( vhashA, hash0, hash1 );
intrlv_2x128_512( vhashB, hash2, hash3 );
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
@@ -795,6 +788,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
echo_2way_full( &ctx.echo2, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo2, vhashB, 512, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
@@ -807,7 +811,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
(const BitSequence *)hash3, 64 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
#endif
blake512_4way_full( &ctx.blake, vhash, vhash, 64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -939,6 +945,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -948,6 +965,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
#endif
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );

View File

@@ -68,7 +68,6 @@ void quark_8way_hash( void *state, const void *input )
quark_8way_ctx_holder ctx;
const uint32_t mask = 8;
const __m512i bit3_mask = m512_const1_64( mask );
const __m512i zero = _mm512_setzero_si512();
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
@@ -76,9 +75,7 @@ void quark_8way_hash( void *state, const void *input )
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
#if defined(__VAES__)
@@ -154,8 +151,7 @@ void quark_8way_hash( void *state, const void *input )
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
if ( ( vh_mask & 0xff ) != 0xff )
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
@@ -169,8 +165,7 @@ void quark_8way_hash( void *state, const void *input )
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
if ( ( vh_mask & 0xff ) != 0xff )
{

File diff suppressed because it is too large

View File

@@ -1,186 +0,0 @@
/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */
/**
* RadioGatun interface.
*
* RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters
* and G. Van Assche, "RadioGatun, a belt-and-mill hash function",
* presented at the Second Cryptographic Hash Workshop, Santa Barbara,
* August 24-25, 2006. The main Web site, containing that article, the
* reference code and some test vectors, appears to be currently located
* at the following URL: http://radiogatun.noekeon.org/
*
* The presentation article does not specify endianness or padding. The
* reference code uses the following conventions, which we also apply
* here:
* <ul>
* <li>The input message is an integral number of sequences of three
* words. Each word is either a 32-bit or 64-bit word (depending on
* the version of RadioGatun).</li>
* <li>Input bytes are decoded into words using little-endian
* convention.</li>
* <li>Padding consists of a single bit of value 1, using little-endian
* convention within bytes (i.e. for a byte-oriented input, a single
* byte of value 0x01 is appended), then enough bits of value 0 to finish
* the current block.</li>
* <li>Output consists of 256 bits. Successive output words are encoded
* with little-endian convention.</li>
* </ul>
* These conventions are very close to those we use for PANAMA, which is
* a close ancestor of RadioGatun.
*
* RadioGatun is actually a family of functions, depending on some
* internal parameters. We implement here two functions, with a "belt
* length" of 13, a "belt width" of 3, and a "mill length" of 19. The
* RadioGatun[32] version uses 32-bit words, while the RadioGatun[64]
* variant uses 64-bit words.
*
* Strictly speaking, the name "RadioGatun" should use an acute accent
* on the "u", which we omitted here to keep strict ASCII-compatibility
* of this file.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_radiogatun.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_RADIOGATUN_H__
#define SPH_RADIOGATUN_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
/**
* Output size (in bits) for RadioGatun[32].
*/
#define SPH_SIZE_radiogatun32 256
/**
* This structure is a context for RadioGatun[32] computations: it
* contains intermediate values and some data from the last entered
* block. Once a RadioGatun[32] computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running RadioGatun[32]
* computation can be cloned by copying the context (e.g. with a
* simple <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[156]; /* first field, for alignment */
unsigned data_ptr;
sph_u32 a[19], b[39];
#endif
} sph_radiogatun32_context;
/**
* Initialize a RadioGatun[32] context. This process performs no
* memory allocation.
*
* @param cc the RadioGatun[32] context (pointer to a
* <code>sph_radiogatun32_context</code>)
*/
void sph_radiogatun32_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RadioGatun[32] context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_radiogatun32(void *cc, const void *data, size_t len);
/**
* Terminate the current RadioGatun[32] computation and output the
* result into the provided buffer. The destination buffer must be wide
* enough to accommodate the result (32 bytes). The context is
* automatically reinitialized.
*
* @param cc the RadioGatun[32] context
* @param dst the destination buffer
*/
void sph_radiogatun32_close(void *cc, void *dst);
#if SPH_64
/**
* Output size (in bits) for RadioGatun[64].
*/
#define SPH_SIZE_radiogatun64 256
/**
* This structure is a context for RadioGatun[64] computations: it
* contains intermediate values and some data from the last entered
* block. Once a RadioGatun[64] computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running RadioGatun[64]
* computation can be cloned by copying the context (e.g. with a
* simple <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[312]; /* first field, for alignment */
unsigned data_ptr;
sph_u64 a[19], b[39];
#endif
} sph_radiogatun64_context;
/**
* Initialize a RadioGatun[64] context. This process performs no
* memory allocation.
*
* @param cc the RadioGatun[64] context (pointer to a
* <code>sph_radiogatun64_context</code>)
*/
void sph_radiogatun64_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RadioGatun[64] context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_radiogatun64(void *cc, const void *data, size_t len);
/**
* Terminate the current RadioGatun[64] computation and output the
* result into the provided buffer. The destination buffer must be wide
* enough to accommodate the result (32 bytes). The context is
* automatically reinitialized.
*
* @param cc the RadioGatun[64] context
* @param dst the destination buffer
*/
void sph_radiogatun64_close(void *cc, void *dst);
#endif
#endif

View File

@@ -4,24 +4,6 @@
#include <string.h>
#include <stdio.h>
long double lbry_calc_network_diff( struct work *work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
long double d = (long double)0x0000ffff / (long double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;
if (opt_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
return d;
}
// std_le should work but it doesn't
void lbry_le_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
@@ -41,31 +23,6 @@ void lbry_le_build_stratum_request( char *req, struct work *work,
free(xnonce2str);
}
/*
void lbry_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits )
{
int i;
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = version;
if ( have_stratum )
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( prevhash + i );
else
for (i = 0; i < 8; i++)
g_work->data[ 8-i ] = le32dec( prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( merkle_root + i );
g_work->data[ LBRY_NTIME_INDEX ] = ntime;
g_work->data[ LBRY_NBITS_INDEX ] = nbits;
g_work->data[28] = 0x80000000;
}
*/
void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
unsigned char merkle_root[64] = { 0 };
@@ -112,9 +69,7 @@ bool register_lbry_algo( algo_gate_t* gate )
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#endif
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
// gate->build_block_header = (void*)&build_block_header;
gate->build_extraheader = (void*)&lbry_build_extraheader;
gate->ntime_index = LBRY_NTIME_INDEX;
gate->nbits_index = LBRY_NBITS_INDEX;

View File

@@ -830,7 +830,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N )
}
}
// Working, not up to date, needs stream optimization.
// Working, not up to date, needs stream, shuffle optimizations.
// 4x32 interleaving
static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
{
@@ -937,46 +937,28 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
// 4x memory usage
// Working
// 4x128 interleaving
static void salsa_shuffle_4way_simd128( __m512i *X )
static inline void salsa_shuffle_4way_simd128( __m512i *X )
{
__m512i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
Y0 = _mm512_mask_blend_epi32( 0x1111, X[1], X[0] );
Z0 = _mm512_mask_blend_epi32( 0x4444, X[3], X[2] );
Y1 = _mm512_mask_blend_epi32( 0x1111, X[2], X[1] );
Z1 = _mm512_mask_blend_epi32( 0x4444, X[0], X[3] );
Y2 = _mm512_mask_blend_epi32( 0x1111, X[3], X[2] );
Z2 = _mm512_mask_blend_epi32( 0x4444, X[1], X[0] );
Y3 = _mm512_mask_blend_epi32( 0x1111, X[0], X[3] );
Z3 = _mm512_mask_blend_epi32( 0x4444, X[2], X[1] );
X[0] = _mm512_mask_blend_epi32( 0x3333, Z0, Y0 );
X[1] = _mm512_mask_blend_epi32( 0x3333, Z1, Y1 );
X[2] = _mm512_mask_blend_epi32( 0x3333, Z2, Y2 );
X[3] = _mm512_mask_blend_epi32( 0x3333, Z3, Y3 );
__m512i t0 = _mm512_mask_blend_epi32( 0xaaaa, X[0], X[1] );
__m512i t1 = _mm512_mask_blend_epi32( 0x5555, X[0], X[1] );
__m512i t2 = _mm512_mask_blend_epi32( 0xaaaa, X[2], X[3] );
__m512i t3 = _mm512_mask_blend_epi32( 0x5555, X[2], X[3] );
X[0] = _mm512_mask_blend_epi32( 0xcccc, t0, t2 );
X[1] = _mm512_mask_blend_epi32( 0x6666, t1, t3 );
X[2] = _mm512_mask_blend_epi32( 0x3333, t0, t2 );
X[3] = _mm512_mask_blend_epi32( 0x9999, t1, t3 );
}
static void salsa_unshuffle_4way_simd128( __m512i *X )
static inline void salsa_unshuffle_4way_simd128( __m512i *X )
{
__m512i Y0, Y1, Y2, Y3;
Y0 = _mm512_mask_blend_epi32( 0x8888, X[0], X[1] );
Y1 = _mm512_mask_blend_epi32( 0x1111, X[0], X[1] );
Y2 = _mm512_mask_blend_epi32( 0x2222, X[0], X[1] );
Y3 = _mm512_mask_blend_epi32( 0x4444, X[0], X[1] );
Y0 = _mm512_mask_blend_epi32( 0x4444, Y0, X[2] );
Y1 = _mm512_mask_blend_epi32( 0x8888, Y1, X[2] );
Y2 = _mm512_mask_blend_epi32( 0x1111, Y2, X[2] );
Y3 = _mm512_mask_blend_epi32( 0x2222, Y3, X[2] );
X[0] = _mm512_mask_blend_epi32( 0x2222, Y0, X[3] );
X[1] = _mm512_mask_blend_epi32( 0x4444, Y1, X[3] );
X[2] = _mm512_mask_blend_epi32( 0x8888, Y2, X[3] );
X[3] = _mm512_mask_blend_epi32( 0x1111, Y3, X[3] );
__m512i t0 = _mm512_mask_blend_epi32( 0xcccc, X[0], X[2] );
__m512i t1 = _mm512_mask_blend_epi32( 0x3333, X[0], X[2] );
__m512i t2 = _mm512_mask_blend_epi32( 0x6666, X[1], X[3] );
__m512i t3 = _mm512_mask_blend_epi32( 0x9999, X[1], X[3] );
X[0] = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 );
X[1] = _mm512_mask_blend_epi32( 0x5555, t0, t2 );
X[2] = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 );
X[3] = _mm512_mask_blend_epi32( 0x5555, t1, t3 );
}
static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C)
@@ -1147,46 +1129,28 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N )
// { l1xb, l1xa, l1c9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16]
// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24]
static void salsa_shuffle_2way_simd128( __m256i *X )
static inline void salsa_shuffle_2way_simd128( __m256i *X )
{
__m256i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
Y0 = _mm256_blend_epi32( X[1], X[0], 0x11 );
Z0 = _mm256_blend_epi32( X[3], X[2], 0x44 );
Y1 = _mm256_blend_epi32( X[2], X[1], 0x11 );
Z1 = _mm256_blend_epi32( X[0], X[3], 0x44 );
Y2 = _mm256_blend_epi32( X[3], X[2], 0x11 );
Z2 = _mm256_blend_epi32( X[1], X[0], 0x44 );
Y3 = _mm256_blend_epi32( X[0], X[3], 0x11 );
Z3 = _mm256_blend_epi32( X[2], X[1], 0x44 );
X[0] = _mm256_blend_epi32( Z0, Y0, 0x33 );
X[1] = _mm256_blend_epi32( Z1, Y1, 0x33 );
X[2] = _mm256_blend_epi32( Z2, Y2, 0x33 );
X[3] = _mm256_blend_epi32( Z3, Y3, 0x33 );
__m256i t0 = _mm256_blend_epi32( X[0], X[1], 0xaa );
__m256i t1 = _mm256_blend_epi32( X[0], X[1], 0x55 );
__m256i t2 = _mm256_blend_epi32( X[2], X[3], 0xaa );
__m256i t3 = _mm256_blend_epi32( X[2], X[3], 0x55 );
X[0] = _mm256_blend_epi32( t0, t2, 0xcc );
X[1] = _mm256_blend_epi32( t1, t3, 0x66 );
X[2] = _mm256_blend_epi32( t0, t2, 0x33 );
X[3] = _mm256_blend_epi32( t1, t3, 0x99 );
}
static void salsa_unshuffle_2way_simd128( __m256i *X )
static inline void salsa_unshuffle_2way_simd128( __m256i *X )
{
__m256i Y0, Y1, Y2, Y3;
Y0 = _mm256_blend_epi32( X[0], X[1], 0x88 );
Y1 = _mm256_blend_epi32( X[0], X[1], 0x11 );
Y2 = _mm256_blend_epi32( X[0], X[1], 0x22 );
Y3 = _mm256_blend_epi32( X[0], X[1], 0x44 );
Y0 = _mm256_blend_epi32( Y0, X[2], 0x44 );
Y1 = _mm256_blend_epi32( Y1, X[2], 0x88 );
Y2 = _mm256_blend_epi32( Y2, X[2], 0x11 );
Y3 = _mm256_blend_epi32( Y3, X[2], 0x22 );
X[0] = _mm256_blend_epi32( Y0, X[3], 0x22 );
X[1] = _mm256_blend_epi32( Y1, X[3], 0x44 );
X[2] = _mm256_blend_epi32( Y2, X[3], 0x88 );
X[3] = _mm256_blend_epi32( Y3, X[3], 0x11 );
__m256i t0 = _mm256_blend_epi32( X[0], X[2], 0xcc );
__m256i t1 = _mm256_blend_epi32( X[0], X[2], 0x33 );
__m256i t2 = _mm256_blend_epi32( X[1], X[3], 0x66 );
__m256i t3 = _mm256_blend_epi32( X[1], X[3], 0x99 );
X[0] = _mm256_blend_epi32( t0, t2, 0xaa );
X[1] = _mm256_blend_epi32( t0, t2, 0x55 );
X[2] = _mm256_blend_epi32( t1, t3, 0xaa );
X[3] = _mm256_blend_epi32( t1, t3, 0x55 );
}
static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C)
@@ -2163,7 +2127,7 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
X2 = _mm_blend_epi32( B[1], B[0], 0x4 );
Y3 = _mm_blend_epi32( B[0], B[3], 0x1 );
X3 = _mm_blend_epi32( B[2], B[1], 0x4 );
X0 = _mm_blend_epi32( X0, Y0, 0x3);
X0 = _mm_blend_epi32( X0, Y0, 0x3 );
X1 = _mm_blend_epi32( X1, Y1, 0x3 );
X2 = _mm_blend_epi32( X2, Y2, 0x3 );
X3 = _mm_blend_epi32( X3, Y3, 0x3 );
@@ -2311,91 +2275,34 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
// Double buffered, 2x memory usage
// No interleaving
static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
#if defined(__SSE4_1__)
// __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
#else
// SSE4.1
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
#else // SSE2
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
@@ -2417,7 +2324,7 @@ static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
#endif
}
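The blend-based form of salsa_simd128_shuffle_2buf computes the same word permutation as the SSE2 _mm_set_epi32 fallback just above, only with fewer instructions: each output row gathers one diagonal of the 4x4 Salsa20 state so a quarter-round can work on whole registers. A scalar sketch of that permutation and its inverse; rows 2 and 3 are assumed to continue the diagonal pattern of the two rows visible in the fallback.
#include <stdint.h>
#include <stdio.h>
#include <string.h>
static const int SHUF[16] = {  0,  5, 10, 15,   /* row 0 */
                               4,  9, 14,  3,   /* row 1 */
                               8, 13,  2,  7,   /* row 2, assumed */
                              12,  1,  6, 11 }; /* row 3, assumed */
static void shuffle16( uint32_t *x )
{
   uint32_t t[16];
   for ( int i = 0; i < 16; i++ ) t[i] = x[ SHUF[i] ];
   memcpy( x, t, sizeof t );
}
static void unshuffle16( uint32_t *x )
{
   uint32_t t[16];
   for ( int i = 0; i < 16; i++ ) t[ SHUF[i] ] = x[i];
   memcpy( x, t, sizeof t );
}
int main(void)
{
   uint32_t x[16];
   for ( int i = 0; i < 16; i++ ) x[i] = i;
   shuffle16( x );
   unshuffle16( x );                    /* round-trips to the identity */
   for ( int i = 0; i < 16; i++ ) printf( "%u ", x[i] );
   printf( "\n" );
   return 0;
}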
static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
{
__m128i *XA = (__m128i*)xa;
@@ -2425,67 +2332,22 @@ static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
#if defined(__SSE4_1__)
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
#else // SSE4_1
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
#else // SSE2
@@ -2690,116 +2552,44 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
}
static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
uint32_t *xc )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
__m128i *XC = (__m128i*)xc;
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
#if defined(__SSE4_1__)
__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3, ZC0, ZC1, ZC2, ZC3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
YC0 = _mm_blend_epi32( XC[1], XC[0], 0x1 );
ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
ZC0 = _mm_blend_epi32( XC[3], XC[2], 0x4 );
YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
YC1 = _mm_blend_epi32( XC[2], XC[1], 0x1 );
ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
ZC1 = _mm_blend_epi32( XC[0], XC[3], 0x4 );
YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
YC2 = _mm_blend_epi32( XC[3], XC[2], 0x1 );
ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
ZC2 = _mm_blend_epi32( XC[1], XC[0], 0x4 );
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
YC3 = _mm_blend_epi32( XC[0], XC[3], 0x1 );
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
ZC3 = _mm_blend_epi32( XC[2], XC[1], 0x4 );
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
XC[0] = _mm_blend_epi32( ZC0, YC0, 0x3 );
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
XC[1] = _mm_blend_epi32( ZC1, YC1, 0x3 );
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
XC[2] = _mm_blend_epi32( ZC2, YC2, 0x3 );
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
XC[3] = _mm_blend_epi32( ZC3, YC3, 0x3 );
#else
// SSE4.1
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
YC0 = _mm_blend_epi16( XC[1], XC[0], 0x03 );
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
ZC0 = _mm_blend_epi16( XC[3], XC[2], 0x30 );
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
YC1 = _mm_blend_epi16( XC[2], XC[1], 0x03 );
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
ZC1 = _mm_blend_epi16( XC[0], XC[3], 0x30 );
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
YC2 = _mm_blend_epi16( XC[3], XC[2], 0x03 );
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
ZC2 = _mm_blend_epi16( XC[1], XC[0], 0x30 );
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
YC3 = _mm_blend_epi16( XC[0], XC[3], 0x03 );
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
ZC3 = _mm_blend_epi16( XC[2], XC[1], 0x30 );
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
XC[0] = _mm_blend_epi16( ZC0, YC0, 0x0f );
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
XC[1] = _mm_blend_epi16( ZC1, YC1, 0x0f );
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
XC[2] = _mm_blend_epi16( ZC2, YC2, 0x0f );
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
XC[3] = _mm_blend_epi16( ZC3, YC3, 0x0f );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
t0 = _mm_blend_epi16( XC[0], XC[1], 0xcc );
t1 = _mm_blend_epi16( XC[0], XC[1], 0x33 );
t2 = _mm_blend_epi16( XC[2], XC[3], 0xcc );
t3 = _mm_blend_epi16( XC[2], XC[3], 0x33 );
XC[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XC[1] = _mm_blend_epi16( t1, t3, 0x3c );
XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );
#else // SSE2
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] );
@@ -2829,7 +2619,7 @@ static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
#endif
}
static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
uint32_t* xc )
{
__m128i *XA = (__m128i*)xa;
@@ -2838,91 +2628,30 @@ static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
#if defined(__SSE4_1__)
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
#if defined(__AVX2__)
YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
YC0 = _mm_blend_epi32( XC[0], XC[1], 0x8 );
YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
YC1 = _mm_blend_epi32( XC[0], XC[1], 0x1 );
YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
YC2 = _mm_blend_epi32( XC[0], XC[1], 0x2 );
YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
YC3 = _mm_blend_epi32( XC[0], XC[1], 0x4 );
YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
YC0 = _mm_blend_epi32( YC0, XC[2], 0x4 );
YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
YC1 = _mm_blend_epi32( YC1, XC[2], 0x8 );
YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
YC2 = _mm_blend_epi32( YC2, XC[2], 0x1 );
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
YC3 = _mm_blend_epi32( YC3, XC[2], 0x2 );
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
XC[0] = _mm_blend_epi32( YC0, XC[3], 0x2 );
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
XC[1] = _mm_blend_epi32( YC1, XC[3], 0x4 );
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
XC[2] = _mm_blend_epi32( YC2, XC[3], 0x8 );
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
XC[3] = _mm_blend_epi32( YC3, XC[3], 0x1 );
#else // SSE4_1
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
YC0 = _mm_blend_epi16( XC[0], XC[1], 0xc0 );
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
YC1 = _mm_blend_epi16( XC[0], XC[1], 0x03 );
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
YC2 = _mm_blend_epi16( XC[0], XC[1], 0x0c );
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
YC3 = _mm_blend_epi16( XC[0], XC[1], 0x30 );
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
YC0 = _mm_blend_epi16( YC0, XC[2], 0x30 );
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
YC1 = _mm_blend_epi16( YC1, XC[2], 0xc0 );
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
YC2 = _mm_blend_epi16( YC2, XC[2], 0x03 );
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
YC3 = _mm_blend_epi16( YC3, XC[2], 0x0c );
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
XC[0] = _mm_blend_epi16( YC0, XC[3], 0x0c );
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
XC[1] = _mm_blend_epi16( YC1, XC[3], 0x30 );
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
XC[2] = _mm_blend_epi16( YC2, XC[3], 0xc0 );
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
XC[3] = _mm_blend_epi16( YC3, XC[3], 0x03 );
#endif // AVX2 else SSE4_1
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
t0 = _mm_blend_epi16( XC[0], XC[2], 0xf0 );
t1 = _mm_blend_epi16( XC[0], XC[2], 0x0f );
t2 = _mm_blend_epi16( XC[1], XC[3], 0x3c );
t3 = _mm_blend_epi16( XC[1], XC[3], 0xc3 );
XC[0] = _mm_blend_epi16( t0, t2, 0xcc );
XC[1] = _mm_blend_epi16( t0, t2, 0x33 );
XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
XC[3] = _mm_blend_epi16( t1, t3, 0x33 );
#else // SSE2

View File

@@ -1,270 +0,0 @@
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
/*
* This file contains some functions which implement the external data
* handling and padding for Merkle-Damgard hash functions which follow
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
*
* API: this file is meant to be included, not compiled as a stand-alone
* file. Some macros must be defined:
* RFUN name for the round function
* HASH "short name" for the hash function
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
* LE64 defined for little-endian, 64-bit based (no example yet)
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
* BLEN if defined, length of a message block (in bytes)
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
* SVAL if defined, reference to the context state information
*
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
* this is used for instance for Tiger, which works on 64-bit words but
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
* set, then only one word (64 bits) will be used to encode the input
* message length (in bits), otherwise two words will be used (as in
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
* not PLW1), four 64-bit words will be used to encode the message length
* (in bits). Note that regardless of those settings, only 64-bit message
* lengths are supported (in bits): messages longer than 2 Exabytes will be
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
* 2 million Terabytes, which is huge).
*
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
* function. This is used for Tiger2, which is identical to Tiger except
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
* of the 0x01 from original Tiger).
*
* The RFUN function is invoked with two arguments, the first pointing to
* aligned data (as a "const void *"), the second being state information
* from the context structure. By default, this state information is the
* "val" field from the context, and this field is assumed to be an array
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
* The "val" field can have any type, except
* for the output encoding which assumes that it is an array of "sph_u32"
* values. By defining NO_OUTPUT, this last step is deactivated; the
* includer code is then responsible for writing out the hash result. When
* NO_OUTPUT is defined, the third parameter to the "close()" function is
* ignored.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
#undef SPH_BLEN
#undef SPH_WLEN
#if defined BE64 || defined LE64
#define SPH_BLEN 128U
#define SPH_WLEN 8U
#else
#define SPH_BLEN 64U
#define SPH_WLEN 4U
#endif
#ifdef BLEN
#undef SPH_BLEN
#define SPH_BLEN BLEN
#endif
#undef SPH_MAXPAD
#if defined PLW1
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
#elif defined PLW4
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2))
#else
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1))
#endif
#undef SPH_VAL
#undef SPH_NO_OUTPUT
#ifdef SVAL
#define SPH_VAL SVAL
#define SPH_NO_OUTPUT 1
#else
#define SPH_VAL sc->val
#endif
#ifndef CLOSE_ONLY
#ifdef SPH_UPTR
static void
SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
#else
void
HASH ( void *cc, const void *data, size_t len )
#endif
{
SPH_XCAT( HASH, _context ) *sc;
__m256i *vdata = (__m256i*)data;
size_t ptr;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
while ( len > 0 )
{
size_t clen;
clen = SPH_BLEN - ptr;
if ( clen > len )
clen = len;
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
if ( ptr == SPH_BLEN )
{
RFUN( sc->buf, SPH_VAL );
ptr = 0;
}
sc->count += clen;
}
}
#ifdef SPH_UPTR
void
HASH (void *cc, const void *data, size_t len)
{
SPH_XCAT(HASH, _context) *sc;
__m256i *vdata = (__m256i*)data;
unsigned ptr;
if ( len < (2 * SPH_BLEN) )
{
SPH_XCAT(HASH, _short)(cc, data, len);
return;
}
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
if ( ptr > 0 )
{
unsigned t;
t = SPH_BLEN - ptr;
SPH_XCAT( HASH, _short )( cc, data, t );
vdata = vdata + (t>>3);
len -= t;
}
SPH_XCAT( HASH, _short )( cc, data, len );
}
#endif
#endif
/*
* Perform padding and produce result. The context is NOT reinitialized
* by this function.
*/
static void
SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
void *dst, unsigned rnum )
{
SPH_XCAT(HASH, _context) *sc;
unsigned ptr, u;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
#ifdef PW01
sc->buf[ptr>>3] = m256_const1_64( 0x100 >> 8 );
#else
sc->buf[ptr>>3] = m256_const1_64( 0x80 );
#endif
ptr += 8;
if ( ptr > SPH_MAXPAD )
{
memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
RFUN( sc->buf, SPH_VAL );
memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
}
else
{
memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
}
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
#elif defined PLW4
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
_mm256_set1_epi64x( sc->count >> 61 );
memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
2 * SPH_WLEN );
#else
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
_mm256_set1_epi64x( sc->count >> 61 );
#endif // PLW
#endif // LE64
RFUN( sc->buf, SPH_VAL );
#ifdef SPH_NO_OUTPUT
(void)dst;
(void)rnum;
(void)u;
#else
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif
}
#endif
}
static void
SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
{
SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
}

View File

@@ -711,8 +711,11 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
{
__m256i A, B, C, D, E, F, G, H;
X[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
// W[9:14] are zero, therefore X[9:13] are also zero and not needed.
// Except X[ 9] which is part of W[ 0] from the third group.
X[ 0] = _mm256_add_epi32( SSG2_0x( W[ 1] ), W[ 0] );
X[ 1] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( W[15] ),
SSG2_0x( W[ 2] ) ), W[ 1] );
X[ 2] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 0] ), W[11] ),
W[ 2] );
X[ 3] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 1] ), W[12] ),
@@ -725,16 +728,12 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
W[ 6] );
X[ 7] = _mm256_add_epi32( _mm256_add_epi32( X[ 0], SSG2_0x( W[ 8] ) ),
W[ 7] );
X[ 8] = _mm256_add_epi32( _mm256_add_epi32( X[ 1], SSG2_0x( W[ 9] ) ),
W[ 8] );
X[ 9] = _mm256_add_epi32( SSG2_0x( W[10] ), W[ 9] );
X[10] = _mm256_add_epi32( SSG2_0x( W[11] ), W[10] );
X[11] = _mm256_add_epi32( SSG2_0x( W[12] ), W[11] );
X[12] = _mm256_add_epi32( SSG2_0x( W[13] ), W[12] );
X[13] = _mm256_add_epi32( SSG2_0x( W[14] ), W[13] );
X[14] = _mm256_add_epi32( SSG2_0x( W[15] ), W[14] );
X[ 8] = _mm256_add_epi32( X[ 1], W[ 8] );
X[14] = SSG2_0x( W[15] );
X[15] = _mm256_add_epi32( SSG2_0x( X[ 0] ), W[15] );
X[ 9] = _mm256_add_epi32( SSG2_0x( X[ 1] ), X[ 0] );
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in + 1 );
C = _mm256_load_si256( state_in + 2 );
@@ -779,10 +778,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H );
#endif
@@ -810,23 +805,36 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
W[ 6] = _mm256_add_epi32( X[ 6], SSG2_1x( W[ 4] ) );
W[ 7] = _mm256_add_epi32( X[ 7], SSG2_1x( W[ 5] ) );
W[ 8] = _mm256_add_epi32( X[ 8], SSG2_1x( W[ 6] ) );
W[ 9] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[ 7] ),
W[ 2] ) );
W[10] = _mm256_add_epi32( X[10], _mm256_add_epi32( SSG2_1x( W[ 8] ),
W[ 3] ) );
W[11] = _mm256_add_epi32( X[11], _mm256_add_epi32( SSG2_1x( W[ 9] ),
W[ 4] ) );
W[12] = _mm256_add_epi32( X[12], _mm256_add_epi32( SSG2_1x( W[10] ),
W[ 5] ) );
W[13] = _mm256_add_epi32( X[13], _mm256_add_epi32( SSG2_1x( W[11] ),
W[ 6] ) );
W[ 9] = _mm256_add_epi32( SSG2_1x( W[ 7] ), W[ 2] );
W[10] = _mm256_add_epi32( SSG2_1x( W[ 8] ), W[ 3] );
W[11] = _mm256_add_epi32( SSG2_1x( W[ 9] ), W[ 4] );
W[12] = _mm256_add_epi32( SSG2_1x( W[10] ), W[ 5] );
W[13] = _mm256_add_epi32( SSG2_1x( W[11] ), W[ 6] );
W[14] = _mm256_add_epi32( X[14], _mm256_add_epi32( SSG2_1x( W[12] ),
W[ 7] ) );
W[15] = _mm256_add_epi32( X[15], _mm256_add_epi32( SSG2_1x( W[13] ),
W[ 8] ) );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x8_MSG_EXPANSION( W );
W[ 0] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[14] ),
W[ 9] ) );
W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] );
W[13] = SHA2x_MEXP( W[11], W[ 6], W[14], W[13] );
W[14] = SHA2x_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA2x_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x8_MSG_EXPANSION( W );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -1201,9 +1209,13 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
{
__m512i A, B, C, D, E, F, G, H;
// precalculate constant part msg expansion for second iteration.
X[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
// X is pre-expanded constant part of msg for second group, rounds 16 to 31.
// W[9:14] are zero, therefore X[9:13] are also zero and not needed.
// Except X[ 9] which is used to pre-expand part of W[ 0] from the third
// group, rounds 32 to 47.
X[ 0] = _mm512_add_epi32( SSG2_0x16( W[ 1] ), W[ 0] );
X[ 1] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( W[15] ),
SSG2_0x16( W[ 2] ) ), W[ 1] );
X[ 2] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 0] ), W[11] ),
W[ 2] );
X[ 3] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 1] ), W[12] ),
@@ -1216,16 +1228,12 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
W[ 6] );
X[ 7] = _mm512_add_epi32( _mm512_add_epi32( X[ 0], SSG2_0x16( W[ 8] ) ),
W[ 7] );
X[ 8] = _mm512_add_epi32( _mm512_add_epi32( X[ 1], SSG2_0x16( W[ 9] ) ),
W[ 8] );
X[ 9] = _mm512_add_epi32( SSG2_0x16( W[10] ), W[ 9] );
X[10] = _mm512_add_epi32( SSG2_0x16( W[11] ), W[10] );
X[11] = _mm512_add_epi32( SSG2_0x16( W[12] ), W[11] );
X[12] = _mm512_add_epi32( SSG2_0x16( W[13] ), W[12] );
X[13] = _mm512_add_epi32( SSG2_0x16( W[14] ), W[13] );
X[14] = _mm512_add_epi32( SSG2_0x16( W[15] ), W[14] );
X[ 8] = _mm512_add_epi32( X[ 1], W[ 8] );
X[14] = SSG2_0x16( W[15] );
X[15] = _mm512_add_epi32( SSG2_0x16( X[ 0] ), W[15] );
X[ 9] = _mm512_add_epi32( SSG2_0x16( X[ 1] ), X[ 0] );
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in + 1 );
C = _mm512_load_si512( state_in + 2 );
@@ -1280,7 +1288,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
// update precalculated msg expansion with new nonce: W[3].
// inject nonce, W[3], to complete msg expansion.
W[ 0] = X[ 0];
W[ 1] = X[ 1];
W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
@@ -1290,23 +1298,36 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
W[ 6] = _mm512_add_epi32( X[ 6], SSG2_1x16( W[ 4] ) );
W[ 7] = _mm512_add_epi32( X[ 7], SSG2_1x16( W[ 5] ) );
W[ 8] = _mm512_add_epi32( X[ 8], SSG2_1x16( W[ 6] ) );
W[ 9] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[ 7] ),
W[ 2] ) );
W[10] = _mm512_add_epi32( X[10], _mm512_add_epi32( SSG2_1x16( W[ 8] ),
W[ 3] ) );
W[11] = _mm512_add_epi32( X[11], _mm512_add_epi32( SSG2_1x16( W[ 9] ),
W[ 4] ) );
W[12] = _mm512_add_epi32( X[12], _mm512_add_epi32( SSG2_1x16( W[10] ),
W[ 5] ) );
W[13] = _mm512_add_epi32( X[13], _mm512_add_epi32( SSG2_1x16( W[11] ),
W[ 6] ) );
W[ 9] = _mm512_add_epi32( SSG2_1x16( W[ 7] ), W[ 2] );
W[10] = _mm512_add_epi32( SSG2_1x16( W[ 8] ), W[ 3] );
W[11] = _mm512_add_epi32( SSG2_1x16( W[ 9] ), W[ 4] );
W[12] = _mm512_add_epi32( SSG2_1x16( W[10] ), W[ 5] );
W[13] = _mm512_add_epi32( SSG2_1x16( W[11] ), W[ 6] );
W[14] = _mm512_add_epi32( X[14], _mm512_add_epi32( SSG2_1x16( W[12] ),
W[ 7] ) );
W[15] = _mm512_add_epi32( X[15], _mm512_add_epi32( SSG2_1x16( W[13] ),
W[ 8] ) );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x16_MSG_EXPANSION( W );
W[ 0] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[14] ),
W[ 9] ) );
W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] );
W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] );
W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x16_MSG_EXPANSION( W );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -1336,8 +1357,8 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16]; memcpy_512( W, data, 16 );
// Value for H at round 60, before adding K, to produce valid final hash
//where H == 0.
// Value for H at round 60, before adding K, needed to produce valid final
// hash where H == 0.
// H_ = -( H256[7] + K256[60] );
const __m512i H_ = m512_const1_32( 0x136032ED );
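
The hunks above implement a nonce-scan optimization for the second SHA-256 block: only W[3] (the nonce word) changes between iterations, so sha256_*way_prehash_3rounds() runs the first three rounds and folds every nonce-independent message-expansion term into X[], while sha256_*way_final_rounds() adds only the terms that actually involve W[3]. A scalar sketch of the idea, with hypothetical names and the standard SHA-256 sigma functions (illustrative only, not the vectorized code above):

#include <stdint.h>

#define ROR32(x,n)  ( ((x) >> (n)) | ((x) << (32 - (n))) )
#define SSG0(x)     ( ROR32(x, 7) ^ ROR32(x,18) ^ ((x) >>  3) )
#define SSG1(x)     ( ROR32(x,17) ^ ROR32(x,19) ^ ((x) >> 10) )

/* Precompute the nonce-independent parts of two expanded words, assuming
 * only W[3] changes during the scan (as in the scanhash loops that use it). */
static void prehash_expansion( const uint32_t W[16], uint32_t *X16, uint32_t *X18 )
{
    *X16 = SSG1( W[14] ) + W[ 9] + SSG0( W[ 1] ) + W[ 0];   /* fully constant   */
    *X18 = SSG1( *X16 )  + W[11] + W[ 2];                    /* missing SSG0(W3) */
}

/* Per-nonce completion: inject the new nonce word only. */
static void finish_expansion( uint32_t X16, uint32_t X18, uint32_t nonce_w3,
                              uint32_t *W16, uint32_t *W18 )
{
    *W16 = X16;
    *W18 = X18 + SSG0( nonce_w3 );
}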

268
algo/sha/sha256dt.c Normal file
View File

@@ -0,0 +1,268 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha-hash-4way.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256DT_16WAY 1
#elif defined(__AVX2__)
#define SHA256DT_8WAY 1
#else
#define SHA256DT_4WAY 1
#endif
#if defined(SHA256DT_16WAY)
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i vdata[32] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i initstate[8] __attribute__ ((aligned (64)));
__m512i midstate1[8] __attribute__ ((aligned (64)));
__m512i midstate2[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm512_bcast_i32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = mm512_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = mm512_bcast_i32( 0x300 );
initstate[0] = mm512_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm512_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm512_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm512_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm512_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm512_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm512_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm512_bcast_i64( 0x9f75c9ad9f75c9ad );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
do
{
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
sha256_16way_transform_le( hash32, block, initstate );
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_8WAY)
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m256i vdata[32] __attribute__ ((aligned (64)));
__m256i block[16] __attribute__ ((aligned (32)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm256_bcast_i32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = mm256_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = mm256_bcast_i32( 0x300 );
// initialize state
initstate[0] = mm256_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm256_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm256_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm256_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm256_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm256_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm256_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm256_bcast_i64( 0x9f75c9ad9f75c9ad );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
do
{
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
sha256_8way_transform_le( hash32, block, initstate );
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_4WAY)
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm128_bcast_i32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = mm128_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = mm128_bcast_i32( 0x300 );
// initialize state
initstate[0] = mm128_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm128_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm128_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm128_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm128_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm128_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm128_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm128_bcast_i64( 0x9f75c9ad9f75c9ad );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate, vdata, initstate );
do
{
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( hash32, block, initstate );
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
bool register_sha256dt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256DT_16WAY)
gate->scanhash = (void*)&scanhash_sha256dt_16way;
#elif defined(SHA256DT_8WAY)
gate->scanhash = (void*)&scanhash_sha256dt_8way;
#else
gate->scanhash = (void*)&scanhash_sha256dt_4way;
#endif
return true;
}
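
Taken together, sha256dt is two SHA-256 compressions chained with a non-standard initial state (the initstate[] constants above) and non-standard length fields: 0x480 bits for the padded 80-byte header and 0x300 bits for the padded 32-byte intermediate digest, exactly as laid out in the padding words of the scanhash functions. A hedged scalar sketch, assuming a generic single-block compression helper sha256_transform_le( state, block ) that is not defined in this file; the final byte swap of the digest is omitted for brevity:

#include <stdint.h>
#include <string.h>

/* Assumed helper: one SHA-256 compression over a 64-byte little-endian block. */
void sha256_transform_le( uint32_t state[8], const uint32_t block[16] );

static const uint32_t sha256dt_iv[8] =
{
    0xdfa9bf2c, 0xb72074d4, 0x6bb01122, 0xd338e869,
    0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
};

/* Scalar sha256dt over an 80-byte header (20 little-endian words). */
static void sha256dt_hash_ref( uint32_t hash[8], const uint32_t data[20] )
{
    uint32_t state[8], block[16];

    memcpy( state, sha256dt_iv, sizeof state );
    sha256_transform_le( state, data );                 /* first 64 bytes        */

    memcpy( block, data + 16, 4 * sizeof (uint32_t) );  /* last 16 header bytes  */
    block[ 4] = 0x80000000;                             /* padding marker        */
    memset( block + 5, 0, 10 * sizeof (uint32_t) );
    block[15] = 0x480;                                   /* non-standard length   */
    sha256_transform_le( state, block );

    memcpy( block, state, 32 );                          /* hash the digest again */
    block[ 8] = 0x80000000;
    memset( block + 9, 0, 6 * sizeof (uint32_t) );
    block[15] = 0x300;                                   /* non-standard length   */
    memcpy( state, sha256dt_iv, sizeof state );
    sha256_transform_le( state, block );

    memcpy( hash, state, 32 );
}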

221
algo/sha/sha512256d-4way.c Normal file
View File

@@ -0,0 +1,221 @@
#include "algo-gate-api.h"
#include "sha-hash-4way.h"
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1
#endif
#if defined(SHA512256D_8WAY)
static void sha512256d_8way_init( sha512_8way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = mm512_bcast_i64( 0x22312194FC2BF72C );
ctx->val[1] = mm512_bcast_i64( 0x9F555FA3C84C64C2 );
ctx->val[2] = mm512_bcast_i64( 0x2393B86B6F53B151 );
ctx->val[3] = mm512_bcast_i64( 0x963877195940EABD );
ctx->val[4] = mm512_bcast_i64( 0x96283EE2A88EFFE3 );
ctx->val[5] = mm512_bcast_i64( 0xBE5E1E2553863992 );
ctx->val[6] = mm512_bcast_i64( 0x2B0199FC2C85B8AA );
ctx->val[7] = mm512_bcast_i64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
sha512_8way_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*8]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i eight = mm512_bcast_i64( 0x0000000800000000 );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do
{
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, vdata, 80 );
sha512_8way_close( &ctx, hash );
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, hash, 32 );
sha512_8way_close( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(SHA512256D_4WAY)
static void sha512256d_4way_init( sha512_4way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = mm256_bcast_i64( 0x22312194FC2BF72C );
ctx->val[1] = mm256_bcast_i64( 0x9F555FA3C84C64C2 );
ctx->val[2] = mm256_bcast_i64( 0x2393B86B6F53B151 );
ctx->val[3] = mm256_bcast_i64( 0x963877195940EABD );
ctx->val[4] = mm256_bcast_i64( 0x96283EE2A88EFFE3 );
ctx->val[5] = mm256_bcast_i64( 0xBE5E1E2553863992 );
ctx->val[6] = mm256_bcast_i64( 0x2B0199FC2C85B8AA );
ctx->val[7] = mm256_bcast_i64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
sha512_4way_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = mm256_bcast_i64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, vdata, 80 );
sha512_4way_close( &ctx, hash );
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, hash, 32 );
sha512_4way_close( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#else
#include "sph_sha2.h"
static const uint64_t H512_256[8] =
{
0x22312194FC2BF72C, 0x9F555FA3C84C64C2,
0x2393B86B6F53B151, 0x963877195940EABD,
0x96283EE2A88EFFE3, 0xBE5E1E2553863992,
0x2B0199FC2C85B8AA, 0x0EB72DDC81C52CA2,
};
static void sha512256d_init( sph_sha512_context *ctx )
{
memcpy( ctx->val, H512_256, sizeof H512_256 );
ctx->count = 0;
}
int scanhash_sha512256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t hash64[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
sph_sha512_context ctx;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
do {
be32enc( &endiandata[19], n );
sha512256d_init( &ctx );
sph_sha512( &ctx, endiandata, 80 );
sph_sha512_close( &ctx, hash64 );
sha512256d_init( &ctx );
sph_sha512( &ctx, hash64, 32 );
sph_sha512_close( &ctx, hash64 );
if ( hash64[7] <= Htarg )
if ( fulltest( hash64, ptarget ) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif
bool register_sha512256d_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(SHA512256D_8WAY)
gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)
gate->scanhash = (void*)&scanhash_sha512256d_4way;
#else
gate->scanhash = (void*)&scanhash_sha512256d;
#endif
return true;
};
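
sha512256d is double SHA-512/256: the generic SHA-512 engine is run with the SHA-512/256 initial values (the H512_256 constants above) and only the first 256 bits of each digest are kept. A compact scalar restatement using the same sph_sha512 calls and the sha512256d_init() shown in the fallback path (illustrative only):

#include <stdint.h>
#include <string.h>
#include "sph_sha2.h"

/* Double SHA-512/256 of an arbitrary message; mirrors the scalar fallback
 * above, with the truncation to 32 bytes made explicit. */
static void sha512256d_ref( void *out32, const void *data, size_t len )
{
    uint64_t digest[8];
    sph_sha512_context ctx;

    sha512256d_init( &ctx );            /* load H512_256 instead of the SHA-512 IV */
    sph_sha512( &ctx, data, len );
    sph_sha512_close( &ctx, digest );

    sha512256d_init( &ctx );
    sph_sha512( &ctx, digest, 32 );     /* second pass over the truncated digest   */
    sph_sha512_close( &ctx, digest );

    memcpy( out32, digest, 32 );        /* SHA-512/256 keeps the first 256 bits    */
}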

View File

@@ -33,6 +33,7 @@
#include <stddef.h>
#include <string.h>
// 4way is only used with AVX2, 8way only with AVX512, 16way is not needed.
#ifdef __SSE4_1__
#include "shabal-hash-4way.h"
@@ -44,21 +45,6 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
/*
* Part of this code was automatically generated (the part between
* the "BEGIN" and "END" markers).
*/
#define sM 16
#define C32 SPH_C32
#define T32 SPH_T32
#define O1 13
#define O2 9
#define O3 6
#if defined(__AVX2__)
#define DECL_STATE8 \
@@ -310,72 +296,71 @@ do { \
mm256_swap512_256( BF, CF ); \
} while (0)
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \
_mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
FIVE ) ), THREE ) ) ); \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0)
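
The rewritten PERM_ELT8 above is still the standard Shabal permutation element, just expressed with the three-input helpers mm256_xor3 and mm256_xorandnot: A0' = M ^ B1 ^ (~B3 & B2) ^ 3*(A0 ^ C ^ 5*rol(A1,15)), followed by B0' = ~(A0' ^ rol(B0,1)). A scalar sketch of that element (illustrative only; names are not from this source):

#include <stdint.h>

#define ROL32(x,n)  ( ((x) << (n)) | ((x) >> (32 - (n))) )

/* One Shabal permutation element; U(x) = 3*x and V(x) = 5*x are the Shabal
 * multipliers, all arithmetic modulo 2^32. */
static void perm_elt( uint32_t *a0, uint32_t a1,
                      uint32_t *b0, uint32_t b1, uint32_t b2, uint32_t b3,
                      uint32_t c,  uint32_t m )
{
    *a0 = m ^ b1 ^ ( ~b3 & b2 )
            ^ 3U * ( *a0 ^ c ^ 5U * ROL32( a1, 15 ) );
    *b0 = ~( *a0 ^ ROL32( *b0, 1 ) );
}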
#define PERM_STEP_0_8 do { \
PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A0, AB, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A1, A0, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( A2, A1, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( A3, A2, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A4, A3, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A5, A4, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( A6, A5, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( A7, A6, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A8, A7, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A9, A8, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( AA, A9, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( AB, AA, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A0, AB, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A1, A0, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( A2, A1, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( A3, A2, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define PERM_STEP_1_8 do { \
PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A4, A3, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A5, A4, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( A6, A5, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( A7, A6, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A8, A7, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A9, A8, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( AA, A9, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( AB, AA, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A0, AB, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A1, A0, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( A2, A1, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( A3, A2, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A4, A3, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A5, A4, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( A6, A5, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( A7, A6, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define PERM_STEP_2_8 do { \
PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \
} while (0)
PERM_ELT8( A8, A7, B0, BD, B9, B6, C8, M0 ); \
PERM_ELT8( A9, A8, B1, BE, BA, B7, C7, M1 ); \
PERM_ELT8( AA, A9, B2, BF, BB, B8, C6, M2 ); \
PERM_ELT8( AB, AA, B3, B0, BC, B9, C5, M3 ); \
PERM_ELT8( A0, AB, B4, B1, BD, BA, C4, M4 ); \
PERM_ELT8( A1, A0, B5, B2, BE, BB, C3, M5 ); \
PERM_ELT8( A2, A1, B6, B3, BF, BC, C2, M6 ); \
PERM_ELT8( A3, A2, B7, B4, B0, BD, C1, M7 ); \
PERM_ELT8( A4, A3, B8, B5, B1, BE, C0, M8 ); \
PERM_ELT8( A5, A4, B9, B6, B2, BF, CF, M9 ); \
PERM_ELT8( A6, A5, BA, B7, B3, B0, CE, MA ); \
PERM_ELT8( A7, A6, BB, B8, B4, B1, CD, MB ); \
PERM_ELT8( A8, A7, BC, B9, B5, B2, CC, MC ); \
PERM_ELT8( A9, A8, BD, BA, B6, B3, CB, MD ); \
PERM_ELT8( AA, A9, BE, BB, B7, B4, CA, ME ); \
PERM_ELT8( AB, AA, BF, BC, B8, B5, C9, MF ); \
} while (0)
#define APPLY_P8 \
do { \
@@ -437,8 +422,8 @@ do { \
} while (0)
#define INCR_W8 do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
if ( ( Wlow = Wlow + 1 ) == 0 ) \
Whigh = Whigh + 1; \
} while (0)
static void
@@ -650,15 +635,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
shabal_8way_close(cc, ub, n, dst, 16);
}
#endif // AVX2
/*
* We copy the state into local variables, so that the compiler knows
* that it can optimize them at will.
*/
#define DECL_STATE \
__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
A8, A9, AA, AB; \
@@ -888,15 +866,6 @@ do { \
A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
} while (0)
/*
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
(v1) = (v2); \
(v2) = tmp; \
} while (0)
*/
#define SWAP_BC \
do { \
mm128_swap256_128( B0, C0 ); \
@@ -917,18 +886,6 @@ do { \
mm128_swap256_128( BF, CF ); \
} while (0)
/*
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
__m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\
_mm_set1_epi32(5UL) ) \
__m128i t2 = _mm_xor_si128( xa0, xc ); \
xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
_mm_xor_si128( t2, \
_mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
*/
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
@@ -1056,8 +1013,8 @@ do { \
} while (0)
#define INCR_W do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
if ( ( Wlow = Wlow + 1 ) == 0 ) \
Whigh = Whigh + 1; \
} while (0)
/*

View File

@@ -75,7 +75,6 @@ void shabal512_8way_close( void *cc, void *dst );
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#endif
typedef struct {
@@ -97,7 +96,6 @@ void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void shabal512_4way_init( void *cc );
void shabal512_4way_update( void *cc, const void *data, size_t len );
//#define shabal512_4way shabal512_4way_update
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );

View File

@@ -383,11 +383,17 @@ static const m512_v16 FFT256_Twiddle4w[] =
#define shufxor4w(x,s) _mm512_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
_mm512_srai_epi16( x, 8 ) )
/*
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \
0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) )
*/
#define EXTRA_REDUCE_S4w(x)\
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
m512_const1_64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
@@ -400,8 +406,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
#define DO_REDUCE_FULL_S4w(i) \
do { \
X(i) = REDUCE4w( X(i) ); \
X(i) = EXTRA_REDUCE_S4w( X(i) ); \
X(i) = REDUCE4w( X(i) ); \
X(i) = EXTRA_REDUCE_S4w( X(i) ); \
} while(0)
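
REDUCE4w above is the partial reduction modulo 257 used by the SIMD NTT: since 256 ≡ -1 (mod 257), a 16-bit coefficient x = 256*hi + lo is congruent to lo - hi, which is what both the old mask-and version and the new maskz_mov_epi8 version compute per lane; EXTRA_REDUCE_S4w then subtracts 257 once to bring the value into its final range. A scalar sketch of the same arithmetic (illustrative only; the 128 threshold follows the usual SIMD reference code and is an assumption here):

#include <stdint.h>

/* Partial reduction of a 16-bit NTT coefficient modulo 257:
 * 256 == -1 (mod 257), so x = 256*hi + lo == lo - hi (mod 257). */
static int16_t reduce257( int16_t x )
{
    return (int16_t)( ( x & 0xff ) - ( x >> 8 ) );  /* arithmetic shift keeps sign */
}

/* Conditional final correction, bringing the value into [-128, 128]. */
static int16_t extra_reduce257( int16_t x )
{
    return ( x > 128 ) ? (int16_t)( x - 257 ) : x;
}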
@@ -431,10 +437,6 @@ void fft64_4way( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
// targeted
#define BUTTERFLY_0( i,j ) \
do { \
@@ -443,25 +445,25 @@ do { \
X(i) = _mm512_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i v = X(j); \
X(j) = _mm512_add_epi16( X(i), X(j) ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w[n] ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
} while(0)
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 1 );
@@ -501,12 +503,11 @@ do { \
// Transpose the FFT state with a revbin order permutation
// on the rows and the column.
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
#define INTERLEAVE( i, j ) \
do { \
__m512i t1= X(i); \
__m512i t2= X(j); \
X(i) = _mm512_unpacklo_epi16( t1, t2 ); \
X(j) = _mm512_unpackhi_epi16( t1, t2 ); \
__m512i u = X(j); \
X(j) = _mm512_unpackhi_epi16( X(i), X(j) ); \
X(i) = _mm512_unpacklo_epi16( X(i), u ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -534,10 +535,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i u = X(j); \
X(i) = _mm512_slli_epi16( X(i), w[n] ); \
X(i) = _mm512_slli_epi16( X(i), w ); \
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
X(i) = _mm512_add_epi16( u, X(i) ); \
} while(0)
@@ -558,15 +559,15 @@ do { \
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE_FULL_S4w( 0 );
DO_REDUCE_FULL_S4w( 1 );
@@ -599,7 +600,6 @@ void fft128_4way( void *a )
// Temp space to help for interleaving in the end
__m512i B[8];
__m512i *A = (__m512i*) a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
/* Size-2 butterflies */
for ( i = 0; i<8; i++ )
@@ -633,7 +633,6 @@ void fft128_4way_msg( uint16_t *a, const uint8_t *x, int final )
__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
#define UNPACK( i ) \
do { \
@@ -686,7 +685,6 @@ void fft256_4way_msg( uint16_t *a, const uint8_t *x, int final )
__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT256_Twiddle;
#define UNPACK( i ) \
do { \
@@ -776,109 +774,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
// We split the round function into two halves
// so that some independent computations can be inserted in between.
// generic
#if 0
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = shufxor( a##l, 1 ); \
d##h = shufxor( a##h, 1 ); \
} while(0)
#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = shufxor( a##h, 2 ); \
d##h = shufxor( a##l, 2 ); \
} while(0)
#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = shufxor( a##l, 2 ); \
d##h = shufxor( a##h, 2 ); \
} while(0)
#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = shufxor( a##l, 3 ); \
d##h = shufxor( a##h, 3 ); \
} while(0)
#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = shufxor( a##h, 1 ); \
d##h = shufxor( a##l, 1 ); \
} while(0)
#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = shufxor( a##h, 3 ); \
d##h = shufxor( a##l, 3 ); \
} while(0)
#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#endif
// targeted
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \

View File

@@ -1106,8 +1106,7 @@ skein256_4way_close(void *cc, void *dst)
}
// Do not use with 128 bit data
// Broken for 80 & 128 bytes, use prehash or full
void
skein512_4way_update(void *cc, const void *data, size_t len)
{

View File

@@ -31,18 +31,19 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);
skeinhash(hash64, endiandata);
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -34,31 +34,31 @@ void skein2hash(void *output, const void *input)
sph_skein512_close(&ctx_skein, hash);
memcpy(output, hash, 32);
}
int scanhash_skein2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t hash64[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);
skein2hash(hash64, endiandata);
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -1,291 +0,0 @@
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
/*
* This file contains some functions which implement the external data
* handling and padding for Merkle-Damgard hash functions which follow
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
*
* API: this file is meant to be included, not compiled as a stand-alone
* file. Some macros must be defined:
* RFUN name for the round function
* HASH "short name" for the hash function
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
* LE64 defined for little-endian, 64-bit based (no example yet)
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
* BLEN if defined, length of a message block (in bytes)
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
* SVAL if defined, reference to the context state information
*
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
* this is used for instance for Tiger, which works on 64-bit words but
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
* set, then only one word (64 bits) will be used to encode the input
* message length (in bits), otherwise two words will be used (as in
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
* not PLW1), four 64-bit words will be used to encode the message length
* (in bits). Note that regardless of those settings, only 64-bit message
* lengths are supported (in bits): messages longer than 2 Exabytes will be
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
* 2 millions Terabytes, which is huge).
*
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
* function. This is used for Tiger2, which is identical to Tiger except
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
* of the 0x01 from original Tiger).
*
* The RFUN function is invoked with two arguments, the first pointing to
* aligned data (as a "const void *"), the second being state information
* from the context structure. By default, this state information is the
* "val" field from the context, and this field is assumed to be an array
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
* from the context structure. The "val" field can have any type, except
* for the output encoding which assumes that it is an array of "sph_u32"
* values. By defining NO_OUTPUT, this last step is deactivated; the
* includer code is then responsible for writing out the hash result. When
* NO_OUTPUT is defined, the third parameter to the "close()" function is
* ignored.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
#undef SPH_BLEN
#undef SPH_WLEN
#if defined BE64 || defined LE64
#define SPH_BLEN 128U
#define SPH_WLEN 8U
#else
#define SPH_BLEN 64U
#define SPH_WLEN 4U
#endif
#ifdef BLEN
#undef SPH_BLEN
#define SPH_BLEN BLEN
#endif
#undef SPH_MAXPAD
#if defined PLW1
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
#elif defined PLW4
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2))
#else
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1))
#endif
#undef SPH_VAL
#undef SPH_NO_OUTPUT
#ifdef SVAL
#define SPH_VAL SVAL
#define SPH_NO_OUTPUT 1
#else
#define SPH_VAL sc->val
#endif
#ifndef CLOSE_ONLY
#ifdef SPH_UPTR
static void
SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
#else
void
HASH ( void *cc, const void *data, size_t len )
#endif
{
SPH_XCAT( HASH, _context ) *sc;
__m256i *vdata = (__m256i*)data;
size_t ptr;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
while ( len > 0 )
{
size_t clen;
clen = SPH_BLEN - ptr;
if ( clen > len )
clen = len;
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
if ( ptr == SPH_BLEN )
{
RFUN( sc->buf, SPH_VAL );
ptr = 0;
}
sc->count += clen;
}
}
#ifdef SPH_UPTR
void
HASH (void *cc, const void *data, size_t len)
{
SPH_XCAT(HASH, _context) *sc;
__m256i *vdata = (__m256i*)data;
unsigned ptr;
if ( len < (2 * SPH_BLEN) )
{
SPH_XCAT(HASH, _short)(cc, data, len);
return;
}
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
if ( ptr > 0 )
{
unsigned t;
t = SPH_BLEN - ptr;
SPH_XCAT( HASH, _short )( cc, data, t );
vdata = vdata + (t>>3);
len -= t;
}
SPH_XCAT( HASH, _short )( cc, data, len );
}
#endif
#endif
/*
* Perform padding and produce result. The context is NOT reinitialized
* by this function.
*/
static void
SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
void *dst, unsigned rnum )
{
SPH_XCAT(HASH, _context) *sc;
unsigned ptr, u;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
//uint64_t *b= (uint64_t*)sc->buf;
//uint64_t *s= (uint64_t*)sc->state;
//printf("Vptr 1= %u\n", ptr);
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
#ifdef PW01
sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x100 >> 8 );
// sc->buf[ptr++] = 0x100 >> 8;
#else
// need to overwrite exactly one byte
// sc->buf[ptr>>3] = _mm256_set_epi64x( 0, 0, 0, 0x80 );
sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
// ptr++;
#endif
ptr += 8;
//printf("Vptr 2= %u\n", ptr);
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
if ( ptr > SPH_MAXPAD )
{
memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
RFUN( sc->buf, SPH_VAL );
memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
}
else
{
memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
}
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
#elif defined PLW4
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
_mm256_set1_epi64x( c->count >> 61 );
memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
2 * SPH_WLEN );
#else
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
_mm256_set1_epi64x( sc->count >> 61 );
#endif // PLW
#endif // LE64
//printf("Vptr 3= %u\n", ptr);
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
RFUN( sc->buf, SPH_VAL );
//printf("Vptr after= %u\n", ptr);
//printf("VState %016llx %016llx %016llx %016llx\n", s[0], s[4], s[8], s[12] );
//printf("VState %016llx %016llx %016llx %016llx\n", s[16], s[20], s[24], s[28] );
#ifdef SPH_NO_OUTPUT
(void)dst;
(void)rnum;
(void)u;
#else
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif
}
#endif
}
static void
SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
{
SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
}

File diff suppressed because it is too large

View File

@@ -1,108 +0,0 @@
/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
/**
* WHIRLPOOL interface.
*
* WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
* version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
* (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
* version, 2003, with a new diffusion matrix, also described as "plain
* WHIRLPOOL"). All three variants are implemented here.
*
* The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
* M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
* NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
*
* The current WHIRLPOOL specification and a reference implementation
* can be found on the WHIRLPOOL web page:
* http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_whirlpool.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef WHIRLPOOL_HASH_4WAY_H__
#define WHIRLPOOL_HASH_4WAY_H__
#ifdef __AVX2__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
/**
* Output size (in bits) for WHIRLPOOL.
*/
#define SPH_SIZE_whirlpool 512
/**
* Output size (in bits) for WHIRLPOOL-0.
*/
#define SPH_SIZE_whirlpool0 512
/**
* Output size (in bits) for WHIRLPOOL-1.
*/
#define SPH_SIZE_whirlpool1 512
typedef struct {
__m256i buf[8] __attribute__ ((aligned (64)));
__m256i state[8];
sph_u64 count;
} whirlpool_4way_context;
void whirlpool_4way_init( void *cc );
void whirlpool_4way( void *cc, const void *data, size_t len );
void whirlpool_4way_close( void *cc, void *dst );
/**
* WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL.
*/
typedef whirlpool_4way_context whirlpool0_4way_context;
#define whirlpool0_4way_init whirlpool_4way_init
void whirlpool0_4way( void *cc, const void *data, size_t len );
void whirlpool0_4way_close( void *cc, void *dst );
/**
* WHIRLPOOL-1 uses the same structure than plain WHIRLPOOL.
*/
typedef whirlpool_4way_context whirlpool1_4way_context;
#define whirlpool1_4way_init whirlpool_4way_init
void whirlpool1_4way(void *cc, const void *data, size_t len);
void whirlpool1_4way_close(void *cc, void *dst);
#endif
#endif

View File

@@ -12,6 +12,7 @@
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(__VAES__)
@@ -22,15 +23,15 @@
#if defined (C11_8WAY)
typedef struct {
union _c11_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
cube_4way_2buf_context cube;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -40,32 +41,14 @@ typedef struct {
sph_shavite512_context shavite;
hashState_echo echo;
#endif
} c11_8way_ctx_holder;
simd_4way_context simd;
} __attribute__ ((aligned (64)));
typedef union _c11_8way_context_overlay c11_8way_context_overlay;
c11_8way_ctx_holder c11_8way_ctx;
static __thread __m512i c11_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
void init_c11_8way_ctx()
{
blake512_8way_init( &c11_8way_ctx.blake );
bmw512_8way_init( &c11_8way_ctx.bmw );
skein512_8way_init( &c11_8way_ctx.skein );
jh512_8way_init( &c11_8way_ctx.jh );
keccak512_8way_init( &c11_8way_ctx.keccak );
luffa_4way_init( &c11_8way_ctx.luffa, 512 );
cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 );
simd_4way_init( &c11_8way_ctx.simd, 512 );
#if defined(__VAES__)
groestl512_4way_init( &c11_8way_ctx.groestl, 64 );
shavite512_4way_init( &c11_8way_ctx.shavite );
echo_4way_init( &c11_8way_ctx.echo, 512 );
#else
init_groestl( &c11_8way_ctx.groestl, 64 );
sph_shavite512_init( &c11_8way_ctx.shavite );
init_echo( &c11_8way_ctx.echo, 512 );
#endif
}
void c11_8way_hash( void *state, const void *input )
int c11_8way_hash( void *state, const void *input, int thr_id )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
@@ -78,24 +61,19 @@ void c11_8way_hash( void *state, const void *input )
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
c11_8way_ctx_holder ctx;
memcpy( &ctx, &c11_8way_ctx, sizeof(c11_8way_ctx) );
c11_8way_context_overlay ctx;
// 1 Blake 4way
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
c11_8way_midstate );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
@@ -104,21 +82,14 @@ void c11_8way_hash( void *state, const void *input )
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
@@ -126,83 +97,56 @@ void c11_8way_hash( void *state, const void *input )
#endif
// 4 JH
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
// 5 Keccak
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
// 6 Skein
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 );
luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 );
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
#if defined(__VAES__)
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 );
#else
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
shavite512_full( &ctx.shavite, hash0, hash0, 64 );
shavite512_full( &ctx.shavite, hash1, hash1, 64 );
shavite512_full( &ctx.shavite, hash2, hash2, 64 );
shavite512_full( &ctx.shavite, hash3, hash3, 64 );
shavite512_full( &ctx.shavite, hash4, hash4, 64 );
shavite512_full( &ctx.shavite, hash5, hash5, 64 );
shavite512_full( &ctx.shavite, hash6, hash6, 64 );
shavite512_full( &ctx.shavite, hash7, hash7, 64 );
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
#endif
simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );
#if defined(__VAES__)
echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
@@ -212,29 +156,22 @@ void c11_8way_hash( void *state, const void *input )
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
echo_full( &ctx.echo, (BitSequence *)hash4, 512,
(const BitSequence *)hash4, 64 );
echo_full( &ctx.echo, (BitSequence *)hash5, 512,
(const BitSequence *)hash5, 64 );
echo_full( &ctx.echo, (BitSequence *)hash6, 512,
(const BitSequence *)hash6, 64 );
echo_full( &ctx.echo, (BitSequence *)hash7, 512,
(const BitSequence *)hash7, 64 );
#endif
@@ -246,225 +183,223 @@ void c11_8way_hash( void *state, const void *input )
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
return 1;
}
int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m512i eight = m512_const1_64( 8 );
const bool bench = opt_benchmark;
max_nonce -= 8;
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0 ) );
blake512_8way_prehash_le( &blake512_8way_ctx, c11_8way_midstate, vdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
c11_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( ( ( hash+(i<<3) )[7] <= Htarg )
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );
}
n += 8;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
do
{
if ( likely( c11_8way_hash( hash, vdata, thr_id ) ) )
for ( int lane = 0; lane < 8; lane++ )
if ( ( ( hash + ( lane << 3 ) )[7] <= targ32_d7 )
&& valid_hash( hash +( lane << 3 ), ptarget ) && !bench )
{
pdata[19] = n + lane;
submit_solution( work, hash + ( lane << 3 ), mythr );
}
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
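/* A scalar sketch of the lane filter used in the loop above: the 8 lane
   results are stored back to back, 8 32-bit words per lane, and only lanes
   whose word 7 is at or below ptarget[7] go on to the full 256-bit
   comparison (valid_hash) and possible submission.  Illustration only; the
   real path stays vectorised. */
#include <stdint.h>
#include <stdbool.h>
static inline bool lane_may_solve( const uint32_t *hash, int lane,
                                   const uint32_t *ptarget )
{
    const uint32_t *lane_hash = hash + ( lane << 3 );   /* 8 words per lane */
    return lane_hash[7] <= ptarget[7];                  /* cheap high-word test */
}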
#elif defined (C11_4WAY)
typedef struct {
union _c11_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo512_2way_context echo;
#else
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hashState_echo echo;
} c11_4way_ctx_holder;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
};
typedef union _c11_4way_context_overlay c11_4way_context_overlay;
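/* The contexts above now share a union rather than a struct: the c11 stages
   run strictly one after another, so only one hash state is live at a time
   and a single overlaid allocation is enough, replacing the old
   memcpy-from-template holder.  Minimal sketch of the idea with two
   placeholder states. */
union two_stage_overlay
{
    struct { unsigned char  state_a[256]; }     stage_a;
    struct { unsigned long long state_b[64]; }  stage_b;   /* reuses the same bytes */
};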
c11_4way_ctx_holder c11_4way_ctx;
static __thread __m256i c11_4way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
void init_c11_4way_ctx()
{
blake512_4way_init( &c11_4way_ctx.blake );
bmw512_4way_init( &c11_4way_ctx.bmw );
init_groestl( &c11_4way_ctx.groestl, 64 );
skein512_4way_init( &c11_4way_ctx.skein );
jh512_4way_init( &c11_4way_ctx.jh );
keccak512_4way_init( &c11_4way_ctx.keccak );
luffa_2way_init( &c11_4way_ctx.luffa, 512 );
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_4way_ctx.shavite );
simd_2way_init( &c11_4way_ctx.simd, 512 );
init_echo( &c11_4way_ctx.echo, 512 );
}
void c11_4way_hash( void *state, const void *input )
int c11_4way_hash( void *state, const void *input, int thr_id )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*2] __attribute__ ((aligned (64)));
uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
c11_4way_ctx_holder ctx;
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
c11_4way_context_overlay ctx;
// 1 Blake 4way
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
c11_4way_midstate );
// 2 Bmw
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
#if defined(__VAES__)
// Serial
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
// 4way
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
// 4 JH
#else
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 5 Keccak
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// 6 Skein
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
// Serial
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
// 7 Luffa
intrlv_2x128( vhash, hash0, hash1, 512 );
intrlv_2x128( vhashB, hash2, hash3, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
dintrlv_2x128( hash2, hash3, vhashB, 512 );
luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 );
luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 );
// 10 Simd
intrlv_2x128( vhash, hash0, hash1, 512 );
intrlv_2x128( vhashB, hash2, hash3, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
dintrlv_2x128( hash2, hash3, vhashB, 512 );
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB );
#else
dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB );
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
#endif
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
return 1;
}
int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
uint32_t hash[8*4] __attribute__ ((aligned (128)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m256i *noncev = (__m256i*)vdata + 9;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m256i four = m256_const1_64( 4 );
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
mm256_intrlv80_4x64( vdata, edata );
c11_4way_hash( hash, vdata );
pdata[19] = n;
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32(
0, 3, 0, 2, 0, 1, 0, 0 ) );
blake512_4way_prehash_le( &blake512_4way_ctx, c11_4way_midstate, vdata );
for ( int i = 0; i < 4; i++ )
if ( ( ( hash+(i<<3) )[7] <= Htarg )
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );
}
n += 4;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
do
{
if ( likely( c11_4way_hash( hash, vdata, thr_id ) ) )
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash + ( lane << 3 ) )[7] <= targ32_d7 )
&& valid_hash( hash +( lane << 3 ), ptarget ) && !bench )
{
pdata[19] = n + lane;
submit_solution( work, hash + ( lane << 3 ), mythr );
}
*noncev = _mm256_add_epi32( *noncev, four );
n += 4;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
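/* Sketch of the nonce handling now used in both scanhash variants above:
   lane offsets 0..7 (or 0..3) are added to the interleaved nonce word once
   before the loop, and every pass simply adds a constant vector, so no
   per-iteration byte swap or blend is needed.  Standalone AVX-512F example;
   the real code operates on the interleaved vdata in place. */
#include <immintrin.h>
#include <stdint.h>
void nonce_lane_demo( uint32_t first_nonce, int passes )
{
    /* low 32 bits of each 64-bit lane hold the nonce, high 32 bits stay 0 */
    __m512i nonces = _mm512_add_epi32( _mm512_set1_epi64( (long long)first_nonce ),
                 _mm512_set_epi32( 0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) );
    const __m512i eight = _mm512_set1_epi64( 8 );
    for ( int i = 0; i < passes; i++ )
        nonces = _mm512_add_epi32( nonces, eight );  /* lanes become n+8, n+16, ... */
    (void)nonces;
}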

View File

@@ -3,11 +3,9 @@
bool register_c11_algo( algo_gate_t* gate )
{
#if defined (C11_8WAY)
init_c11_8way_ctx();
gate->scanhash = (void*)&scanhash_c11_8way;
gate->hash = (void*)&c11_8way_hash;
#elif defined (C11_4WAY)
init_c11_4way_ctx();
gate->scanhash = (void*)&scanhash_c11_4way;
gate->hash = (void*)&c11_4way_hash;
#else

View File

@@ -14,14 +14,14 @@
bool register_c11_algo( algo_gate_t* gate );
#if defined(C11_8WAY)
void c11_8way_hash( void *state, const void *input );
int c11_8way_hash( void *state, const void *input, int thr_id );
int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_c11_8way_ctx();
//void init_c11_8way_ctx();
#elif defined(C11_4WAY)
void c11_4way_hash( void *state, const void *input );
int c11_4way_hash( void *state, const void *input, int thr_id );
int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_c11_4way_ctx();

View File

@@ -112,8 +112,9 @@ void timetravel_4way_hash(void *output, const void *input)
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way_update( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen );
// skein512_4way_update( &ctx.skein, vhashA, dataLen );
// skein512_4way_close( &ctx.skein, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
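/* skein512_4way_full() above folds the former init/update/close sequence
   into a single call.  A scalar sketch of that wrapper shape using the sph
   API already present in this tree; the 4-way version does the same on
   interleaved data. */
#include "algo/skein/sph_skein.h"
static void skein512_full_sketch( sph_skein512_context *ctx, void *out,
                                  const void *in, size_t len )
{
    sph_skein512_init( ctx );        /* fresh state on every call */
    sph_skein512( ctx, in, len );    /* absorb the whole message */
    sph_skein512_close( ctx, out );  /* finalize the 64-byte digest into out */
}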

View File

@@ -118,8 +118,9 @@ void timetravel10_4way_hash(void *output, const void *input)
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way_update( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen );
// skein512_4way_update( &ctx.skein, vhashA, dataLen );
// skein512_4way_close( &ctx.skein, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;

View File

@@ -33,9 +33,10 @@ void polytimos_4way_hash( void *output, const void *input )
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
poly_4way_context_overlay ctx;
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
skein512_4way_full( &ctx.skein, vhash, input, 80 );
// skein512_4way_init( &ctx.skein );
// skein512_4way_update( &ctx.skein, input, 80 );
// skein512_4way_close( &ctx.skein, vhash );
// Need to convert from 64 bit interleaved to 32 bit interleaved.
uint32_t vhash32[16*4];

View File

@@ -38,8 +38,10 @@ void veltor_4way_hash( void *output, const void *input )
veltor_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) );
skein512_4way_update( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// skein512_4way_update( &ctx.skein, input, 80 );
// skein512_4way_close( &ctx.skein, vhash );
skein512_4way_full( &ctx.skein, vhash, input, 80 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_shavite512( &ctx.shavite, hash0, 64 );
@@ -105,7 +107,7 @@ int scanhash_veltor_4way( struct work *work, uint32_t max_nonce,
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) && ! opt_benchmark )
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );

View File

@@ -18,6 +18,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/yespower/yespower.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -31,6 +32,9 @@
// Config
#define MINOTAUR_ALGO_COUNT 16
static const yespower_params_t minotaurx_yespower_params =
{ YESPOWER_1_0, 2048, 8, "et in arcadia ego", 17 };
typedef struct TortureNode TortureNode;
typedef struct TortureGarden TortureGarden;
@@ -59,20 +63,22 @@ struct TortureGarden
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;
struct TortureNode {
struct TortureNode
{
unsigned int algo;
TortureNode *child[2];
} nodes[22];
} __attribute__ ((aligned (64)));
// Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index
static void get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo )
static int get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;
switch (algo) {
switch ( algo )
{
case 0:
sph_blake512_init(&garden->blake);
sph_blake512(&garden->blake, input, 64);
@@ -97,14 +103,14 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
sph_echo512(&garden->echo, input, 64);
sph_echo512_close(&garden->echo, hash);
#endif
break;
break;
case 4:
#if defined(__AES__)
fugue512_full( &garden->fugue, hash, input, 64 );
#else
sph_fugue512_full( &garden->fugue, hash, input, 64 );
#endif
break;
break;
case 5:
#if defined(__AES__)
groestl512_full( &garden->groestl, (char*)hash, (char*)input, 512 );
@@ -113,7 +119,7 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
sph_groestl512(&garden->groestl, input, 64);
sph_groestl512_close(&garden->groestl, hash);
#endif
break;
break;
case 6:
sph_hamsi512_init(&garden->hamsi);
sph_hamsi512(&garden->hamsi, input, 64);
@@ -164,16 +170,20 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
sph_whirlpool(&garden->whirlpool, input, 64);
sph_whirlpool_close(&garden->whirlpool, hash);
break;
case 16: // minotaurx only, yespower hardcoded for last node
rc = yespower_tls( input, 64, &minotaurx_yespower_params,
(yespower_binary_t*)hash, thr_id );
}
memcpy(output, hash, 64);
return rc;
}
static __thread TortureGarden garden;
bool initialize_torture_garden()
{
// Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete).
// Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete).
garden.nodes[ 0].child[0] = &garden.nodes[ 1];
garden.nodes[ 0].child[1] = &garden.nodes[ 2];
@@ -219,7 +229,6 @@ bool initialize_torture_garden()
garden.nodes[20].child[1] = &garden.nodes[21];
garden.nodes[21].child[0] = NULL;
garden.nodes[21].child[1] = NULL;
return true;
}
@@ -227,38 +236,45 @@ bool initialize_torture_garden()
int minotaur_hash( void *output, const void *input, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;
// Find initial sha512 hash
sph_sha512_init( &garden.sha512 );
sph_sha512( &garden.sha512, input, 80 );
sph_sha512_close( &garden.sha512, hash );
// algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce
// if Hamsi is needed but only the first and last functions are
// currently known. Abort if either is Hamsi.
if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 )
|| ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) )
return 0;
if ( opt_algo != ALGO_MINOTAURX )
{
// algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce
// if Hamsi is needed but only the first and last functions are
// currently known. Abort if either is Hamsi.
if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 )
|| ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) )
return 0;
}
// Assign algos to torture garden nodes based on initial hash
for ( int i = 0; i < 22; i++ )
garden.nodes[i].algo = hash[i] % MINOTAUR_ALGO_COUNT;
// MinotaurX override algo for last node with yespower
if ( opt_algo == ALGO_MINOTAURX )
garden.nodes[21].algo = MINOTAUR_ALGO_COUNT;
// Send the initial hash through the torture garden
TortureNode *node = &garden.nodes[0];
while ( node )
while ( rc && node )
{
get_hash( hash, hash, &garden, node->algo );
rc = get_hash( hash, hash, &garden, node->algo, thr_id );
node = node->child[ hash[63] & 1 ];
}
memcpy( output, hash, 32 );
return 1;
return rc;
}
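/* MinotaurX sketch: node 21 is forced to algo index MINOTAUR_ALGO_COUNT,
   which get_hash() maps to yespower with the parameters declared above
   (N = 2048, r = 8, personalization "et in arcadia ego", length 17).
   Restated here as a standalone call for clarity; input and output are the
   64-byte node buffers used by the garden walk. */
static int minotaurx_final_node( void *out, const void *in, int thr_id )
{
    return yespower_tls( in, 64, &minotaurx_yespower_params,
                         (yespower_binary_t*)out, thr_id );
}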
int scanhash_minotaur( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
@@ -277,7 +293,7 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce,
edata[19] = n;
if ( likely( algo_gate.hash( hash, edata, thr_id ) ) )
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash, mythr );
@@ -291,12 +307,14 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce,
return 0;
}
// hash function has hooks for minotaurx
bool register_minotaur_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_minotaur;
gate->hash = (void*)&minotaur_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_minotaur;
gate->hash = (void*)&minotaur_hash;
gate->miner_thread_init = (void*)&initialize_torture_garden;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT;
return true;
};

View File

@@ -163,7 +163,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
bmw512_8way_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,

View File

@@ -198,7 +198,7 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
char* data;
data = (char*)malloc( 2 + strlen( denom10_str ) * 4 + 16 * 4
+ strlen( merkleroot_str ) * 3 );
+ strlen( merkleroot_str ) * 3 + 1 );
// Build the block header veildatahash in hex
sprintf( data, "%s%s%s%s%s%s%s%s%s%s%s%s",
merkleroot_str, witmerkleroot_str, "04",
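/* The "+ 1" added to the malloc above reserves space for sprintf's
   terminating NUL.  A self-sizing sketch of the same pattern with snprintf;
   the two-field format is a placeholder, not the real veildatahash layout. */
#include <stdio.h>
#include <stdlib.h>
static char *concat_hex_fields( const char *a, const char *b )
{
    int need = snprintf( NULL, 0, "%s%s", a, b );   /* length excluding the NUL */
    char *out = malloc( (size_t)need + 1 );         /* + 1 for the NUL */
    if ( out ) snprintf( out, (size_t)need + 1, "%s%s", a, b );
    return out;
}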

View File

@@ -31,7 +31,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}
@@ -85,7 +85,7 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}

View File

@@ -257,6 +257,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
const __m512i eight = m512_const1_64( 8 );
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
@@ -264,10 +265,8 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = mm512_intrlv_blend_32( *noncev,
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
0, n+3, 0, n+2, 0, n+1, 0, n ) );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
do
@@ -279,7 +278,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) ) )
{
pdata[19] = n + lane;
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
@@ -291,8 +290,6 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(X17_4WAY)
union _x17_4way_context_overlay
@@ -322,6 +319,9 @@ union _x17_4way_context_overlay
};
typedef union _x17_4way_context_overlay x17_4way_context_overlay;
static __thread __m256i x17_4way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
int x17_4way_hash( void *state, const void *input, int thr_id )
{
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
@@ -333,7 +333,10 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
uint64_t hash3[8] __attribute__ ((aligned (32)));
x17_4way_context_overlay ctx;
blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
x17_4way_midstate );
// blake512_4way_full( &ctx.blake, vhash, input, 80 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
@@ -449,4 +452,55 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
return 1;
}
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash32[8*4] __attribute__ ((aligned (128)));
uint32_t vdata[20*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m128i edata[5] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *hash32_d7 = &(hash32[7*4]);
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m256i four = m256_const1_64( 4 );
const bool bench = opt_benchmark;
// convert LE32 to LE64
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );
blake512_4way_prehash_le( &blake512_4way_ctx, x17_4way_midstate, vdata );
do
{
if ( likely( x17_4way_hash( hash32, vdata, thr_id ) ) )
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, four );
n += 4;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
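/* The edata[] preparation above ("convert LE32 to LE64") swaps the two
   32-bit words inside every 64-bit lane so the 80-byte header can be fed to
   the 64-bit interleave and the blake512 prehash in little-endian 64-bit
   form.  A standalone SSE2 sketch of that swap, assuming mm128_swap64_32 is
   equivalent to a 32-bit shuffle within each 64-bit element. */
#include <emmintrin.h>
static inline __m128i swap64_32_sketch( __m128i v )
{
    /* reorder 32-bit elements 0,1,2,3 -> 1,0,3,2 */
    return _mm_shuffle_epi32( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}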

View File

@@ -6,7 +6,8 @@ bool register_x17_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x17_8way;
gate->hash = (void*)&x17_8way_hash;
#elif defined (X17_4WAY)
gate->scanhash = (void*)&scanhash_4way_64in_32out;
gate->scanhash = (void*)&scanhash_x17_4way;
// gate->scanhash = (void*)&scanhash_4way_64in_32out;
gate->hash = (void*)&x17_4way_hash;
#else
gate->hash = (void*)&x17_hash;

View File

@@ -1,34 +0,0 @@
#include "x20r-gate.h"
void getAlgoString( const uint8_t* prevblock, char *output )
{
char *sptr = outpuit;
for ( int j = 0; j < X20R_HASH_FUNC_COUNT; j++ )
{
char b = (19 - j) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
}
bool register_x20r_algo( algo_gate_t* gate )
{
#if defined (X20R_4WAY)
gate->scanhash = (void*)&scanhash_x20r_4way;
gate->hash = (void*)&x20r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x20r;
gate->hash = (void*)&x20r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
x20_r_s_getAlgoString = (void*)&x20r_getAlgoString;
opt_target_factor = 256.;
return true;
};

View File

@@ -1,58 +0,0 @@
#ifndef X20R_GATE_H__
#define X20R_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
/*
#if defined(__AVX2__) && defined(__AES__)
#define X20R_4WAY
#endif
*/
enum x20r_Algo {
BLAKE = 0,
BMW,
GROESTL,
JH,
KECCAK,
SKEIN,
LUFFA,
CUBEHASH,
SHAVITE,
SIMD,
ECHO,
HAMSI,
FUGUE,
SHABAL,
WHIRLPOOL,
SHA_512,
HAVAL, // 256-bits output
GOST,
RADIOGATUN, // 256-bits output
PANAMA, // 256-bits output
X20R_HASH_FUNC_COUNT
};
void (*x20_r_s_getAlgoString) ( const uint8_t*, char* );
void x20r_getAlgoString( const uint8_t* prevblock, char *output );
bool register_xi20r_algo( algo_gate_t* gate );
#if defined(X20R_4WAY)
void x20r_4way_hash( void *state, const void *input );
int scanhash_x20r_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void x20rhash( void *state, const void *input );
int scanhash_x20r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -1,252 +0,0 @@
#include "x20r-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/radiogatun/sph_radiogatun.h"
#include "algo/panama/sph_panama.h"
#include "algo/gost/sph_gost.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X20R_HASH_FUNC_COUNT + 1] = { 0 };
union _x20r_context_overlay
{
sph_blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
hashState_sd simd;
sph_shavite512_context shavite;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;
sph_haval256_5_context haval;
sph_gost512_context gost;
sph_radiogatun64_context radiogatun;
sph_panama_context panama;
};
typedef union _x20r_context_overlay x20r_context_overlay;
void x20r_hash(void* output, const void* input)
{
uint32_t _ALIGN(128) hash[64/4];
x20r_context_overlay ctx;
void *in = (void*) input;
int size = 80;
if ( s_ntime == UINT32_MAX )
{
const uint8_t* in8 = (uint8_t*) input;
x20_r_s_getAlgoString(&in8[4], hashOrder);
}
for (int i = 0; i < 20; i++)
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
sph_blake512_init(&ctx.blake);
sph_blake512(&ctx.blake, in, size);
sph_blake512_close(&ctx.blake, hash);
break;
case BMW:
sph_bmw512_init(&ctx.bmw);
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init(&ctx.groestl);
sph_groestl512(&ctx.groestl, in, size);
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init(&ctx.skein);
sph_skein512(&ctx.skein, in, size);
sph_skein512_close(&ctx.skein, hash);
break;
case JH:
sph_jh512_init(&ctx.jh);
sph_jh512(&ctx.jh, in, size);
sph_jh512_close(&ctx.jh, hash);
break;
case KECCAK:
sph_keccak512_init(&ctx.keccak);
sph_keccak512(&ctx.keccak, in, size);
sph_keccak512_close(&ctx.keccak, hash);
break;
case LUFFA:
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in, size );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)in, size );
break;
case SHAVITE:
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, in, size);
sph_shavite512_close(&ctx.shavite, hash);
break;
case SIMD:
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)in, size<<3 );
#else
sph_echo512_init(&ctx.echo);
sph_echo512(&ctx.echo, in, size);
sph_echo512_close(&ctx.echo, hash);
#endif
break;
case HAMSI:
sph_hamsi512_init(&ctx.hamsi);
sph_hamsi512(&ctx.hamsi, in, size);
sph_hamsi512_close(&ctx.hamsi, hash);
break;
case FUGUE:
sph_fugue512_init(&ctx.fugue);
sph_fugue512(&ctx.fugue, in, size);
sph_fugue512_close(&ctx.fugue, hash);
break;
case SHABAL:
sph_shabal512_init(&ctx.shabal);
sph_shabal512(&ctx.shabal, in, size);
sph_shabal512_close(&ctx.shabal, hash);
break;
case WHIRLPOOL:
sph_whirlpool_init(&ctx.whirlpool);
sph_whirlpool(&ctx.whirlpool, in, size);
sph_whirlpool_close(&ctx.whirlpool, hash);
break;
case SHA_512:
sph_sha512_Init( &ctx.sha512 );
sph_sha512( &ctx.sha512, in, size );
sph_sha512_close( &ctx.sha512, hash );
break;
case HAVAL:
sph_haval256_5_init(&ctx.haval);
sph_haval256_5(&ctx.haval, in, size);
sph_haval256_5_close(&ctx.haval, hash);
memset(&hash[8], 0, 32);
break;
case GOST:
sph_gost512_init(&ctx.gost);
sph_gost512(&ctx.gost, in, size);
sph_gost512_close(&ctx.gost, hash);
break;
case RADIOGATUN:
sph_radiogatun64_init(&ctx.radiogatun);
sph_radiogatun64(&ctx.radiogatun, in, size);
sph_radiogatun64_close(&ctx.radiogatun, hash);
memset(&hash[8], 0, 32);
break;
case PANAMA:
sph_panama_init(&ctx.panama);
sph_panama(&ctx.panama, in, size);
sph_panama_close(&ctx.panama, hash);
memset(&hash[8], 0, 32);
break;
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
int scanhash_x20r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for (int k=0; k < 19; k++)
be32enc( &endiandata[k], pdata[k] );
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x20_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
do {
be32enc( &endiandata[19], nonce );
x20r_hash( hash32, endiandata );
if ( hash32[7] <= Htarg && fulltest( hash32, ptarget ) )
{
pdata[19] = nonce;
submit_solution( work, hash32, mythr );
}
nonce++;
} while (nonce < max_nonce && !(*restart));
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -581,10 +581,8 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = mm512_intrlv_blend_32( *noncev,
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
0, n+3, 0, n+2, 0, n+1, 0, n ) );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0 ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata );
do
@@ -941,9 +939,8 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );
*noncev = mm256_intrlv_blend_32( *noncev,
_mm256_set_epi32( 0, n+3, 0, n+2, 0, n+1, 0, n ) );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32(
0, 3, 0, 2, 0, 1, 0, 0 ) );
blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata );
do

View File

@@ -1,5 +0,0 @@
#ifdef __SSE2__
#include "yescrypt-simd.c"
#else
#include "yescrypt-opt.c"
#endif

View File

@@ -1,213 +0,0 @@
/*-
* Copyright 2013,2014 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifdef MAP_ANON
#include <sys/mman.h>
#endif
#include "yescrypt.h"
#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024)
#ifdef __x86_64__
#define HUGEPAGE_SIZE (2 * 1024 * 1024)
#else
#undef HUGEPAGE_SIZE
#endif
/*
static __inline uint32_t
le32dec(const void *pp)
{
const uint8_t *p = (uint8_t const *)pp;
return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
}
static __inline void
le32enc(void *pp, uint32_t x)
{
uint8_t * p = (uint8_t *)pp;
p[0] = x & 0xff;
p[1] = (x >> 8) & 0xff;
p[2] = (x >> 16) & 0xff;
p[3] = (x >> 24) & 0xff;
}
*/
static void *
alloc_region(yescrypt_region_t * region, size_t size)
{
size_t base_size = size;
uint8_t * base, * aligned;
#ifdef MAP_ANON
int flags =
#ifdef MAP_NOCORE
MAP_NOCORE |
#endif
MAP_ANON | MAP_PRIVATE;
#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE)
size_t new_size = size;
const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1;
if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) {
flags |= MAP_HUGETLB;
/*
* Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of
* huge page size, so let's round up to huge page size here.
*/
new_size = size + hugepage_mask;
new_size &= ~hugepage_mask;
}
base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0);
if (base != MAP_FAILED) {
base_size = new_size;
} else
if (flags & MAP_HUGETLB) {
flags &= ~MAP_HUGETLB;
base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
}
#else
base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
#endif
if (base == MAP_FAILED)
base = NULL;
aligned = base;
#elif defined(HAVE_POSIX_MEMALIGN)
if ((errno = posix_memalign((void **)&base, 64, size)) != 0)
base = NULL;
aligned = base;
#else
base = aligned = NULL;
if (size + 63 < size) {
errno = ENOMEM;
} else if ((base = malloc(size + 63)) != NULL) {
aligned = base + 63;
aligned -= (uintptr_t)aligned & 63;
}
#endif
region->base = base;
region->aligned = aligned;
region->base_size = base ? base_size : 0;
region->aligned_size = base ? size : 0;
return aligned;
}
static __inline void
init_region(yescrypt_region_t * region)
{
region->base = region->aligned = NULL;
region->base_size = region->aligned_size = 0;
}
static int
free_region(yescrypt_region_t * region)
{
if (region->base) {
#ifdef MAP_ANON
if (munmap(region->base, region->base_size))
return -1;
#else
free(region->base);
#endif
}
init_region(region);
return 0;
}
int yescrypt_init_shared(yescrypt_shared_t * shared, const uint8_t * param, size_t paramlen,
uint64_t N, uint32_t r, uint32_t p, yescrypt_init_shared_flags_t flags, uint32_t mask,
uint8_t * buf, size_t buflen)
{
yescrypt_shared1_t* shared1 = &shared->shared1;
yescrypt_shared_t dummy, half1, half2;
uint8_t salt[32];
if (flags & YESCRYPT_SHARED_PREALLOCATED) {
if (!shared1->aligned || !shared1->aligned_size)
return -1;
} else {
init_region(shared1);
}
shared->mask1 = 1;
if (!param && !paramlen && !N && !r && !p && !buf && !buflen)
return 0;
init_region(&dummy.shared1);
dummy.mask1 = 1;
if (yescrypt_kdf(&dummy, shared1,
param, paramlen, NULL, 0, N, r, p, 0,
YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
salt, sizeof(salt), 0 ) )
goto out;
half1 = half2 = *shared;
half1.shared1.aligned_size /= 2;
half2.shared1.aligned = (void*) ((size_t)half2.shared1.aligned + half1.shared1.aligned_size);
half2.shared1.aligned_size = half1.shared1.aligned_size;
N /= 2;
if (p > 1 && yescrypt_kdf(&half1, &half2.shared1,
param, paramlen, salt, sizeof(salt), N, r, p, 0,
YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2,
salt, sizeof(salt), 0 ))
goto out;
if (yescrypt_kdf(&half2, &half1.shared1,
param, paramlen, salt, sizeof(salt), N, r, p, 0,
YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
salt, sizeof(salt), 0))
goto out;
if (yescrypt_kdf(&half1, &half2.shared1,
param, paramlen, salt, sizeof(salt), N, r, p, 0,
YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
buf, buflen, 0))
goto out;
shared->mask1 = mask;
return 0;
out:
if (!(flags & YESCRYPT_SHARED_PREALLOCATED))
free_region(shared1);
return -1;
}
int
yescrypt_free_shared(yescrypt_shared_t * shared)
{
return free_region(&shared->shared1);
}
int
yescrypt_init_local(yescrypt_local_t * local)
{
init_region(local);
return 0;
}
int
yescrypt_free_local(yescrypt_local_t * local)
{
return free_region(local);
}

File diff suppressed because it is too large

View File

@@ -1,488 +0,0 @@
/*-
* Copyright 2013,2014 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "compat.h"
#include "yescrypt.h"
#include "algo/sha/hmac-sha256-hash.h"
#include "algo-gate-api.h"
#define BYTES2CHARS(bytes) \
((((bytes) * 8) + 5) / 6)
#define HASH_SIZE 32 /* bytes */
#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */
#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM)
static const char * const itoa64 =
"./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
static uint8_t* encode64_uint32(uint8_t* dst, size_t dstlen, uint32_t src, uint32_t srcbits)
{
uint32_t bit;
for (bit = 0; bit < srcbits; bit += 6) {
if (dstlen < 1)
return NULL;
*dst++ = itoa64[src & 0x3f];
dstlen--;
src >>= 6;
}
return dst;
}
static uint8_t* encode64(uint8_t* dst, size_t dstlen, const uint8_t* src, size_t srclen)
{
size_t i;
for (i = 0; i < srclen; ) {
uint8_t * dnext;
uint32_t value = 0, bits = 0;
do {
value |= (uint32_t)src[i++] << bits;
bits += 8;
} while (bits < 24 && i < srclen);
dnext = encode64_uint32(dst, dstlen, value, bits);
if (!dnext)
return NULL;
dstlen -= dnext - dst;
dst = dnext;
}
return dst;
}
static int decode64_one(uint32_t* dst, uint8_t src)
{
const char * ptr = strchr(itoa64, src);
if (ptr) {
*dst = (uint32_t) (ptr - itoa64);
return 0;
}
*dst = 0;
return -1;
}
static const uint8_t* decode64_uint32(uint32_t* dst, uint32_t dstbits, const uint8_t* src)
{
uint32_t bit;
uint32_t value;
value = 0;
for (bit = 0; bit < dstbits; bit += 6) {
uint32_t one;
if (decode64_one(&one, *src)) {
*dst = 0;
return NULL;
}
src++;
value |= one << bit;
}
*dst = value;
return src;
}
uint8_t* yescrypt_r(const yescrypt_shared_t* shared, yescrypt_local_t* local,
const uint8_t* passwd, size_t passwdlen, const uint8_t* setting,
uint8_t* buf, size_t buflen, int thrid )
{
uint8_t hash[HASH_SIZE];
const uint8_t * src, * salt;
uint8_t * dst;
size_t prefixlen, saltlen, need;
uint8_t version;
uint64_t N;
uint32_t r, p;
yescrypt_flags_t flags = YESCRYPT_WORM;
printf("pass1 ...");
fflush(stdout);
if (setting[0] != '$' || setting[1] != '7') {
printf("died$7 ...");
fflush(stdout);
return NULL;
}
printf("died80 ...");
fflush(stdout);
src = setting + 2;
printf("hello '%p'\n", (char *)src);
fflush(stdout);
switch ((version = *src)) {
case '$':
printf("died2 ...");
fflush(stdout);
break;
case 'X':
src++;
flags = YESCRYPT_RW;
printf("died3 ...");
fflush(stdout);
break;
default:
printf("died4 ...");
fflush(stdout);
return NULL;
}
printf("pass2 ...");
fflush(stdout);
if (*src != '$') {
uint32_t decoded_flags;
if (decode64_one(&decoded_flags, *src)) {
printf("died5 ...");
fflush(stdout);
return NULL;
}
flags = decoded_flags;
if (*++src != '$') {
printf("died6 ...");
fflush(stdout);
return NULL;
}
}
src++;
{
uint32_t N_log2;
if (decode64_one(&N_log2, *src)) {
printf("died7 ...");
return NULL;
}
src++;
N = (uint64_t)1 << N_log2;
}
src = decode64_uint32(&r, 30, src);
if (!src) {
printf("died6 ...");
return NULL;
}
src = decode64_uint32(&p, 30, src);
if (!src) {
printf("died7 ...");
return NULL;
}
prefixlen = src - setting;
salt = src;
src = (uint8_t *)strrchr((char *)salt, '$');
if (src)
saltlen = src - salt;
else
saltlen = strlen((char *)salt);
need = prefixlen + saltlen + 1 + HASH_LEN + 1;
if (need > buflen || need < saltlen) {
printf("'%d %d %d'", (int) need, (int) buflen, (int) saltlen);
printf("died8killbuf ...");
fflush(stdout);
return NULL;
}
if ( yescrypt_kdf( shared, local, passwd, passwdlen, salt, saltlen, N, r, p,
0, flags, hash, sizeof(hash), thrid ) == -1 )
{
printf("died10 ...");
fflush(stdout);
return NULL;
}
dst = buf;
memcpy(dst, setting, prefixlen + saltlen);
dst += prefixlen + saltlen;
*dst++ = '$';
dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash));
/* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its
* memory allocations yet anyway. */
if (!dst || dst >= buf + buflen) { /* Can't happen */
printf("died11 ...");
return NULL;
}
*dst = 0; /* NUL termination */
printf("died12 ...");
fflush(stdout);
return buf;
}
uint8_t* yescrypt(const uint8_t* passwd, const uint8_t* setting, int thrid )
{
static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1];
yescrypt_shared_t shared;
yescrypt_local_t local;
uint8_t * retval;
if (yescrypt_init_shared(&shared, NULL, 0,
0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
return NULL;
if (yescrypt_init_local(&local)) {
yescrypt_free_shared(&shared);
return NULL;
}
retval = yescrypt_r(&shared, &local,
passwd, 80, setting, buf, sizeof(buf), thrid );
//printf("hashse='%s'\n", (char *)retval);
if (yescrypt_free_local(&local)) {
yescrypt_free_shared(&shared);
return NULL;
}
if (yescrypt_free_shared(&shared))
return NULL;
return retval;
}
uint8_t* yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags,
const uint8_t* src, size_t srclen, uint8_t* buf, size_t buflen)
{
uint8_t * dst;
size_t prefixlen = 3 + 1 + 5 + 5;
size_t saltlen = BYTES2CHARS(srclen);
size_t need;
if (p == 1)
flags &= ~YESCRYPT_PARALLEL_SMIX;
if (flags) {
if (flags & ~0x3f)
return NULL;
prefixlen++;
if (flags != YESCRYPT_RW)
prefixlen++;
}
need = prefixlen + saltlen + 1;
if (need > buflen || need < saltlen || saltlen < srclen)
return NULL;
if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30)))
return NULL;
dst = buf;
*dst++ = '$';
*dst++ = '7';
if (flags) {
*dst++ = 'X'; /* eXperimental, subject to change */
if (flags != YESCRYPT_RW)
*dst++ = itoa64[flags];
}
*dst++ = '$';
*dst++ = itoa64[N_log2];
dst = encode64_uint32(dst, buflen - (dst - buf), r, 30);
if (!dst) /* Can't happen */
return NULL;
dst = encode64_uint32(dst, buflen - (dst - buf), p, 30);
if (!dst) /* Can't happen */
return NULL;
dst = encode64(dst, buflen - (dst - buf), src, srclen);
if (!dst || dst >= buf + buflen) /* Can't happen */
return NULL;
*dst = 0; /* NUL termination */
return buf;
}
uint8_t* yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags,
const uint8_t * src, size_t srclen)
{
static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1];
return yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen,
buf, sizeof(buf));
}
static int yescrypt_bsty(const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
uint8_t * buf, size_t buflen, int thrid )
{
static __thread int initialized = 0;
static __thread yescrypt_shared_t shared;
static __thread yescrypt_local_t local;
int retval;
if (!initialized) {
/* "shared" could in fact be shared, but it's simpler to keep it private
* along with "local". It's dummy and tiny anyway. */
if (yescrypt_init_shared(&shared, NULL, 0,
0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
return -1;
if (yescrypt_init_local(&local)) {
yescrypt_free_shared(&shared);
return -1;
}
initialized = 1;
}
retval = yescrypt_kdf(&shared, &local,
passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS,
buf, buflen, thrid );
#if 0
if (yescrypt_free_local(&local)) {
yescrypt_free_shared(&shared);
return -1;
}
if (yescrypt_free_shared(&shared))
return -1;
initialized = 0;
#endif
return retval;
}
// scrypt parameters initialized at run time.
uint64_t YESCRYPT_N;
uint32_t YESCRYPT_R;
uint32_t YESCRYPT_P;
char *yescrypt_client_key = NULL;
int yescrypt_client_key_len = 0;
/* main hash 80 bytes input */
int yescrypt_hash( const char *input, char *output, uint32_t len, int thrid )
{
return yescrypt_bsty( (uint8_t*)input, len, (uint8_t*)input, len, YESCRYPT_N,
YESCRYPT_R, YESCRYPT_P, (uint8_t*)output, 32, thrid );
}
/* for util.c test */
int yescrypthash(void *output, const void *input, int thrid)
{
return yescrypt_hash((char*) input, (char*) output, 80, thrid);
}
int scanhash_yescrypt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
uint32_t n = first_nonce;
int thr_id = mythr->id;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
endiandata[19] = n;
do {
if ( yescrypt_hash((char*) endiandata, (char*) vhash, 80, thr_id ) )
if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )
{
be32enc( pdata+19, n );
submit_solution( work, vhash, mythr );
}
endiandata[19] = ++n;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}
void yescrypt_gate_base(algo_gate_t *gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
opt_target_factor = 65536.0;
}
bool register_yescrypt_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
if ( opt_param_n ) YESCRYPT_N = opt_param_n;
else YESCRYPT_N = 2048;
if ( opt_param_r ) YESCRYPT_R = opt_param_r;
else YESCRYPT_R = 8;
if ( opt_param_key )
{
yescrypt_client_key = opt_param_key;
yescrypt_client_key_len = strlen( opt_param_key );
}
else
{
yescrypt_client_key = NULL;
yescrypt_client_key_len = 0;
}
YESCRYPT_P = 1;
applog( LOG_NOTICE,"Yescrypt parameters: N= %d, R= %d", YESCRYPT_N,
YESCRYPT_R );
if ( yescrypt_client_key )
applog( LOG_NOTICE,"Key= \"%s\"\n", yescrypt_client_key );
return true;
}
bool register_yescryptr8_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
yescrypt_client_key = "Client Key";
yescrypt_client_key_len = 10;
YESCRYPT_N = 2048;
YESCRYPT_R = 8;
YESCRYPT_P = 1;
return true;
}
bool register_yescryptr16_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
yescrypt_client_key = "Client Key";
yescrypt_client_key_len = 10;
YESCRYPT_N = 4096;
YESCRYPT_R = 16;
YESCRYPT_P = 1;
return true;
}
bool register_yescryptr32_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
yescrypt_client_key = "WaviBanana";
yescrypt_client_key_len = 10;
YESCRYPT_N = 4096;
YESCRYPT_R = 32;
YESCRYPT_P = 1;
return true;
}

View File

@@ -1,382 +0,0 @@
/*-
* Copyright 2009 Colin Percival
* Copyright 2013,2014 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#ifndef YESCRYPT_H
#define YESCRYPT_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include <stdlib.h> /* for size_t */
#include <stdbool.h>
#include "miner.h"
//#define __SSE4_1__
int yescrypt_hash(const char* input, char* output, uint32_t len, int thrid );
int yescrypthash(void *output, const void *input, int thrid );
/**
* crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen):
* Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
* p, buflen) and write the result into buf. The parameters r, p, and buflen
* must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N
* must be a power of 2 greater than 1.
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as buf is local to the thread.
*/
extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen,
const uint8_t * __salt, size_t __saltlen,
uint64_t __N, uint32_t __r, uint32_t __p,
uint8_t * __buf, size_t __buflen);
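/* Illustrative sketch only (not part of this header): a minimal call to
 * crypto_scrypt() as documented above. The parameter values are assumptions
 * chosen to satisfy the stated constraints (N a power of 2 > 1, r*p < 2^30).
 *
 *   uint8_t dk[32];
 *   int rc = crypto_scrypt( (const uint8_t*)"password", 8,
 *                           (const uint8_t*)"NaCl", 4,
 *                           1024, 8, 1,           // N, r, p
 *                           dk, sizeof(dk) );     // 32 byte derived key
 *   // rc == 0 on success, -1 on error
 */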
/**
* Internal type used by the memory allocator. Please do not use it directly.
* Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since
* they might differ from each other in a future version.
*/
typedef struct {
void * base, * aligned;
size_t base_size, aligned_size;
} yescrypt_region_t;
/**
* Types for shared (ROM) and thread-local (RAM) data structures.
*/
typedef yescrypt_region_t yescrypt_shared1_t;
typedef struct {
yescrypt_shared1_t shared1;
uint32_t mask1;
} yescrypt_shared_t;
typedef yescrypt_region_t yescrypt_local_t;
/**
* Possible values for yescrypt_init_shared()'s flags argument.
*/
typedef enum {
YESCRYPT_SHARED_DEFAULTS = 0,
YESCRYPT_SHARED_PREALLOCATED = 0x100
} yescrypt_init_shared_flags_t;
/**
* Possible values for the flags argument of yescrypt_kdf(),
* yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together,
* except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive.
* Please refer to the description of yescrypt_kdf() below for the meaning of
* these flags.
*/
typedef enum {
/* public */
YESCRYPT_WORM = 0,
YESCRYPT_RW = 1,
YESCRYPT_PARALLEL_SMIX = 2,
YESCRYPT_PWXFORM = 4,
/* private */
__YESCRYPT_INIT_SHARED_1 = 0x10000,
__YESCRYPT_INIT_SHARED_2 = 0x20000,
__YESCRYPT_INIT_SHARED = 0x30000
} yescrypt_flags_t;
extern char *yescrypt_client_key;
extern int yescrypt_client_key_len;
#define YESCRYPT_KNOWN_FLAGS \
(YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \
__YESCRYPT_INIT_SHARED)
/**
* yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask,
* buf, buflen):
* Optionally allocate memory for and initialize the shared (ROM) data
* structure. The parameters N, r, and p must satisfy the same conditions as
* with crypto_scrypt(). param and paramlen specify a local parameter with
* which the ROM is seeded. If buf is not NULL, then it is used to return
* buflen bytes of message digest for the initialized ROM (the caller may use
* this to verify that the ROM has been computed in the same way that it was on
* a previous run).
*
* Return 0 on success; or -1 on error.
*
* If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the
* ROM is assumed to have been preallocated by the caller, with
* shared->shared1.aligned being the start address of the ROM and
* shared->shared1.aligned_size being its size (which must be consistent with
* N, r, and p). This may be used e.g. when the ROM is to be placed in a SysV
* shared memory segment allocated by the caller.
*
* mask controls the frequency of ROM accesses by yescrypt_kdf(). Normally it
* should be set to 1, to interleave RAM and ROM accesses, which works well
* when both regions reside in the machine's RAM anyway. Other values may be
* used e.g. when the ROM is memory-mapped from a disk file. Recommended mask
* values are powers of 2 minus 1 or minus 2. Here's the effect of some mask
* values:
* mask value   ROM accesses in SMix 1st loop   ROM accesses in SMix 2nd loop
* 0            0                               1/2
* 1            1/2                             1/2
* 2            0                               1/4
* 3            1/4                             1/4
* 6            0                               1/8
* 7            1/8                             1/8
* 14           0                               1/16
* 15           1/16                            1/16
* 1022         0                               1/1024
* 1023         1/1024                          1/1024
*
* Actual computation of the ROM contents may be avoided, if you don't intend
* to use a ROM but need a dummy shared structure, by calling this function
* with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the
* arguments starting with param and on.
*
* MT-safe as long as shared is local to the thread.
*/
extern int yescrypt_init_shared(yescrypt_shared_t * __shared,
const uint8_t * __param, size_t __paramlen,
uint64_t __N, uint32_t __r, uint32_t __p,
yescrypt_init_shared_flags_t __flags, uint32_t __mask,
uint8_t * __buf, size_t __buflen);
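/* Illustrative sketch only: building a dummy shared structure with no ROM,
 * exactly as the comment above describes (the arguments from "param" onward
 * are NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0).
 *
 *   yescrypt_shared_t shared;
 *   if ( yescrypt_init_shared( &shared, NULL, 0, 0, 0, 0,
 *                              YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 ) )
 *       // handle error (-1)
 *   ...
 *   yescrypt_free_shared( &shared );
 */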
/**
* yescrypt_free_shared(shared):
* Free memory that had been allocated with yescrypt_init_shared().
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as shared is local to the thread.
*/
extern int yescrypt_free_shared(yescrypt_shared_t * __shared);
/**
* yescrypt_init_local(local):
* Initialize the thread-local (RAM) data structure. Actual memory allocation
* is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r().
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as local is local to the thread.
*/
extern int yescrypt_init_local(yescrypt_local_t * __local);
/**
* yescrypt_free_local(local):
* Free memory that may have been allocated for an initialized thread-local
* (RAM) data structure.
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as local is local to the thread.
*/
extern int yescrypt_free_local(yescrypt_local_t * __local);
/**
* yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
* N, r, p, t, flags, buf, buflen):
* Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
* p, buflen), or a revision of scrypt as requested by flags and shared, and
* write the result into buf. The parameters N, r, p, and buflen must satisfy
* the same conditions as with crypto_scrypt(). t controls computation time
* while not affecting peak memory usage. shared and flags may request
* special modes as described below. local is the thread-local data
* structure, allowing to preserve and reuse a memory allocation across calls,
* thereby reducing its overhead.
*
* Return 0 on success; or -1 on error.
*
* t controls computation time. t = 0 is optimal in terms of achieving the
* highest area-time for ASIC attackers. Thus, higher computation time, if
* affordable, is best achieved by increasing N rather than by increasing t.
* However, if the higher memory usage (which goes along with higher N) is not
* affordable, or if fine-tuning of the time is needed (recall that N must be a
* power of 2), then t = 1 or above may be used to increase time while staying
* at the same peak memory usage. t = 1 increases the time by 25% and
* decreases the normalized area-time to 96% of optimal. (Of course, in
* absolute terms the area-time increases with higher t. It's just that it
* would increase slightly more with higher N*r rather than with higher t.)
* t = 2 increases the time by another 20% and decreases the normalized
* area-time to 89% of optimal. Thus, these two values are reasonable to use
* for fine-tuning. Values of t higher than 2 result in further increase in
* time while reducing the efficiency much further (e.g., down to around 50% of
* optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact
* numbers varying by the flags settings).
*
* Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and
* passing a dummy shared structure (see the description of
* yescrypt_init_shared() above for how to produce one). In this mode, the
* thread-local memory region (RAM) is first sequentially written to and then
* randomly read from. This algorithm is friendly towards time-memory
* tradeoffs (TMTO), available both to defenders (albeit not in this
* implementation) and to attackers.
*
* Setting YESCRYPT_RW adds extra random reads and writes to the thread-local
* memory region (RAM), which makes TMTO a lot less efficient. This may be
* used to slow down the kinds of attackers who would otherwise benefit from
* classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not
* only for the tradeoff, but also for a decrease of attacker's area-time (by
* up to a constant factor), setting YESCRYPT_RW substantially increases the
* cost of attacks in area-time terms as well. Yet another benefit of it is
* that optimal area-time is reached at an earlier time than with classic
* scrypt, and t = 0 actually corresponds to this earlier completion time,
* resulting in quicker hash computations (and thus in higher request rate
* capacity). Due to these properties, YESCRYPT_RW should almost always be
* set, except when compatibility with classic scrypt or TMTO-friendliness are
* desired.
*
* YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a
* lower level as compared to where it is in classic scrypt. This reduces
* flexibility for efficient computation (for both attackers and defenders) by
* requiring that, short of resorting to TMTO, the full amount of memory be
* allocated as needed for the specified p, regardless of whether that
* parallelism is actually being fully made use of or not. (For comparison, a
* single instance of classic scrypt may be computed in less memory without any
* CPU time overhead, but in more real time, by not making full use of the
* parallelism.) This may be desirable when the defender has enough memory
* with sufficiently low latency and high bandwidth for efficient full parallel
* execution, yet the required memory size is high enough that some likely
* attackers might end up being forced to choose between using higher latency
* memory than they could use otherwise (waiting for data longer) or using TMTO
* (waiting for data more times per one hash computation). The area-time cost
* for other kinds of attackers (who would use the same memory type and TMTO
* factor or no TMTO either way) remains roughly the same, given the same
* running time for the defender. In the TMTO-friendly YESCRYPT_WORM mode, as
* long as the defender has enough memory that is just as fast as the smaller
* per-thread regions would be, doesn't expect to ever need greater
* flexibility (except possibly via TMTO), and doesn't need backwards
* compatibility with classic scrypt, there are no other serious drawbacks to
* this setting. In the YESCRYPT_RW mode, which is meant to discourage TMTO,
* this new approach to parallelization makes TMTO less inefficient. (This is
* an unfortunate side-effect of avoiding some random writes, as we have to in
* order to allow for parallel threads to access a common memory region without
* synchronization overhead.) Thus, in this mode this setting poses an extra
* tradeoff of its own (higher area-time cost for a subset of attackers vs.
* better TMTO resistance). Setting YESCRYPT_PARALLEL_SMIX also changes the
* way the running time is to be controlled from N*r*p (for classic scrypt) to
* N*r (in this modification). All of this applies only when p > 1. For
* p = 1, this setting is a no-op.
*
* Passing a real shared structure, with ROM contents previously computed by
* yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for
* the thread-local RAM region. In order to allow for initialization of the
* ROM to be split into a separate program, the shared->shared1.aligned and
* shared->shared1.aligned_size fields may be set by the caller of
* yescrypt_kdf() manually rather than with yescrypt_init_shared().
*
* local must be initialized with yescrypt_init_local().
*
* MT-safe as long as local and buf are local to the thread.
*/
extern int yescrypt_kdf(const yescrypt_shared_t * __shared,
yescrypt_local_t * __local,
const uint8_t * __passwd, size_t __passwdlen,
const uint8_t * __salt, size_t __saltlen,
uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t,
yescrypt_flags_t __flags,
uint8_t * __buf, size_t __buflen, int thrid);
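/* Illustrative sketch only: classic scrypt through yescrypt_kdf(), i.e.
 * t = 0, flags = YESCRYPT_WORM and a dummy shared structure, as described
 * above. N, r, p and the thrid value are assumptions for the example.
 *
 *   yescrypt_shared_t shared;   // dummy, see yescrypt_init_shared() above
 *   yescrypt_local_t  local;
 *   uint8_t dk[32];
 *   yescrypt_init_shared( &shared, NULL, 0, 0, 0, 0,
 *                         YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 );
 *   yescrypt_init_local( &local );
 *   yescrypt_kdf( &shared, &local,
 *                 (const uint8_t*)"password", 8,
 *                 (const uint8_t*)"salt", 4,
 *                 1024, 8, 1, 0, YESCRYPT_WORM,
 *                 dk, sizeof(dk), 0 );
 *   yescrypt_free_local( &local );
 *   yescrypt_free_shared( &shared );
 */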
/**
* yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen):
* Compute and encode an scrypt or enhanced scrypt hash of passwd given the
* parameters and salt value encoded in setting. If the shared structure is
* not dummy, a ROM is used and YESCRYPT_RW is required. Otherwise, whether to
* use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff
* discouraging modification) is determined by the setting string. shared and
* local must be initialized as described above for yescrypt_kdf(). buf must
* be large enough (as indicated by buflen) to hold the encoded hash string.
*
* Return the encoded hash string on success; or NULL on error.
*
* MT-safe as long as local and buf are local to the thread.
*/
extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared,
yescrypt_local_t * __local,
const uint8_t * __passwd, size_t __passwdlen,
const uint8_t * __setting,
uint8_t * __buf, size_t __buflen, int thrid);
/**
* yescrypt(passwd, setting):
* Compute and encode an scrypt or enhanced scrypt hash of passwd given the
* parameters and salt value encoded in setting. Whether to use the
* YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff
* discouraging modification) is determined by the setting string.
*
* Return the encoded hash string on success; or NULL on error.
*
* This is a crypt(3)-like interface, which is simpler to use than
* yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM,
* and it is slower than yescrypt_r() for repeated calls because it allocates
* and frees memory on each call.
*
* MT-unsafe.
*/
extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting, int thrid );
/**
* yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen):
* Generate a setting string for use with yescrypt_r() and yescrypt() by
* encoding into it the parameters N_log2 (which is to be set to base 2
* logarithm of the desired value for N), r, p, flags, and a salt given by src
* (of srclen bytes). buf must be large enough (as indicated by buflen) to
* hold the setting string.
*
* Return the setting string on success; or NULL on error.
*
* MT-safe as long as buf is local to the thread.
*/
extern uint8_t * yescrypt_gensalt_r(
uint32_t __N_log2, uint32_t __r, uint32_t __p,
yescrypt_flags_t __flags,
const uint8_t * __src, size_t __srclen,
uint8_t * __buf, size_t __buflen);
/**
* yescrypt_gensalt(N_log2, r, p, flags, src, srclen):
* Generate a setting string for use with yescrypt_r() and yescrypt(). This
* function is the same as yescrypt_gensalt_r() except that it uses a static
* buffer and thus is not MT-safe.
*
* Return the setting string on success; or NULL on error.
*
* MT-unsafe.
*/
extern uint8_t * yescrypt_gensalt(
uint32_t __N_log2, uint32_t __r, uint32_t __p,
yescrypt_flags_t __flags,
const uint8_t * __src, size_t __srclen);
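/* Illustrative sketch only: the crypt(3)-like flow using the two functions
 * above. The salt bytes and parameters are assumptions; N_log2 = 11 means
 * N = 2048. Not MT-safe, as noted.
 *
 *   const uint8_t salt[16] = { 0 };   // should be random in real use
 *   uint8_t *setting = yescrypt_gensalt( 11, 8, 1,
 *                                        YESCRYPT_RW | YESCRYPT_PWXFORM,
 *                                        salt, sizeof(salt) );
 *   if ( setting )
 *   {
 *       uint8_t *hash = yescrypt( (const uint8_t*)"password", setting, 0 );
 *       // hash, if not NULL, is the encoded hash string
 *   }
 */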
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -161,7 +161,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
// Legacy Yescrypt (yespower v0.5)
bool register_yescrypt_05_algo( algo_gate_t* gate )
bool register_yescrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
@@ -194,7 +194,7 @@ bool register_yescrypt_05_algo( algo_gate_t* gate )
}
bool register_yescryptr8_05_algo( algo_gate_t* gate )
bool register_yescryptr8_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
@@ -207,7 +207,7 @@ bool register_yescryptr8_05_algo( algo_gate_t* gate )
return true;
}
bool register_yescryptr16_05_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
@@ -220,7 +220,7 @@ bool register_yescryptr16_05_algo( algo_gate_t* gate )
return true;
}
bool register_yescryptr32_05_algo( algo_gate_t* gate )
bool register_yescryptr32_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;

File diff suppressed because it is too large

10
api.c
View File

@@ -336,7 +336,7 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
char inpkey[128] = { 0 };
char seckey[64];
uchar sha1[20];
SHA_CTX ctx;
// SHA_CTX ctx;
if (opt_protocol)
applog(LOG_DEBUG, "clientkey: %s", clientkey);
@@ -346,9 +346,11 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
// SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo="
//sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11");
SHA1_Init(&ctx);
SHA1_Update(&ctx, inpkey, strlen(inpkey));
SHA1_Final(sha1, &ctx);
SHA1( inpkey, strlen(inpkey), sha1 );
// Deprecated in openssl-3
// SHA1_Init(&ctx);
// SHA1_Update(&ctx, inpkey, strlen(inpkey));
// SHA1_Final(sha1, &ctx);
base64_encode(sha1, 20, seckey, sizeof(seckey));
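// For reference, a non-deprecated one-shot alternative would be the EVP
// interface (assumes <openssl/evp.h>); this is not what the patch uses,
// it simply shows the equivalent call:
// EVP_Digest( inpkey, strlen(inpkey), sha1, NULL, EVP_sha1(), NULL );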

View File

@@ -4,18 +4,48 @@
# during development. However, the information contained may provide compilation
# tips to users.
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake > /dev/null
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=icelake-client -Wall -fno-common" ./configure --with-curl
# Rocketlake needs gcc-11
#CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes
# AVX256 SHA VAES: Intel Core Alderlake, needs gcc-12
#make clean || echo clean
#rm -f config.status
#./autogen.sh || echo done
#CFLAGS="-O3 -march=alderlake -Wall -fno-common" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-alderlake
# Zen4 AVX512 SHA VAES
make clean || echo clean
rm -f config.status
# znver3 needs gcc-11, znver4 ?
#CFLAGS="-O3 -march=znver4 -Wall -fno-common " ./configure --with-curl
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
#CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-zen4
# Zen3 AVX2 SHA VAES
make clean || echo clean
rm -f config.status
#CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-zen3
# AVX512 AES: Intel Core HEDT Skylake-X, Cascadelake
make clean || echo clean
rm -f config.status
@@ -59,7 +89,7 @@ make -j 8
strip -s cpuminer
mv cpuminer cpuminer-avx
# SSE4.2 AES: Intel Westmere
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl

View File

@@ -2,8 +2,8 @@
#
# make clean and rm all the targetted executables.
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes > /dev/null
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe > /dev/null
make distclean > /dev/null

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.22.2.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.20.1'
PACKAGE_STRING='cpuminer-opt 3.20.1'
PACKAGE_VERSION='3.22.2'
PACKAGE_STRING='cpuminer-opt 3.22.2'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.20.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.22.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.20.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.22.2:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.20.1
cpuminer-opt configure 3.22.2
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.20.1, which was
It was created by cpuminer-opt $as_me 3.22.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.20.1'
VERSION='3.22.2'
cat >>confdefs.h <<_ACEOF
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.20.1, which was
This file was extended by cpuminer-opt $as_me 3.22.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6784,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.20.1
cpuminer-opt config.status 3.22.2
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.20.1])
AC_INIT([cpuminer-opt], [3.22.2])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -3,7 +3,7 @@
* Copyright 2012-2014 pooler
* Copyright 2014 Lucas Jones
* Copyright 2014-2016 Tanguy Pruvot
* Copyright 2016-2021 Jay D Dee
* Copyright 2016-2023 Jay D Dee
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
@@ -37,6 +37,7 @@
#include <curl/curl.h>
#include <jansson.h>
#include <openssl/sha.h>
//#include <mm_malloc.h>
#include "sysinfos.c"
#include "algo/sha/sha256d.h"
@@ -120,7 +121,6 @@ static uint64_t opt_affinity = 0xFFFFFFFFFFFFFFFFULL; // default, use all cores
int opt_priority = 0; // deprecated
int num_cpus = 1;
int num_cpugroups = 1; // For Windows
#define max_cpus 256 // max for affinity
char *rpc_url = NULL;
char *rpc_userpass = NULL;
char *rpc_user, *rpc_pass;
@@ -131,10 +131,9 @@ bool opt_verify = false;
static bool opt_stratum_keepalive = false;
static struct timeval stratum_keepalive_timer;
// Stratum typically times out in 5 minutes or 300 seconds
#define stratum_keepalive_timeout 180 // 3 minutes
#define stratum_keepalive_timeout 150 // 2.5 minutes
static struct timeval stratum_reset_time;
// pk_buffer_size is used as a version selector by b58 code, therefore
// it must be set correctly to work.
const int pk_buffer_size_max = 26;
@@ -224,8 +223,7 @@ char* lp_id;
static void workio_cmd_free(struct workio_cmd *wc);
// array mapping thread to cpu
static uint8_t thread_affinity_map[ max_cpus ];
static int *thread_affinity_map;
// display affinity mask graphically
static void format_affinity_mask( char *mask_str, uint64_t mask )
@@ -318,8 +316,9 @@ static void affine_to_cpu( struct thr_info *thr )
if ( !ok )
{
last_error = GetLastError();
applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x",
thread, last_error );
if ( !thread )
applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
last_error, thread );
}
}
@@ -390,11 +389,11 @@ bool std_le_work_decode( struct work *work )
{
int i;
const int adata_sz = algo_gate.get_work_data_size() / 4;
const int atarget_sz = ARRAY_SIZE(work->target);
// const int atarget_sz = ARRAY_SIZE(work->target);
for ( i = 0; i < adata_sz; i++ )
work->data[i] = le32dec( work->data + i );
for ( i = 0; i < atarget_sz; i++ )
for ( i = 0; i < 8; i++ )
work->target[i] = le32dec( work->target + i );
return true;
}
@@ -403,11 +402,11 @@ bool std_be_work_decode( struct work *work )
{
int i;
const int adata_sz = algo_gate.get_work_data_size() / 4;
const int atarget_sz = ARRAY_SIZE(work->target);
// const int atarget_sz = ARRAY_SIZE(work->target);
for ( i = 0; i < adata_sz; i++ )
work->data[i] = be32dec( work->data + i );
for ( i = 0; i < atarget_sz; i++ )
for ( i = 0; i < 8; i++ )
work->target[i] = le32dec( work->target + i );
return true;
}
@@ -431,20 +430,18 @@ static bool work_decode( const json_t *val, struct work *work )
if ( unlikely( !algo_gate.work_decode( work ) ) )
return false;
if ( !allow_mininginfo )
net_diff = algo_gate.calc_network_diff( work );
else
net_diff = hash_to_diff( work->target );
work->targetdiff = net_diff;
stratum_diff = last_targetdiff = work->targetdiff;
// many of these aren't used solo.
net_diff =
work->targetdiff =
stratum_diff =
last_targetdiff = hash_to_diff( work->target );
work->sharediff = 0;
algo_gate.decode_extra_data( work, &net_blocks );
return true;
}
// good alternative for wallet mining, difficulty and net hashrate
// Only used for net_hashrate with GBT/getwork, data is from previous block.
static const char *info_req =
"{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n";
@@ -470,17 +467,14 @@ static bool get_mininginfo( CURL *curl, struct work *work )
// "networkhashps": 56475980
if ( res )
{
// net_diff is a global that is set from the work hash target by
// both getwork and GBT. Don't overwrite it, define a local to override
// the global.
double net_diff = 0.;
double difficulty = 0.;
json_t *key = json_object_get( res, "difficulty" );
if ( key )
{
if ( json_is_object( key ) )
key = json_object_get( key, "proof-of-work" );
if ( json_is_real( key ) )
net_diff = json_real_value( key );
difficulty = json_real_value( key );
}
key = json_object_get( res, "networkhashps" );
@@ -497,12 +491,13 @@ static bool get_mininginfo( CURL *curl, struct work *work )
net_blocks = json_integer_value( key );
if ( opt_debug )
applog(LOG_INFO,"Mining info: diff %.5g, net_hashrate %f, height %d",
net_diff, net_hashrate, net_blocks );
applog( LOG_INFO,"getmininginfo: difficulty %.5g, networkhashps %.5g, blocks %d", difficulty, net_hashrate, net_blocks );
if ( !work->height )
{
// complete missing data from getwork
if ( opt_debug )
applog( LOG_DEBUG, "work height set by getmininginfo" );
work->height = (uint32_t) net_blocks + 1;
if ( work->height > g_work.height )
restart_threads();
@@ -518,11 +513,10 @@ static bool get_mininginfo( CURL *curl, struct work *work )
static bool gbt_work_decode( const json_t *val, struct work *work )
{
int i, n;
uint32_t prevhash[8] __attribute__ ((aligned (32)));
uint32_t target[8] __attribute__ ((aligned (32)));
unsigned char final_sapling_hash[32] __attribute__ ((aligned (32)));
uint32_t version, curtime, bits;
uint32_t prevhash[8];
uint32_t target[8];
unsigned char final_sapling_hash[32];
int cbtx_size;
uchar *cbtx = NULL;
int tx_count, tx_size;
@@ -534,9 +528,9 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
bool version_reduce = false;
json_t *tmp, *txa;
bool rc = false;
// Segwit BEGIN
int i, n;
bool segwit = false;
tmp = json_object_get( val, "rules" );
if ( tmp && json_is_array( tmp ) )
{
@@ -554,8 +548,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
}
}
}
// Segwit END
tmp = json_object_get( val, "mutable" );
if ( tmp && json_is_array( tmp ) )
{
@@ -637,7 +630,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
goto out;
}
}
/* find count and size of transactions */
txa = json_object_get(val, "transactions" );
if ( !txa || !json_is_array( txa ) )
@@ -712,12 +705,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
cbtx[41] = cbtx_size - 42; /* scriptsig length */
le32enc( (uint32_t *)( cbtx+cbtx_size ), 0xffffffff ); /* sequence */
cbtx_size += 4;
// Segwit BEGIN
//cbtx[cbtx_size++] = 1; /* out-counter */
cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
// Segwit END
cbtx[cbtx_size++] = segwit ? 2 : 1; /* out-counter */
le32enc( (uint32_t *)( cbtx+cbtx_size) , (uint32_t)cbvalue ); /* value */
le32enc( (uint32_t *)( cbtx+cbtx_size+4 ), cbvalue >> 32 );
cbtx_size += 8;
@@ -725,7 +713,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
memcpy( cbtx+cbtx_size, pk_script, pk_script_size );
cbtx_size += (int) pk_script_size;
// Segwit BEGIN
if ( segwit )
{
unsigned char (*wtree)[32] = calloc(tx_count + 2, 32);
@@ -760,12 +747,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
for ( i = 0; i < n; i++ )
sha256d( wtree[i], wtree[2*i], 64 );
}
memset( wtree[1], 0, 32 ); /* witness reserved value = 0 */
memset( wtree[1], 0, 32 ); // witness reserved value = 0
sha256d( cbtx+cbtx_size, wtree[0], 64 );
cbtx_size += 32;
free( wtree );
}
// Segwit END
le32enc( (uint32_t *)( cbtx+cbtx_size ), 0 ); /* lock time */
cbtx_size += 4;
@@ -784,10 +770,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
xsig_len += n;
}
else
{
applog( LOG_WARNING,
"Signature does not fit in coinbase, skipping" );
}
}
tmp = json_object_get( val, "coinbaseaux" );
if ( tmp && json_is_object( tmp ) )
@@ -814,8 +798,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
if ( xsig_len )
{
unsigned char *ssig_end = cbtx + 42 + cbtx[41];
int push_len = cbtx[41] + xsig_len < 76 ? 1 :
cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
int push_len = cbtx[41] + xsig_len < 76
? 1 : cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
n = xsig_len + push_len;
memmove( ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41] );
cbtx[41] += n;
@@ -842,7 +826,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
const char *tx_hex = json_string_value( json_object_get( tmp, "data" ) );
const int tx_size = tx_hex ? (int) ( strlen( tx_hex ) / 2 ) : 0;
// Segwit BEGIN
if ( segwit )
{
const char *txid = json_string_value( json_object_get( tmp, "txid" ) );
@@ -855,8 +838,6 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
}
else
{
// Segwit END
unsigned char *tx = (uchar*) malloc( tx_size );
if ( !tx_hex || !hex2bin( tx, tx_hex, tx_size ) )
{
@@ -866,10 +847,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
}
sha256d( merkle_tree[1 + i], tx, tx_size );
free( tx );
// Segwit BEGIN
}
// Segwit END
if ( !submit_coinbase )
strcat( work->txs, tx_hex );
@@ -887,6 +865,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
sha256d( merkle_tree[i], merkle_tree[2*i], 64 );
}
work->tx_count = tx_count;
/* assemble block header */
algo_gate.build_block_header( work, swab32( version ),
(uint32_t*) prevhash, (uint32_t*) merkle_tree,
@@ -898,10 +878,12 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
applog( LOG_ERR, "JSON invalid target" );
goto out;
}
for ( i = 0; i < ARRAY_SIZE( work->target ); i++ )
work->target[7 - i] = be32dec( target + i );
// reverse the bytes in target
casti_m128i( work->target, 0 ) = mm128_bswap_128( casti_m128i( target, 1 ) );
casti_m128i( work->target, 1 ) = mm128_bswap_128( casti_m128i( target, 0 ) );
net_diff = work->targetdiff = hash_to_diff( work->target );
tmp = json_object_get( val, "workid" );
if ( tmp )
{
@@ -1077,12 +1059,11 @@ void report_summary_log( bool force )
timeval_subtract( &et, &now, &start_time );
timeval_subtract( &uptime, &total_hashes_time, &session_start );
double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6;
double share_time = (double)et.tv_sec + (double)et.tv_usec * 1e-6;
double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. );
double target_diff = exp32 * last_targetdiff;
double shrate = safe_div( target_diff * (double)(accepts),
share_time, 0. );
// global_hashrate = ghrate;
double sess_hrate = safe_div( exp32 * norm_diff_sum,
(double)uptime.tv_sec, 0. );
double submit_rate = safe_div( (double)submits * 60., share_time, 0. );
@@ -1103,7 +1084,7 @@ void report_summary_log( bool force )
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
submit_rate, safe_div( (double)submitted_share_count*60.,
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
( (double)uptime.tv_sec + (double)uptime.tv_usec * 1e-6 ), 0. ) );
applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)",
shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );
@@ -1459,6 +1440,7 @@ char* std_malloc_txs_request( struct work *work )
json_t *val;
char data_str[2 * sizeof(work->data) + 1];
int i;
// datasize is an ugly hack, it should go through the gate
int datasize = work->sapling ? 112 : 80;
for ( i = 0; i < ARRAY_SIZE(work->data); i++ )
@@ -1549,7 +1531,6 @@ const char *getwork_req =
#define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]"
// Segwit BEGIN
#define GBT_RULES "[\"segwit\"]"
static const char *gbt_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
@@ -1558,16 +1539,6 @@ const char *gbt_lp_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES ", \"rules\": " GBT_RULES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
/*
static const char *gbt_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES "}], \"id\":0}\r\n";
const char *gbt_lp_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
*/
// Segwit END
static bool get_upstream_work( CURL *curl, struct work *work )
{
json_t *val;
@@ -1642,49 +1613,49 @@ start:
last_block_height = work->height;
last_targetdiff = net_diff;
applog( LOG_BLUE, "New Block %d, Net Diff %.5g, Ntime %08x",
work->height, net_diff,
applog( LOG_BLUE, "New Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
if ( !opt_quiet )
{
double miner_hr = 0.;
double net_hr = net_hashrate;
double nd = net_diff * exp32;
char net_hr_units[4] = {0};
char miner_hr_units[4] = {0};
char net_ttf[32];
char miner_ttf[32];
pthread_mutex_lock( &stats_lock );
for ( int i = 0; i < opt_n_threads; i++ )
miner_hr += thr_hashrates[i];
global_hashrate = miner_hr;
pthread_mutex_unlock( &stats_lock );
if ( net_hr > 0. )
sprintf_et( net_ttf, nd / net_hr );
else
sprintf( net_ttf, "NA" );
if ( miner_hr > 0. )
sprintf_et( miner_ttf, nd / miner_hr );
else
sprintf( miner_ttf, "NA" );
scale_hash_for_display ( &miner_hr, miner_hr_units );
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO,
"Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
miner_hr, miner_hr_units, miner_ttf, net_hr,
net_hr_units, net_ttf );
}
} // work->height > last_block_height
}
else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
applog( LOG_BLUE, "New Work: Block %d, Net Diff %.5g, Ntime %08x",
work->height, net_diff,
work->data[ algo_gate.ntime_index ] );
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
if ( !opt_quiet )
{
double miner_hr = 0.;
double net_hr = net_hashrate;
double nd = net_diff * exp32;
char net_hr_units[4] = {0};
char miner_hr_units[4] = {0};
char net_ttf[32];
char miner_ttf[32];
pthread_mutex_lock( &stats_lock );
for ( int i = 0; i < opt_n_threads; i++ )
miner_hr += thr_hashrates[i];
global_hashrate = miner_hr;
pthread_mutex_unlock( &stats_lock );
if ( net_hr > 0. )
sprintf_et( net_ttf, nd / net_hr );
else
sprintf( net_ttf, "NA" );
if ( miner_hr > 0. )
sprintf_et( miner_ttf, nd / miner_hr );
else
sprintf( miner_ttf, "NA" );
scale_hash_for_display ( &miner_hr, miner_hr_units );
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO,
"Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
miner_hr, miner_hr_units, miner_ttf, net_hr,
net_hr_units, net_ttf );
}
} // rc
return rc;
@@ -1710,36 +1681,36 @@ static void workio_cmd_free(struct workio_cmd *wc)
static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
{
struct work *ret_work;
struct work *work_heap;
int failures = 0;
ret_work = (struct work*) calloc( 1, sizeof(*ret_work) );
if ( !ret_work )
return false;
work_heap = calloc( 1, sizeof(struct work) );
if ( !work_heap ) return false;
/* obtain new work from bitcoin via JSON-RPC */
while ( !get_upstream_work( curl, ret_work ) )
while ( !get_upstream_work( curl, work_heap ) )
{
if ( unlikely( ( opt_retries >= 0 ) && ( ++failures > opt_retries ) ) )
{
applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" );
free( ret_work );
return false;
free( work_heap );
return false;
}
/* pause, then restart work-request loop */
applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
opt_fail_pause );
applog( LOG_ERR, "json_rpc_call failed, retry after %d seconds",
opt_fail_pause );
sleep( opt_fail_pause );
}
/* send work to requesting thread */
if ( !tq_push(wc->thr->q, ret_work ) )
free( ret_work );
if ( !tq_push(wc->thr->q, work_heap ) )
free( work_heap );
return true;
}
static bool workio_submit_work(struct workio_cmd *wc, CURL *curl)
{
int failures = 0;
@@ -1810,7 +1781,7 @@ static void *workio_thread(void *userdata)
static bool get_work(struct thr_info *thr, struct work *work)
{
struct workio_cmd *wc;
struct work *work_heap;
struct work *work_heap;
if unlikely( opt_benchmark )
{
@@ -1835,17 +1806,16 @@ static bool get_work(struct thr_info *thr, struct work *work)
wc->thr = thr;
/* send work request to workio thread */
if (!tq_push(thr_info[work_thr_id].q, wc))
{
{
workio_cmd_free(wc);
return false;
}
/* wait for response, a unit of work */
work_heap = (struct work*) tq_pop(thr->q, NULL);
if (!work_heap)
return false;
/* copy returned work into storage provided by caller */
memcpy(work, work_heap, sizeof(*work));
free(work_heap);
if ( !work_heap ) return false;
/* copy returned work into storage provided by caller */
memcpy( work, work_heap, sizeof(*work) );
free( work_heap );
return true;
}
@@ -1895,9 +1865,9 @@ static void update_submit_stats( struct work *work, const void *hash )
bool submit_solution( struct work *work, const void *hash,
struct thr_info *thr )
{
// Job went stale during hashing of a valid share.
if ( !opt_quiet && work_restart[ thr->id ].restart )
applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
// Job went stale during hashing of a valid share.
// if ( !opt_quiet && work_restart[ thr->id ].restart )
// applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N );
work->sharediff = hash_to_diff( hash );
if ( likely( submit_work( thr, work ) ) )
@@ -1915,32 +1885,34 @@ bool submit_solution( struct work *work, const void *hash,
if ( !opt_quiet )
{
if ( have_stratum )
{
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s",
submitted_share_count, work->sharediff, work->height,
work->job_id );
if ( opt_debug && opt_extranonce )
{
unsigned char *xnonce2str = abin2hex( work->xnonce2,
work->xnonce2_len );
applog( LOG_INFO, "Xnonce2 %s", xnonce2str );
free( xnonce2str );
}
}
else
applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x",
submitted_share_count, work->sharediff, work->height,
work->data[ algo_gate.ntime_index ] );
}
if ( opt_debug )
{
uint32_t* h = (uint32_t*)hash;
uint32_t* t = (uint32_t*)work->target;
uint32_t* d = (uint32_t*)work->data;
if ( opt_debug )
{
uint32_t* h = (uint32_t*)hash;
uint32_t* t = (uint32_t*)work->target;
uint32_t* d = (uint32_t*)work->data;
unsigned char *xnonce2str = abin2hex( work->xnonce2,
work->xnonce2_len );
applog(LOG_INFO,"Thread %d, Nonce %08x, Xnonce2 %s", thr->id,
work->data[ algo_gate.nonce_index ], xnonce2str );
free( xnonce2str );
applog(LOG_INFO,"Data[0:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
applog(LOG_INFO," : %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19]);
applog(LOG_INFO,"Hash[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0]);
applog(LOG_INFO,"Targ[7:0]: %08x %08x %08x %08x %08x %08x %08x %08x",
t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0]);
applog( LOG_INFO, "Data[ 0: 9]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
applog( LOG_INFO, "Data[10:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19] );
applog( LOG_INFO, "Hash[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0] );
applog( LOG_INFO, "Targ[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0] );
}
}
return true;
}
@@ -1958,15 +1930,15 @@ static bool wanna_mine(int thr_id)
float temp = cpu_temp(0);
if (temp > opt_max_temp)
{
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp);
state = false;
if ( !thr_id && !conditional_state[thr_id] && !opt_quiet )
applog(LOG_NOTICE, "CPU temp too high: %.0fC max %.0f, waiting...", temp, opt_max_temp );
state = false;
}
}
if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
{
if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
applog(LOG_INFO, "network diff too high, waiting...");
applog(LOG_NOTICE, "network diff too high, waiting...");
state = false;
}
if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate)
@@ -1975,12 +1947,14 @@ static bool wanna_mine(int thr_id)
{
char rate[32];
format_hashrate(opt_max_rate, rate);
applog(LOG_INFO, "network hashrate too high, waiting %s...", rate);
applog(LOG_NOTICE, "network hashrate too high (%s), waiting...", rate);
}
state = false;
}
if (thr_id < MAX_CPUS)
conditional_state[thr_id] = (uint8_t) !state;
if ( conditional_state[thr_id] && state && !thr_id && !opt_quiet )
applog(LOG_NOTICE, "...resuming" );
conditional_state[thr_id] = (uint8_t) !state;
return state;
}
@@ -2014,33 +1988,6 @@ void set_work_data_big_endian( struct work *work )
be32enc( work->data + i, work->data[i] );
}
// calculate net diff from nbits.
double std_calc_network_diff( struct work* work )
{
uint32_t nbits = work->data[ algo_gate.nbits_index ];
uint32_t shift = nbits & 0xff;
uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
/*
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
int nbits_index = algo_gate.nbits_index;
uint32_t nbits = have_longpoll ? work->data[ nbits_index]
: swab32( work->data[ nbits_index ] );
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
*/
int m;
long double d = (long double)0x0000ffff / (long double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
for ( m = 29; m < shift; m++ )
d /= 256.0;
if ( opt_debug_diff )
applog(LOG_DEBUG, "net diff: %8f -> shift %u, bits %08x", (double)d, shift, bits);
return (double)d;
}
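// Worked example (hypothetical check, using the sample noted in the removed
// code above): nbits 0x1c05ea29 -> shift = 0x1c = 28, bits = 0x05ea29;
// d = 0xffff / 0x05ea29 ~= 0.16907, multiplied by 256 once (m = 28 < 29)
// gives ~43.28, matching the "diff 43.281" sample.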
void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr )
{
@@ -2064,17 +2011,6 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
++(*nonceptr);
}
bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && !work->data[0] && !opt_benchmark )
{
sleep(1);
return false;
}
return true;
}
static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
{
bool new_job;
@@ -2091,7 +2027,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size );
memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size );
algo_gate.build_extraheader( g_work, sctx );
net_diff = algo_gate.calc_network_diff( g_work );
net_diff = nbits_to_diff( g_work->data[ algo_gate.nbits_index ] );
algo_gate.set_work_data_endian( g_work );
g_work->height = sctx->block_height;
g_work->targetdiff = sctx->job.diff
@@ -2120,14 +2056,17 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
pthread_mutex_unlock( &stats_lock );
if ( stratum_diff != sctx->job.diff )
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s",
sctx->job.diff, sctx->block_height, g_work->job_id );
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Tx %d, Job %s",
sctx->job.diff, sctx->block_height,
sctx->job.merkle_count, g_work->job_id );
else if ( last_block_height != sctx->block_height )
applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s",
sctx->block_height, net_diff, g_work->job_id );
applog( LOG_BLUE, "New Block %d, Tx %d, Netdiff %.5g, Job %s",
sctx->block_height, sctx->job.merkle_count,
net_diff, g_work->job_id );
else if ( g_work->job_id && new_job )
applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
sctx->block_height, net_diff, g_work->job_id );
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Netdiff %.5g, Job %s",
sctx->block_height, sctx->job.merkle_count,
net_diff, g_work->job_id );
else if ( !opt_quiet )
{
unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
@@ -2141,8 +2080,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
if ( ( stratum_diff != sctx->job.diff )
|| ( last_block_height != sctx->block_height ) )
{
static bool multipool = false;
if ( stratum.block_height < last_block_height ) multipool = true;
if ( unlikely( !session_first_block ) )
session_first_block = stratum.block_height;
last_block_height = stratum.block_height;
@@ -2150,56 +2087,47 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
last_targetdiff = g_work->targetdiff;
if ( lowest_share < last_targetdiff )
lowest_share = 9e99;
}
if ( !opt_quiet )
{
applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
net_diff, stratum_diff, g_work->targetdiff );
if ( !opt_quiet )
{
applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
net_diff, stratum_diff, g_work->targetdiff );
if ( likely( hr > 0. ) )
{
double nd = net_diff * exp32;
char hr_units[4] = {0};
char block_ttf[32];
char share_ttf[32];
if ( likely( hr > 0. ) )
{
double nd = net_diff * exp32;
char hr_units[4] = {0};
char block_ttf[32];
char share_ttf[32];
static bool multipool = false;
if ( stratum.block_height < last_block_height ) multipool = true;
sprintf_et( block_ttf, nd / hr );
sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
scale_hash_for_display ( &hr, hr_units );
applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
hr, hr_units, block_ttf, share_ttf );
sprintf_et( block_ttf, nd / hr );
sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
scale_hash_for_display ( &hr, hr_units );
applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
hr, hr_units, block_ttf, share_ttf );
if ( !multipool && last_block_height > session_first_block )
{
struct timeval now, et;
gettimeofday( &now, NULL );
timeval_subtract( &et, &now, &session_start );
uint64_t net_ttf =
( last_block_height - session_first_block ) == 0 ? 0
: et.tv_sec / ( last_block_height - session_first_block );
if ( net_diff > 0. && net_ttf )
{
double net_hr = nd / net_ttf;
char net_hr_units[4] = {0};
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
net_hr, net_hr_units );
}
}
} // hr > 0
} // !quiet
} // new diff/block
if ( new_job && !( opt_quiet || stratum_errors ) )
{
int mismatch = submitted_share_count - ( accepted_share_count
+ stale_share_count
+ rejected_share_count );
if ( mismatch )
applog( LOG_INFO,
CL_LBL "%d Submitted share pending, maybe stale" CL_N,
submitted_share_count );
}
if ( !multipool && last_block_height > session_first_block )
{
struct timeval now, et;
gettimeofday( &now, NULL );
timeval_subtract( &et, &now, &session_start );
uint64_t net_ttf = safe_div( et.tv_sec,
last_block_height - session_first_block, 0 );
if ( net_diff > 0. && net_ttf )
{
double net_hr = safe_div( nd, net_ttf, 0. );
char net_hr_units[4] = {0};
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
net_hr, net_hr_units );
}
}
} // hr > 0
} // !quiet
}
static void *miner_thread( void *userdata )
@@ -2337,9 +2265,14 @@ static void *miner_thread( void *userdata )
} // do_this_thread
algo_gate.resync_threads( thr_id, &work );
if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
// conditional mining
if ( unlikely( !wanna_mine( thr_id ) ) )
{
restart_threads();
sleep(5);
continue;
}
// opt_scantime expressed in hashes
max64 = opt_scantime * thr_hashrates[thr_id];
@@ -2444,8 +2377,8 @@ static void *miner_thread( void *userdata )
{
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
applog( LOG_INFO, "CPU #%d: %s %sh/s",
thr_id, hr, hr_units );
applog( LOG_INFO, "Thread %d, CPU %d: %s %sh/s",
thr_id, thread_affinity_map[ thr_id ], hr, hr_units );
}
}
@@ -2486,14 +2419,6 @@ static void *miner_thread( void *userdata )
}
}
} // benchmark
// conditional mining
if ( unlikely( !wanna_mine( thr_id ) ) )
{
sleep(5);
continue;
}
} // miner_thread loop
out:
@@ -2885,7 +2810,7 @@ static void *stratum_thread(void *userdata )
else
timeval_subtract( &et, &now, &stratum_reset_time );
if ( et.tv_sec > stratum_keepalive_timeout + 60 )
if ( et.tv_sec > stratum_keepalive_timeout + 90 )
{
applog( LOG_NOTICE, "No shares submitted, resetting stratum connection" );
stratum_need_reset = true;
@@ -3668,7 +3593,7 @@ int main(int argc, char *argv[])
#if defined(WIN32)
// Are Windows CPU Groups supported?
// Get the number of cpus, display after parsing command line
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
num_cpus = 0;
num_cpugroups = GetActiveProcessorGroupCount();
@@ -3677,8 +3602,8 @@ int main(int argc, char *argv[])
int cpus = GetActiveProcessorCount( i );
num_cpus += cpus;
if (opt_debug)
applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
// if (opt_debug)
// applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
}
#else
@@ -3695,7 +3620,7 @@ int main(int argc, char *argv[])
sysctl(req, 2, &num_cpus, &len, NULL, 0);
#else
num_cpus = 1;
#endif
#endif
if ( num_cpus < 1 )
num_cpus = 1;
@@ -3719,7 +3644,6 @@ int main(int argc, char *argv[])
if ( opt_time_limit )
time_limit_stop = (unsigned int)time(NULL) + opt_time_limit;
// need to register to get algo optimizations for cpu capabilities
// but that causes registration logs before cpu capabilities are output.
// Would need to split register function into 2 parts. First part sets algo
@@ -3847,20 +3771,30 @@ int main(int argc, char *argv[])
}
#endif
if ( opt_affinity && num_cpus > max_cpus )
{
applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
max_cpus );
opt_affinity = 0ULL;
}
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#endif
const int map_size = opt_n_threads < num_cpus ? num_cpus : opt_n_threads;
thread_affinity_map = malloc( map_size * (sizeof (int)) );
if ( !thread_affinity_map )
{
applog( LOG_ERR, "CPU Affinity disabled, memory allocation failed" );
opt_affinity = 0ULL;
}
if ( opt_affinity )
{
for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
int active_cpus = 0; // total CPUs available using rolling affinity mask
for ( int thr = 0, cpu = 0; thr < map_size; thr++, cpu++ )
{
while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++;
while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;
thread_affinity_map[ thr ] = cpu % num_cpus;
if ( cpu < num_cpus ) active_cpus++;
}
if ( opt_n_threads > active_cpus )
applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
if ( !opt_quiet )
{
char affinity_mask[64];
@@ -3992,7 +3926,7 @@ int main(int argc, char *argv[])
}
}
// Initialize stats times and counters
// Initialize stats timers and counters
memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) );
gettimeofday( &last_submit_time, NULL );
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );

48
miner.h
View File

@@ -24,6 +24,11 @@
#endif /* _MSC_VER */
// prevent questions from ARM users who don't read the requirements.
#if !defined(__x86_64__)
#error "CPU architecture not supported. Consult the requirements for supported CPUs."
#endif
#include <stdbool.h>
#include <inttypes.h>
#include <sys/time.h>
@@ -91,6 +96,19 @@ enum {
LOG_PINK = 0x14 };
#endif
#define WORK_ALIGNMENT 64
// Used with dynamically allocated memory to guarantee data alignment for
// large vectors. The physical block size must be extended by the alignment
// number of bytes when allocated. free() should use the physical pointer
// returned by malloc(), not the aligned pointer. All other accesses should
// use the logical, aligned pointer returned by this function.
static inline void *align_ptr( const void *ptr, const uint64_t alignment )
{
const uint64_t mask = alignment - 1;
return (void*)( ( ((const uint64_t)ptr) + mask ) & (~mask) );
}
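// Hypothetical usage sketch of the pattern described above ("size" stands in
// for whatever the caller needs):
//
//   void *base = malloc( size + WORK_ALIGNMENT );             // physical ptr
//   uint32_t *buf = (uint32_t*) align_ptr( base, WORK_ALIGNMENT );  // aligned
//   ...
//   free( base );   // free the physical pointer, not the aligned one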
extern bool is_power_of_2( int n );
static inline bool is_windows(void)
@@ -118,7 +136,7 @@ static inline bool is_windows(void)
static inline uint32_t swab32(uint32_t v)
{
#ifdef WANT_BUILTIN_BSWAP
return __builtin_bswap32(v);
return __builtin_bswap32(v);
#else
return bswap_32(v);
#endif
@@ -317,7 +335,7 @@ extern void cbin2hex(char *out, const char *in, size_t len);
void bin2hex( char *s, const unsigned char *p, size_t len );
char *abin2hex( const unsigned char *p, size_t len );
char *bebin2hex( const unsigned char *p, size_t len );
bool hex2bin( unsigned char *p, const char *hexstr, size_t len );
bool hex2bin( unsigned char *p, const char *hexstr, const size_t len );
bool jobj_binary( const json_t *obj, const char *key, void *buf,
size_t buflen );
int varint_encode( unsigned char *p, uint64_t n );
@@ -333,10 +351,7 @@ extern void memrev(unsigned char *p, size_t len);
// number of hashes.
//
// https://en.bitcoin.it/wiki/Difficulty
//
// hash = diff * 2**32
//
// diff_to_hash = 2**32 = 0x100000000 = 4294967296 = exp32;
#define EXP16 65536.
#define EXP32 4294967296.
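// Example of the relation noted above: at diff 43.281 the expected work is
// 43.281 * 2**32 ~= 1.86e11 hashes.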
@@ -350,8 +365,9 @@ extern const long double exp160; // 2**160
bool fulltest( const uint32_t *hash, const uint32_t *target );
bool valid_hash( const void*, const void* );
double hash_to_diff( const void* );
extern double hash_to_diff( const void* );
extern void diff_to_hash( uint32_t*, const double );
extern double nbits_to_diff( uint32_t );
double hash_target_ratio( uint32_t* hash, uint32_t* target );
void work_set_target_ratio( struct work* work, const void *hash );
@@ -399,13 +415,14 @@ struct work
double stratum_diff;
int height;
char *txs;
char *workid;
int tx_count;
char *workid;
char *job_id;
size_t xnonce2_len;
unsigned char *xnonce2;
bool sapling;
bool stale;
} __attribute__ ((aligned (64)));
} __attribute__ ((aligned (WORK_ALIGNMENT)));
struct stratum_job
{
@@ -416,7 +433,8 @@ struct stratum_job
unsigned char *coinbase;
unsigned char *xnonce2;
int merkle_count;
unsigned char **merkle;
int merkle_buf_size;
unsigned char **merkle;
unsigned char version[4];
unsigned char nbits[4];
unsigned char ntime[4];
@@ -540,7 +558,6 @@ enum algos {
ALGO_BMW,
ALGO_BMW512,
ALGO_C11,
ALGO_DECRED,
ALGO_DEEP,
ALGO_DMD_GR,
ALGO_GROESTL,
@@ -559,6 +576,7 @@ enum algos {
ALGO_LYRA2Z330,
ALGO_M7M,
ALGO_MINOTAUR,
ALGO_MINOTAURX,
ALGO_MYR_GR,
ALGO_NEOSCRYPT,
ALGO_NIST5,
@@ -571,9 +589,11 @@ enum algos {
ALGO_QUBIT,
ALGO_SCRYPT,
ALGO_SHA256D,
ALGO_SHA256DT,
ALGO_SHA256Q,
ALGO_SHA256T,
ALGO_SHA3D,
ALGO_SHA512256D,
ALGO_SHAVITE3,
ALGO_SKEIN,
ALGO_SKEIN2,
@@ -633,7 +653,6 @@ static const char* const algo_names[] = {
"bmw",
"bmw512",
"c11",
"decred",
"deep",
"dmd-gr",
"groestl",
@@ -652,6 +671,7 @@ static const char* const algo_names[] = {
"lyra2z330",
"m7m",
"minotaur",
"minotaurx",
"myr-gr",
"neoscrypt",
"nist5",
@@ -664,9 +684,11 @@ static const char* const algo_names[] = {
"qubit",
"scrypt",
"sha256d",
"sha256dt",
"sha256q",
"sha256t",
"sha3d",
"sha512256d",
"shavite3",
"skein",
"skein2",
@@ -793,7 +815,6 @@ Options:\n\
bmw BMW 256\n\
bmw512 BMW 512\n\
c11 Chaincoin\n\
decred Blake256r14dcr\n\
deep Deepcoin (DCN)\n\
dmd-gr Diamond\n\
groestl Groestl coin\n\
@@ -813,6 +834,7 @@ Options:\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
minotaur\n\
minotaurx\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\
@@ -826,9 +848,11 @@ Options:\n\
scrypt:N scrypt(N, 1, 1)\n\
scryptn2 scrypt(1048576, 1, 1)\n\
sha256d Double SHA-256\n\
sha256dt Modified sha256d (Novo)\n\
sha256q Quad SHA-256, Pyrite (PYE)\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
sha3d Double Keccak256 (BSHA3)\n\
sha512256d Double SHA-512 (Radiant)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\

View File

@@ -57,10 +57,15 @@
// 32 bytes for 256 bit vectors and 64 bytes for 512 bit vectors. 64 byte
// alignment is recommended in all cases for best cache alignment.
//
// All functions are defined with type-agnostic pointer (void*) arguments
// and are cast or aliased to the appropriate type. This adds convenience
// for applications but also adds the responsibility to ensure adequate data
// alignment.
//
// Windows has problems with function vector arguments larger than
// 128 bits. Stack alignment is only guaranteed to 16 bytes. Always use
// pointers for larger vectors in function arguments. Macros can be
// used for larger value arguments.
// pointers for larger vectors in function arguments. Macros can be used
// for larger value arguments.
//
// An attempt was made to make the names as similar as possible to
// Intel's intrinsic function format. Most variations are to avoid
@@ -74,7 +79,7 @@
// to avoid the ambiguity of "mm".
// - the element size does not include additional type specifiers
// like "epi".
// - some macros contain value args that are updated.
// - some macros may contain value args that are updated.
// - specialized shift and rotate functions that move elements around
// use the notation "1x32" to indicate the distance moved as units of
// the element size.
@@ -86,10 +91,10 @@
//
// Function names follow this pattern:
//
// prefix_op[esize]_[vsize]
// prefix_op[vsize]_[esize]
//
// Prefix: usually the size of the largest vectors used. Following
// are some examples:
// Prefix: usually the size of the returned vector.
// Following are some examples:
//
// u64: unsigned 64 bit integer function
// i128: signed 128 bit integer function (rarely used)
@@ -102,10 +107,12 @@
// esize: optional, element size of operation
//
// vsize: optional, lane size used when a function operates on elements
// of vectors within lanes of a vector.
// within lanes of a larger vector.
//
// Ex: mm256_ror1x64_128 rotates each 128 bit lane of a 256 bit vector
// right by 64 bits.
// m256_const_64 defines a vector contructed from the supplied 64 bit
// integer arguments.
// mm256_shuflr128_32 rotates each 128 bit lane of a 256 bit vector
// right by 32 bits.
//
// Vector constants
//

File diff suppressed because it is too large

View File

@@ -54,7 +54,7 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
#else
asm( "movq %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
return a;
return a;
}
static inline __m128i mm128_mov32_128( const uint32_t n )
@@ -65,7 +65,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#else
asm( "movd %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
return a;
return a;
}
// Inconsistent naming, prefix should reflect return value:
@@ -79,7 +79,7 @@ static inline uint64_t u64_mov128_64( const __m128i a )
#else
asm( "movq %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
return n;
return n;
}
static inline uint32_t u32_mov128_32( const __m128i a )
@@ -90,13 +90,18 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#else
asm( "movd %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
return n;
return n;
}
// Equivalent of set1, broadcast integer to all elements.
#define m128_const_i128( i ) mm128_mov64_128( i )
#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
// Emulate broadcast & insert instructions not available in SSE2
#define mm128_bcast_i64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define mm128_bcast_i32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
#define m128_const_i128( i ) mm128_mov64_128( i )
// deprecated
#define m128_const1_64 mm128_bcast_i64
#define m128_const1_32 mm128_bcast_i32
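// For example (editor's addition): mm128_bcast_i64( x ) yields { x, x } and
// mm128_bcast_i32( x ) yields { x, x, x, x }; the 0x44 and 0x00 shuffle
// controls simply replicate the low element across the vector.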
#if defined(__SSE4_1__)
@@ -104,7 +109,7 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#define m128_const_64( hi, lo ) \
_mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 )
#else // No insert in SSE2
#else
#define m128_const_64 _mm_set_epi64x
@@ -114,12 +119,10 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
#define m128_one_64 _mm_shuffle_epi32( mm128_mov64_128( 1 ), 0x44 )
#define m128_one_32 _mm_shuffle_epi32( mm128_mov32_128( 1 ), 0x00 )
#define m128_one_16 _mm_shuffle_epi32( \
mm128_mov32_128( 0x00010001 ), 0x00 )
#define m128_one_8 _mm_shuffle_epi32( \
mm128_mov32_128( 0x01010101 ), 0x00 )
#define m128_one_64 mm128_bcast_i64( 1 )
#define m128_one_32 mm128_bcast_i32( 1 )
#define m128_one_16 mm128_bcast_i32( 0x00010001 )
#define m128_one_8 mm128_bcast_i32( 0x01010101 )
// ASM avoids the need to initialize return variable to avoid compiler warning.
// Macro abstracts function parentheses to look like an identifier.
@@ -149,7 +152,7 @@ static inline __m128i mm128_neg1_fn()
// sizing. It's unique.
//
// It can:
// - zero 32 bit elements of a 128 bit vector.
// - zero any number of 32 bit elements of a 128 bit vector.
// - extract any 32 bit element from one 128 bit vector and insert the
// data to any 32 bit element of another 128 bit vector, or the same vector.
// - do both simultaneously.
@@ -162,14 +165,21 @@ static inline __m128i mm128_neg1_fn()
// c[5:4] destination element selector
// c[7:6] source element selector
// Convert type and abbreviate name: e"x"tract "i"nsert "m"ask
// Convert type and abbreviate name: eXtract Insert Mask = XIM
#define mm128_xim_32( v1, v2, c ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) )
// Some examples of simple operations:
/* Another way to do it with individual arguments.
#define mm128_xim_32( v1, i1, v2, i2, mask ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), \
(mask) | ((i1)<<4) | ((i2)<<6) ) )
*/
// Insert 32 bit integer into v at element c and return modified v.
// Examples of simple operations using xim:
// Insert 32 bit integer into v at element c and return updated v.
static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
const int c )
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
@@ -178,13 +188,12 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
static inline uint32_t mm128_extract_32( const __m128i v, const int c )
{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
// Zero 32 bit elements when bit in mask is set.
static inline __m128i mm128_mask_32( const __m128i v, const int m )
{ return mm128_xim_32( v, v, m ); }
// Move element i2 of v2 to element i1 of v1. For reference and convenience,
// it's faster to precalculate the index.
#define mm128_shuflmov_32( v1, i1, v2, i2 ) \
// Move element i2 of v2 to element i1 of v1 and return updated v1.
#define mm128_mov32_32( v1, i1, v2, i2 ) \
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
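// Usage sketch (editor's addition, hypothetical values): replace one 32 bit
// element and read it back, e.g. writing a nonce into element 3:
//
//    v = mm128_insert_32( v, nonce, 3 );       // v[3] = nonce
//    uint32_t n = mm128_extract_32( v, 3 );    // n == nonce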
#endif // SSE4_1
@@ -193,13 +202,23 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Basic operations without equivalent SIMD intrinsic
// Bitwise not (~v)
#if defined(__AVX512VL__)
static inline __m128i mm128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
#else
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
#endif
/*
// Unary negation of elements (-v)
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
#define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v )
*/
// Add 4 values, fewer dependencies than sequential addition.
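// (Editor's note, illustrative) e.g. a tree shaped sum ( (a+b) + (c+d) ) has
// a dependency depth of two additions versus three for ( ((a+b) + c) + d ).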
#define mm128_add4_64( a, b, c, d ) \
@@ -255,27 +274,23 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#if defined(__AVX512VL__)
// a ^ b ^ c
#define mm128_xor3( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x96 )
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x78 )
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
#else
#define mm128_xor3( a, b, c ) \
_mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_xorand( a, b, c ) \
_mm_xor_si128( a, _mm_and_si128( b, c ) )
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#endif
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm_movmask_64( v ) \
_mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) )
@@ -283,57 +298,14 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm_movmask_32( v ) \
_mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) )
// Diagonal blend
// Blend 4 32 bit elements from 4 vectors
#if defined (__AVX2__)
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \
_mm_blend_epi32( s1, s0, 0x1 ), 0x3 )
#elif defined(__SSE4_1__)
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \
_mm_blend_epi16( s1, s0, 0x03 ), 0x0f )
#endif
//
// Bit rotations
// AVX512VL has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements.
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same
// specification but works with a variable. Therefore use rol_var where
// necessary.
// sm3-hash-4way.c has one instance where mm128_rol_var_32 is required.
#define mm128_ror_var_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
#define mm128_rol_var_64( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
#define mm128_ror_var_32( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#if defined(__AVX512VL__)
//#if defined(__AVX512F__) && defined(__AVX512VL__)
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
@@ -358,10 +330,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#else // SSE2
#define mm128_ror_64 mm128_ror_var_64
#define mm128_rol_64 mm128_rol_var_64
#define mm128_ror_32 mm128_ror_var_32
#define mm128_rol_32 mm128_rol_var_32
#define mm128_ror_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
#define mm128_rol_64( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
#define mm128_ror_32( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
#define mm128_rol_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#define mm128_rorx2_64( v1, v0, c ) \
{ \
@@ -411,42 +390,90 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
#define mm128_shuffle2_64( a, b, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
_mm_castsi128_pd( b ), c ) );
#define mm128_shuffle2_32( a, b, c ) \
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
_mm_castsi128_ps( b ), c ) );
// Deprecated.
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
// Cross lane shuffles
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from v1, and the high half from v2.
#define mm128_shuffle2_64( v1, v2, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
_mm_castsi128_pd( v2 ), c ) );
#define mm128_shuffle2_32( v1, v2, c ) \
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );
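// For example (editor's addition), following _mm_shuffle_pd semantics:
//    mm128_shuffle2_64( v1, v2, 0 ) -> { v2[0], v1[0] } (high, low)
//    mm128_shuffle2_64( v1, v2, 3 ) -> { v2[1], v1[1] }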
// Rotate vector elements across all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
// Swap 32 bit elements in 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
#endif
// Rotate 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#else
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#else
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif
// Rotate 32 bit lanes
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_swap32_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#else
#define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#else
#define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
#endif
//
// Endian byte swap.
#if defined(__SSSE3__)
#define mm128_bswap_128( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0001020304050607, \
0x08090a0b0c0d0e0f ) )
#define mm128_bswap_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
@@ -508,6 +535,9 @@ static inline __m128i mm128_bswap_16( __m128i v )
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
#define mm128_bswap_128( v ) \
mm128_swap_64( mm128_bswap_64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
{
d[0] = mm128_bswap_64( s[0] );
@@ -534,174 +564,31 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#endif // SSSE3 else SSE2
//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.
// Swap 128 bit vectorse.
// Swap 128 bit vectors.
// This should be avoided, it's more efficient to switch references.
#define mm128_swap256_128( v1, v2 ) \
v1 = _mm_xor_si128( v1, v2 ); \
v2 = _mm_xor_si128( v1, v2 ); \
v1 = _mm_xor_si128( v1, v2 );
// Two input shuffle-rotate.
// Concatenate v1 & v2 and bit rotate as one 256 bit vector.
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, ie updated v1.
// These two-input functions are not available without SSSE3. Use procedure
// macros below instead.
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
#else
#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 )
#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#define mm128_alignr_64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 )
#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 )
#define mm128_alignr_32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// Deprecated for SSSE3 and above, they exist for SSSE3 only for compatibility
// with existing code. The function macros above can be used more effciently.
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
v1 = t; \
} while(0)
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
v1 = t; \
} while(0)
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
v1 = t; \
} while(0)
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
v1 = t; \
} while(0)
#else // SSE2
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
_mm_slli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
_mm_srli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
_mm_slli_si128( v2, 12 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 4 ), \
_mm_slli_si128( v1, 12 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 12 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 4 ), \
_mm_srli_si128( v1, 12 ) ); \
v1 = t; \
} while(0)
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
_mm_slli_si128( v2, 14 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 2 ), \
_mm_slli_si128( v1, 14 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 2 ), \
_mm_srli_si128( v1, 14 ) ); \
v1 = t; \
} while(0)
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
_mm_slli_si128( v2, 15 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 1 ), \
_mm_slli_si128( v1, 15 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
_mm_srli_si128( v2, 15 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 1 ), \
_mm_srli_si128( v1, 15 ) ); \
v1 = t; \
} while(0)
#endif // SSE4.1 else SSE2
#endif
#endif // __SSE2__
#endif // SIMD_128_H__

View File

@@ -1,18 +1,29 @@
#if !defined(SIMD_256_H__)
#define SIMD_256_H__ 1
//#if defined(__AVX2__)
/////////////////////////////////////////////////////////////////////
//
// AVX2 256 bit vectors
//
// Basic support for 256 bit vectors is available with AVX but integer
// support requires AVX2.
// Some 256 bit vector utilities require AVX512 or have more efficient
// AVX512 implementations. They will be selected automatically but their use
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
//
// AVX512VL backports some AVX512 features to 256 bit vectors and can produce
// more efficient implementations of some functions. They will be selected
// automatically but their use is limited because 256 bit vectors are less
// likely to be used when 512 is available.
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary.
// Full width byte shuffle is available with AVX512VL using the mask version
// with a full mask (-1).
// Instructions that can move data across 128 bit lane boundary incur a
// performance penalty over those that can't.
// Some usage of index vectors may be encoded as if full vector shuffles are
// supported. This has no side effects and would have the same results using
// either version.
// If the need arises and AVX512VL is available, 256 bit full vector byte
// shuffles can be implemented using the AVX512 mask feature with a NULL mask.
#if defined(__AVX__)
@@ -45,8 +56,8 @@ typedef union
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
#endif
#if defined(__AVX2__)
#if defined(__AVX2__)
// Move integer to low element of vector, other elements are set to zero.
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
@@ -56,37 +67,34 @@ typedef union
#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) )
#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )
// deprecated
//#define mm256_mov256_64 u64_mov256_64
//#define mm256_mov256_32 u32_mov256_32
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
#define mm256_bcast_m128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
#define mm256_bcast_i128( i ) mm256_bcast_m128( mm128_mov64_128( i ) )
#define mm256_bcast_i64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define mm256_bcast_i32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define mm256_bcast_i16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define mm256_bcast_i8( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
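// For example (editor's addition): mm256_bcast_m128( v ) duplicates the
// 128 bit vector v into both lanes, and mm256_bcast_i64( x ) yields
// { x, x, x, x }.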
// Equivalent of set, move 64 bit integer constants to respective 64 bit
// elements.
static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
{
union { __m256i m256i;
uint64_t u64[4]; } v;
union { __m256i m256i; uint64_t u64[4]; } v;
v.u64[0] = i0; v.u64[1] = i1; v.u64[2] = i2; v.u64[3] = i3;
return v.m256i;
}
// Equivalent of set1.
// 128 bit vector argument
#define m256_const1_128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
// 64 bit integer argument zero extended to 128 bits.
#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) )
#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define m256_const1_8 ( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
// Deprecated
#define m256_const1_128 mm256_bcast_m128
#define m256_const1_i128 mm256_bcast_i128
#define m256_const1_64 mm256_bcast_i64
#define m256_const1_32 mm256_bcast_i32
#define m256_const2_64( i1, i0 ) \
m256_const1_128( m128_const_64( i1, i0 ) )
@@ -95,13 +103,13 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 m256_const1_i128( 1 )
#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) )
#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) )
#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) )
#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 mm256_bcast_i128( 1 )
#define m256_one_64 mm256_bcast_i64( 1 )
#define m256_one_32 mm256_bcast_i32( 1 )
#define m256_one_16 mm256_bcast_i16( 1 )
#define m256_one_8 mm256_bcast_i8 ( 1 )
static inline __m256i mm256_neg1_fn()
{
@@ -112,8 +120,8 @@ static inline __m256i mm256_neg1_fn()
#define m256_neg1 mm256_neg1_fn()
// Consistent naming for similar operations.
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
//
// Memory functions
@@ -132,13 +140,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Basic operations without SIMD equivalent
// Bitwise not ( ~v )
#if defined(__AVX512VL__)
static inline __m256i mm256_not( const __m256i v )
{ return _mm256_ternarylogic_epi64( v, v, v, 1 ); }
#else
#define mm256_not( v ) _mm256_xor_si256( v, m256_neg1 ) \
#endif
/*
// Unary negation of each element ( -v )
#define mm256_negate_64( v ) _mm256_sub_epi64( m256_zero, v )
#define mm256_negate_32( v ) _mm256_sub_epi32( m256_zero, v )
#define mm256_negate_16( v ) _mm256_sub_epi16( m256_zero, v )
*/
// Add 4 values, fewer dependencies than sequential addition.
@@ -160,44 +178,34 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// AVX512 has ternary logic that supports any 3 input boolean expression.
// a ^ b ^ c
#define mm256_xor3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x96 )
#define mm256_xor3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x96 )
// legacy convenience only
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( a, mm256_xor3( b, c, d ) )
#define mm256_xor4( a, b, c, d ) _mm256_xor_si256( a, mm256_xor3( b, c, d ) )
// a & b & c
#define mm256_and3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x80 )
#define mm256_and3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm256_or3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xfe )
#define mm256_or3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm256_xorand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x78 )
#define mm256_xorand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm256_andxor( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x60 )
#define mm256_andxor( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm256_xoror( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x1e )
#define mm256_xoror( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c )
#define mm256_xorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xd2 )
#define mm256_xorandnot( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm256_orand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xf8 )
#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) \
_mm256_ternarylogic_epi64( a, b, b, 0x81 )
#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#else
@@ -234,9 +242,9 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#endif
// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Returns 4 or 8 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm256_movmask_64( v ) \
_mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
@@ -244,71 +252,14 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_movmask_32( v ) \
_mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) )
// Diagonal blending
// Blend 4 64 bit elements from 4 vectors
#define mm256_diagonal_64( v3, v2, v1, v0 ) \
mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \
_mm256_blend_epi32( v1, v0, 0x03 ), 0x0f )
// Blend 8 32 bit elements from 8 vectors
#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \
_mm256_blend_epi32( \
_mm256_blend_epi32( \
_mm256_blend_epi32( v7, v6, 0x40 ), \
_mm256_blend_epi32( v5, v4, 0x10 ) 0x30 ), \
_mm256_blend_epi32( \
_mm256_blend_epi32( v3, v2, 0x04) \
_mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f )
// Blend 4 32 bit elements from each 128 bit lane.
#define mm256_diagonal128_32( v3, v2, v1, v0 ) \
_mm256_blend_epi32( \
_mm256_blend_epi32( v3, v2, 0x44) \
_mm256_blend_epi32( v1, v0, 0x11 ) )
//
// Bit rotations.
//
// The only bit shift for more than 64 bits is with __int128 which is slow.
//
// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
//
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// optimization for AVX2, does nothing for AVX512 but is here for
// transparency.
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Therefore use rol_var where
// necessary.
#define mm256_ror_var_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_var_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_var_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
// The spec says both F & VL are required, but just in case AMD
// decides to implement ROL/R without AVX512F.
#if defined(__AVX512VL__)
//#if defined(__AVX512F__) && defined(__AVX512VL__)
// AVX512, control must be 8 bit immediate.
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
@@ -333,10 +284,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#else // AVX2
#define mm256_ror_64 mm256_ror_var_64
#define mm256_rol_64 mm256_rol_var_64
#define mm256_ror_32 mm256_ror_var_32
#define mm256_rol_32 mm256_rol_var_32
// Use shuflr64 / shuflr32 below for optimized bit rotations by multiples of 8.
#define mm256_ror_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
#define mm256_rorx2_64( v1, v0, c ) \
{ \
@@ -388,21 +352,38 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
_mm256_or_si256( _mm256_slli_epi16( v, c ), \
_mm256_srli_epi16( v, 16-(c) ) )
// Deprecated.
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
//
// Cross lane shuffles
//
// Rotate elements across all lanes.
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
/* Not used
// Rotate 256 bit vector by one 32 bit element.
#if defined(__AVX512VL__)
static inline __m256i mm256_shuflr_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); }
static inline __m256i mm256_shufll_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 15 ); }
#else
#define mm256_shuflr_32( v ) \
_mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000000000007, 0x0000000600000005, \
@@ -413,19 +394,20 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ) )
#endif
*/
//
// Rotate elements within each 128 bit lane of 256 bit vector.
// Limited 2 input shuffle
#define mm256_shuffle2_64( a, b, c ) \
_mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( a ), \
_mm256_castsi256_pd( b ), c ) );
#define mm256_shuffle2_32( a, b, c ) \
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
_mm256_castsi256_ps( b ), c ) );
#define mm256_shuffle2_64( v1, v2, c ) \
_mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( v1 ), \
_mm256_castsi256_pd( v2 ), c ) );
#define mm256_shuffle2_32( v1, v2, c ) \
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
_mm256_castsi256_ps( v2 ), c ) );
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
@@ -437,40 +419,67 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
// Swap 32 bit elements in each 64 bit lane.
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
// 64 bit lanes
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
// needed for a shuffle that crosses 128 bit lanes. BSWAP doesn't therefore the
// AVX2 version will work here. The bswap control vector is coded to work
// with both versions, bit 4 is ignored in AVX2.
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
#else
#define mm256_shuflr64_24( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#endif
#if defined(__AVX512VL__)
#define mm256_shuflr64_16( v ) _mm256_ror_epi64( v, 16 )
#else
#define mm256_shuflr64_16( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#endif
// 32 bit lanes
#if defined(__AVX512VL__)
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
#else
#define mm256_swap32_16( v ) \
_mm256_shuffle_epi8( v, m256_const2_64( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#endif
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16
#if defined(__AVX512VL__)
#define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 )
#else
#define mm256_shuflr32_8( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201, \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#endif
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
#define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
#define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1e1f1c1d1a1b1819, 0x1617141512131011, \
0x0e0f0c0d0a0b0809, 0x0607040502030001, ) )
m256_const2_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )
// Source and destination are pointers, may point to same memory.
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes
#define mm256_block_bswap_64( d, s ) do \
{ \
__m256i ctl = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
__m256i ctl = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ; \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -484,8 +493,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes
#define mm256_block_bswap_32( d, s ) do \
{ \
__m256i ctl = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
__m256i ctl = m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -496,18 +504,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
} while(0)
//
// Rotate two concatenated 256 bit vectors as one 512 bit vector by specified
// number of elements. Rotate is done in place, source arguments are
// overwritten.
// Some of these can use permute but appears to be slower. Maybe a Ryzen
// issue
// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
// makes these macros unnecessary.
// continue using vror/vrol notation for now to avoid confusion with
// shufl2r/shufl2l macro functions available with AVX512.
// swap 256 bit vectors in place.
// This should be avoided, it's more efficient to switch references.
#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \

View File

@@ -2,42 +2,57 @@
#define SIMD_512_H__ 1
////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
//
// AVX-512
// AVX512 512 bit vectors
//
// The baseline for these utilities is AVX512F, AVX512DQ, AVX512BW
// and AVX512VL, first available in quantity in Skylake-X.
// Some utilities may require additional features available in subsequent
// architectures and are noted.
// Some utilities may require additional AVX512 extensions available in
// subsequent architectures and are noted where used.
// AVX512VL is used to backport AVX512 instructions to 128 and 256 bit
// vectors. It is therefore not technically required for any 512 bit vector
// utilities defined below.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// AVX512 intrinsics have a few changes from previous conventions.
//
// cmp instruction now returns a bitmask instead of a vector mask.
// This eliminates the need for the blendv instruction.
// "_mm512_cmp" instructions now return a bitmask instead of a vector mask.
// This removes the need for an explicit movemask instruction.
//
// The new rotate instructions require the count to be an 8 bit
// immediate value only. Compilation fails if a variable is used.
// The documentation is the same as for shift and it works with
// variables. The inconsistency is likely due to compiler optimizations
// that can eliminate the variable in some instances.
// Many previously sizeless (si) instructions now have sized (epi) versions
// to accommodate masking packed elements.
//
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
// usually shuffles accross all lanes.
// Many AVX512 instructions have a different argument order from the AVX2
// versions of similar instructions. There is also some inconsistency in how
// different AVX512 instructions position the mask register in the argument
// list.
//
// permutexvar has args reversed, index is first arg. Previously all
// permutes and shuffles have the index last.
// "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other
// AVX512 permutes can cross all lanes.
//
// _mm512_permutexvar_epi8 requires AVX512-VBMI, larger elements don't.
// It also performs the same op as _mm512_shuffle_epi8.
// "_mm512_shuffle_epi8" shuffles across the entire 512 bits. Shuffle
// instructions generally don't cross 128 bit lane boundaries and the AVX2
// version of this specific instruction does not.
//
// shuffle_epi8 shuffles accross entire 512 bits. Shuffle usually
// doesn't cross 128 bit lane boundaries but is consistent with AVX2
// where shuffle_epi8 spans the entire vector.
// New alignr instructions for epi64 and epi32 operate across the entire
// vector but are slower than epi8, which continues to be restricted to 128 bit
// lanes.
//
// There are 2 areas where overhead is aconcern: constants and
// "_mm512_permutexvar_epi8" and "_mm512_permutex2var_epi8" require
// AVX512-VBMI. The same instructions with larger elements don't have this
// requirement. "_mm512_permutexvar_epi8" also performs the same operation
// as "_mm512_shuffle_epi8" which only requires AVX512-BW.
//
// Two coding conventions are used to prevent macro argument side effects:
// - if a macro arg is used in an expression it must be protected by
// parentheses to ensure an expression argument is evaluated first.
// - if an argument is to be referenced multiple times a C inline function
// should be used instead of a macro to prevent an expression argument
// from being evaluated multiple times.
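// For example (editor's addition, generic illustration): a macro defined as
//    #define f( x ) ( (x) ^ ( (x) >> 1 ) )
// evaluates its argument twice, so f( *p++ ) would increment p twice; an
// inline function avoids this.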
//
// There are 2 areas where overhead is a major concern: constants and
// permutations.
//
// Constants need to be composed at run time by assembling individual
@@ -60,13 +75,10 @@
// The same rules apply, if an index is to be reused it should be defined
// as a local. This applies specifically to bswap operations.
//
// Additionally, permutations using smaller vectors can be more efficient
// if the permutation doesn't cross lane boundaries, typically 128 bits,
// and the smaller vector can use an imm comtrol.
//
// If the permutation doesn't cross lane boundaries a shuffle instructions
// can be used with imm control instead of permute.
// Permutations that cross 128 bit lanes are typically slower and often need
// a vector control index. If the permutation doesn't need to cross 128 bit
// lanes a shuffle instruction can often be used with an imm control.
//
//////////////////////////////////////////////////////////////
//
// AVX512 512 bit vectors
@@ -101,7 +113,17 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_concat_256( hi, lo ) \
_mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )
#define m512_const_128( v3, v2, v1, v0 ) \
// Work in progress.
// The naming scheme is being modified to align more closely with the opcode
// mnemonic: m512_const1 becomes mm512_bcast_m[n] or mm512_bcast_i[n], short
// for broadcast; i indicates an integer arg, m a vector. Set1 intrinsics
// should generally be used for integer data.
// mm512_const should only be used with immediate integer arguments, use
// _mm512_set intrinsic instead.
// mm512_set, mm512_set[n] macros may be defined when no intrinsic exists
// for either the arg size or arg count.
#define mm512_set_128( v3, v2, v1, v0 ) \
mm512_concat_256( mm256_concat_128( v3, v2 ), \
mm256_concat_128( v1, v0 ) )
@@ -121,29 +143,35 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
return v.m512i;
}
// Broadcast with vector argument is generally more efficient except for
// integer immediate constants or when data was most recently referenced as
// integer and is still available in an integer register.
/* not used
// Equivalent of set1, broadcast lo element to all elements.
static inline __m512i m512_const1_256( const __m256i v )
{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }
*/
#define m512_const1_128( v ) \
mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
// Integer input argument up to 64 bits
#define m512_const1_i128( i ) \
mm512_perm_128( _mm512_castsi128_si512( mm128_mov64_128( i ) ), 0 )
#define mm512_bcast_m128( v ) mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
// Low 64 bits only, high 64 bits are zeroed.
#define mm512_bcast_i128( i ) mm512_bcast_m128( mm128_mov64_128( i ) )
#define mm512_bcast_i64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
#define mm512_bcast_i32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
#define mm512_bcast_i16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define mm512_bcast_i8( i ) _mm512_broadcastb_epi8( mm128_mov32_128( i ) )
//#define m512_const1_256( v ) _mm512_broadcast_i64x4( v )
//#define m512_const1_128( v ) _mm512_broadcast_i64x2( v )
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
// const1 is deprecated, use bcast instead
#define m512_const1_128 mm512_bcast_m128
#define m512_const1_i128 mm512_bcast_i128
#define m512_const1_64 mm512_bcast_i64
#define m512_const1_32 mm512_bcast_i32
#define m512_const2_128( v1, v0 ) \
m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) )
_mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 )
#define m512_const2_64( i1, i0 ) \
m512_const1_128( m128_const_64( i1, i0 ) )
mm512_bcast_m128( m128_const_64( i1, i0 ) )
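// For example (editor's addition): m512_const2_64( i1, i0 ) broadcasts the
// 128 bit pair to every lane, i.e. elements 7..0 are
// { i1, i0, i1, i0, i1, i0, i1, i0 }.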
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
@@ -167,28 +195,34 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
#define m512_zero _mm512_setzero_si512()
#define m512_one_512 mm512_mov64_512( 1 )
#define m512_one_256 _mm512_inserti64x4( m512_one_512, m256_one_256, 1 )
#define m512_one_128 m512_const1_i128( 1 )
#define m512_one_64 m512_const1_64( 1 )
#define m512_one_32 m512_const1_32( 1 )
#define m512_one_16 m512_const1_16( 1 )
#define m512_one_8 m512_const1_8( 1 )
#define m512_one_128 mm512_bcast_i128( (__uint128_t)1 )
#define m512_one_64 mm512_bcast_i64( (uint64_t)1 )
#define m512_one_32 mm512_bcast_i32( (uint32_t)1 )
#define m512_one_16 mm512_bcast_i16( (uint16_t)1 )
#define m512_one_8 mm512_bcast_i8( (uint8_t)1 )
//#define m512_neg1 m512_const1_64( 0xffffffffffffffff )
#define m512_neg1 _mm512_movm_epi64( 0xff )
// Use asm to avoid a compiler warning for an uninitialized local.
static inline __m512i mm512_neg1_fn()
{
__m512i a;
asm( "vpternlogq $0xff, %0, %0, %0\n\t" : "=x"(a) );
return a;
}
#define m512_neg1 mm512_neg1_fn() // 1 clock
//
// Basic operations without SIMD equivalent
// ~x
// #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
// Bitwise NOT: ~x
static inline __m512i mm512_not( const __m512i x )
{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
// -x
/*
// Unary negation: -x
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
*/
//
// Pointer casting
@@ -242,140 +276,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// expression using any number or combinations of AND, OR, XOR, NOT.
// a ^ b ^ c
#define mm512_xor3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x96 )
#define mm512_xor3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x96 )
// legacy convenience only
#define mm512_xor4( a, b, c, d ) \
_mm512_xor_si512( a, mm512_xor3( b, c, d ) )
#define mm512_xor4( a, b, c, d ) _mm512_xor_si512( a, mm512_xor3( b, c, d ) )
// a & b & c
#define mm512_and3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x80 )
#define mm512_and3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm512_or3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xfe )
#define mm512_or3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm512_xorand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x78 )
#define mm512_xorand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm512_andxor( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x60 )
#define mm512_andxor( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm512_xoror( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x1e )
#define mm512_xoror( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c ) xor( a, andnot( b, c ) )
#define mm512_xorandnot( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )
// a ^ ( ~b & c ), xor( a, andnot( b, c ) )
#define mm512_xorandnot( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm512_orand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xf8 )
#define mm512_orand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xf8 )
// Some 2 input operations that don't have their own instruction mnemonic.
// Use with caution, args are not expression safe.
// ~( a | b ), (~a) & (~b)
#define mm512_nor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x01 )
#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), (~a) ^ b
#define mm512_xnor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x81 )
#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
// ~( a & b )
#define mm512_nand( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0xef )
// Diagonal blending
// Blend 8 64 bit elements from 8 vectors
#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
_mm512_mask_blend_epi64( 0x0f, \
_mm512_mask_blend_epi64( 0x30, \
_mm512_mask_blend_epi64( 0x40, v7, v6 ), \
_mm512_mask_blend_epi64( 0x40, v5, v4 ) ), \
_mm512_mask_blend_epi64( 0x03, \
_mm512_mask_blend_epi64( 0x04, v3, v2 ) \
_mm512_mask_blend_epi64( 0x01, v1, v0 ) ) )
// Blend 4 32 bit elements from each 128 bit lane.
#define mm512_diagonal128_32( v3, v2, v1, v0 ) \
_mm512_mask_blend_epi32( 0x3333, \
_mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
_mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef )
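// Editor's note (illustrative): the imm8 truth table constant can be derived
// by evaluating the boolean expression on the byte constants a = 0xF0,
// b = 0xCC, c = 0xAA, e.g.
//    a ^ b ^ c = 0x96 (mm512_xor3), a & b & c = 0x80 (mm512_and3),
//    a ^ ( b & c ) = 0xF0 ^ 0x88 = 0x78 (mm512_xorand).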
// Bit rotations.
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
// elements and can be called directly. But they only accept immediate 8
// for control arg.
// The workaround is a fraud, just a fluke of the compiler's optimizer.
// It fails without -O3. The compiler seems to unroll shift loops, eliminating
// the variable control, better than rotate loops.
// elements and can be called directly.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
//
// For convenience and consistency with AVX2
// For convenience and consistency with AVX2 macros.
#define mm512_ror_64 _mm512_ror_epi64
#define mm512_rol_64 _mm512_rol_epi64
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
static inline __m512i mm512_ror_var_64( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi64( v, c ),
_mm512_slli_epi64( v, 64-c ) );
}
static inline __m512i mm512_rol_var_64( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi64( v, c ),
_mm512_srli_epi64( v, 64-c ) );
}
static inline __m512i mm512_ror_var_32( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi32( v, c ),
_mm512_slli_epi32( v, 32-c ) );
}
static inline __m512i mm512_rol_var_32( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi32( v, c ),
_mm512_srli_epi32( v, 32-c ) );
}
static inline __m512i mm512_ror_16( __m512i const v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi16( v, c ),
_mm512_slli_epi16( v, 16-c ) );
}
static inline __m512i mm512_rol_16( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi16( v, c ),
_mm512_srli_epi16( v, 16-c ) );
}
// Rotations using a vector control index are very slow due to overhead
// to generate the index vector. Repeated rotations using the same index
// are better handled by the calling function where the index only needs
// to be generated once then reused very efficiently.
// Permutes and shuffles using an immediate index are significantly faster.
//
// Swap bytes in vector elements, vectorized endian conversion.
// Reverse byte order of packed elements, vectorized endian conversion.
#define mm512_bswap_64( v ) \
_mm512_shuffle_epi8( v, \
@@ -402,10 +357,10 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
// 8 lanes of 64 bytes each
#define mm512_block_bswap_64( d, s ) do \
{ \
__m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
const __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -419,10 +374,10 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
// 16 lanes of 32 bytes each
#define mm512_block_bswap_32( d, s ) do \
{ \
__m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
const __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -434,30 +389,10 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
} while(0)
// Cross-lane shuffles implementing rotate & shift of elements within a vector.
//
#define mm512_shiftr_256( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 4 )
#define mm512_shiftl_256( v ) mm512_shifr_256
#define mm512_shiftr_128( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 2 )
#define mm512_shiftl_128( v ) \
_mm512_alignr_epi64( v, _mm512_setzero, 6 )
#define mm512_shiftr_64( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 1 )
#define mm512_shiftl_64( v ) \
_mm512_alignr_epi64( v, _mm512_setzero, 7 )
#define mm512_shiftr_32( v ) \
_mm512_alignr_epi32( _mm512_setzero, v, 1 )
#define mm512_shiftl_32( v ) \
_mm512_alignr_epi32( v, _mm512_setzero, 15 )
// Shuffle-rotate elements left or right in 512 bit vector.
// Cross-lane shuffles implementing rotation of packed elements.
//
// Rotate elements across entire vector.
static inline __m512i mm512_swap_256( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 4 ); }
#define mm512_shuflr_256( v ) mm512_swap_256
@@ -491,16 +426,16 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x0000001F001E001D, 0x001C001B001A0019, \
0X0018001700160015, 0X0014001300120011, \
0X0010000F000E000D, 0X000C000B000A0009, \
0X0008000700060005, 0X0004000300020001 ), v )
0x0018001700160015, 0x0014001300120011, \
0x0010000F000E000D, 0x000C000B000A0009, \
0x0008000700060005, 0x0004000300020001 ), v )
#define mm512_shufll_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001E001D001C001B, 0x001A001900180017, \
0X0016001500140013, 0X001200110010000F, \
0X000E000D000C000B, 0X000A000900080007, \
0X0006000500040003, 0X000200010000001F ), v )
0x0016001500140013, 0x001200110010000F, \
0x000E000D000C000B, 0x000A000900080007, \
0x0006000500040003, 0x000200010000001F ), v )
#define mm512_shuflr_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
@@ -516,9 +451,8 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
0x1E1D1C1B1A191817, 0x161514131211100F, \
0x0E0D0C0B0A090807, 0x060504030201003F ) )
//
// 256 bit lanes used only by lyra2, move these there
// Rotate elements within 256 bit lanes of 512 bit vector.
// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
@@ -529,6 +463,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
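A scalar model may help here: mm512_shuflr256_64 rotates each 256-bit lane right by one 64-bit element, and the two lanes never mix. The sketch below is illustrative only, not part of the header:

   // v holds eight 64-bit elements; elements 0-3 and 4-7 are separate lanes.
   void shuflr256_64_model( uint64_t v[8] )
   {
      for ( int lane = 0; lane < 8; lane += 4 )
      {
         uint64_t t = v[lane];
         v[lane]     = v[lane + 1];
         v[lane + 1] = v[lane + 2];
         v[lane + 2] = v[lane + 3];
         v[lane + 3] = t;
      }
   }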
/* Not used
// Rotate 256 bit lanes by one 32 bit element
#define mm512_shuflr256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
@@ -571,61 +506,56 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
0x2e2d2c2b2a292827, 0x262524232221203f, \
0x1e1d1c1b1a191817, 0x161514131211100f, \
0x0e0d0c0b0a090807, 0x060504030201001f ) )
*/
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source.
#define mm512_shuffle2_64( a, b, c ) \
_mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
_mm512_castsi512_pd( b ), c ) );
#define mm512_shuffle2_32( a, b, c ) \
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( a ), \
_mm512_castsi512_ps( b ), c ) );
// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
// Rotate 128 bit lanes right by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
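One consequence worth noting: a 4-byte lane rotation is the same operation as a one-element 32-bit lane rotation, so either macro can be used. A hypothetical equivalence check, assuming AVX512BW for the byte alignr:

   __m512i a  = _mm512_set_epi32( 15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0 );
   __m512i r1 = mm512_shuflr128_8( a, 4 );   // rotate each 128-bit lane right by 4 bytes
   __m512i r2 = mm512_shuflr128_32( a );     // rotate each 128-bit lane right by one dword
   // r1 == r2: lane 0 now holds { 1, 2, 3, 0 }, lane 1 holds { 5, 6, 7, 4 }, ...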
// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction
// but only with AVX512. Shuffle is just as fast and available with AVX2
// & SSE2.
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source arg.
#define mm512_shuffle2_64( v1, v2, c ) \
_mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( v1 ), \
_mm512_castsi512_pd( v2 ), c ) );
#define mm512_shuffle2_32( v1, v2, c ) \
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
_mm512_castsi512_ps( v2 ), c ) );
// 64 bit lanes
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
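To make the AVX512-versus-shuffle remark above concrete, both forms below produce the same result on a 512-bit vector; this is an illustrative sketch rather than code from the diff:

   __m512i a           = _mm512_set1_epi64( 0x1111111122222222ULL );
   __m512i via_shuffle = _mm512_shuffle_epi32( a, 0xb1 );   // pattern used by mm512_swap64_32
   __m512i via_rotate  = _mm512_ror_epi64( a, 32 );         // AVX512-only alternative
   // every 64-bit element of both results is 0x2222222211111111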
// Need a good way to distinguish 1 input shuffles, 2 input shuffle functions,
// and 2 input 2 output shuffle macros.
//
// shuflr is 1 input
// shufl2r is 2 input ...
// Drop macros? They can easily be rebuilt using shufl2 functions
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
// 2 input, 1 output
// Rotate concatenated { v1, v2 ) right or left and return v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
#define mm512_shuflr64_16( v ) _mm512_ror_epi64( v, 16 )
#define mm512_shufll64_16( v ) _mm512_rol_epi64( v, 16 )
#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 )
#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 )
#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 )
#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 )
#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 )
#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 )
// 32 bit lanes
#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 )
#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 )
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
#endif // AVX512
#endif // SIMD_512_H__

View File

@@ -34,10 +34,12 @@
//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
#define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) ) )
/*
// Unary negate elements
#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v )
#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v )
#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v )
*/
// Rotate bits in packed elements of 64 bit vector
#define mm64_rol_64( a, n ) \

View File

@@ -55,6 +55,13 @@
typedef __int128 int128_t;
typedef unsigned __int128 uint128_t;
typedef union
{
uint128_t u128;
uint64_t u64[2];
uint32_t u32[4];
} __attribute__ ((aligned (16))) u128_ovly;
// Extracting the low bits is a trivial cast.
// These specialized functions are optimized while providing a
// consistent interface.
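A hypothetical usage sketch of the overlay: splitting a 128-bit product into its halves without shifts, relying on x86 little-endian element order:

   u128_ovly x;
   x.u128 = (uint128_t)0xffffffffffffffffULL * 0xfffffffffffffffdULL;
   uint64_t lo = x.u64[0];   // low 64 bits
   uint64_t hi = x.u64[1];   // high 64 bits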

View File

@@ -333,7 +333,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
// CPU_INFO ECX
#define SSE3_Flag 1
#define SSSE3_Flag (1<< 9)
#define XOP_Flag (1<<11)
#define XOP_Flag (1<<11) // obsolete, only available on pre-Ryzen AMD
#define FMA3_Flag (1<<12)
#define AES_Flag (1<<25)
#define SSE41_Flag (1<<19)
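As an illustration of how these ECX bits are consumed, here is a minimal, self-contained check using GCC's <cpuid.h>. The helper name is hypothetical and is only a sketch of the idea, not the miner's actual detection code:

   #include <cpuid.h>
   #include <stdbool.h>
   static bool cpu_has_aes( void )
   {
      unsigned int eax, ebx, ecx, edx;
      if ( !__get_cpuid( 1, &eax, &ebx, &ecx, &edx ) ) return false;
      return ( ecx & AES_Flag ) != 0;   // AES-NI bit of CPUID leaf 1, ECX
   }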

351
util.c
View File

@@ -44,28 +44,22 @@
#include <libgen.h>
#endif
//#include "miner.h"
#include "elist.h"
#include "algo-gate-api.h"
#include "algo/sha/sha256d.h"
//extern pthread_mutex_t stats_lock;
struct data_buffer {
void *buf;
size_t len;
};
struct upload_buffer {
const void *buf;
size_t len;
size_t pos;
};
struct header_info {
char *lp_path;
char *reason;
char *stratum_url;
size_t content_length;
};
struct data_buffer {
void *buf;
size_t len;
size_t allocated;
struct header_info *headers;
};
struct tq_ent {
@@ -127,7 +121,6 @@ void applog2( int prio, const char *fmt, ... )
int len;
// struct tm tm;
// time_t now = time(NULL);
// localtime_r(&now, &tm);
switch ( prio )
@@ -395,67 +388,53 @@ static void databuf_free(struct data_buffer *db)
static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb,
void *user_data)
{
struct data_buffer *db = (struct data_buffer *) user_data;
struct data_buffer *db = user_data;
size_t len = size * nmemb;
size_t oldlen, newlen;
size_t newalloc, reqalloc;
void *newmem;
static const unsigned char zero = 0;
static const size_t max_realloc_increase = 8 * 1024 * 1024;
static const size_t initial_alloc = 16 * 1024;
oldlen = db->len;
newlen = oldlen + len;
/* minimum required allocation size */
reqalloc = db->len + len + 1;
newmem = realloc(db->buf, newlen + 1);
if (!newmem)
return 0;
if (reqalloc > db->allocated) {
if (db->len > 0) {
newalloc = db->allocated * 2;
} else {
if (db->headers->content_length > 0)
newalloc = db->headers->content_length + 1;
else
newalloc = initial_alloc;
}
db->buf = newmem;
db->len = newlen;
memcpy((uchar*) db->buf + oldlen, ptr, len);
memcpy((uchar*) db->buf + newlen, &zero, 1); /* null terminate */
if (db->headers->content_length == 0) {
/* limit the maximum buffer increase */
if (newalloc - db->allocated > max_realloc_increase)
newalloc = db->allocated + max_realloc_increase;
}
/* ensure we have a big enough allocation */
if (reqalloc > newalloc)
newalloc = reqalloc;
newmem = realloc(db->buf, newalloc);
if (!newmem)
return 0;
db->buf = newmem;
db->allocated = newalloc;
}
memcpy(db->buf + db->len, ptr, len); /* append new data */
memcpy(db->buf + db->len + len, &zero, 1); /* null terminate */
db->len += len;
return len;
}
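To summarize the reallocation policy implemented above: start at 16 KiB (or Content-Length + 1 when the server supplies it), double on subsequent growth, cap any single increase at 8 MiB when the size is unknown, and always allocate at least enough for the new data plus a terminating NUL. A condensed, hypothetical model of that decision, with made-up names:

   #include <stdbool.h>
   #include <stddef.h>
   static size_t next_alloc( size_t allocated, size_t required,
                             size_t content_length, bool buf_nonempty )
   {
      static const size_t max_step = 8 * 1024 * 1024;
      static const size_t initial  = 16 * 1024;
      size_t n;
      if ( buf_nonempty )            n = allocated * 2;        // geometric growth
      else if ( content_length > 0 ) n = content_length + 1;   // exact server hint
      else                           n = initial;              // first chunk, size unknown
      if ( content_length == 0 && n - allocated > max_step )
         n = allocated + max_step;                             // limit one-step growth
      if ( n < required ) n = required;                        // never under-allocate
      return n;
   }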
static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb,
void *user_data)
{
struct upload_buffer *ub = (struct upload_buffer *) user_data;
size_t len = size * nmemb;
if (len > ub->len - ub->pos)
len = ub->len - ub->pos;
if (len) {
memcpy(ptr, ((uchar*)ub->buf) + ub->pos, len);
ub->pos += len;
}
return len;
}
#if LIBCURL_VERSION_NUM >= 0x071200
static int seek_data_cb(void *user_data, curl_off_t offset, int origin)
{
struct upload_buffer *ub = (struct upload_buffer *) user_data;
switch (origin) {
case SEEK_SET:
ub->pos = (size_t) offset;
break;
case SEEK_CUR:
ub->pos += (size_t) offset;
break;
case SEEK_END:
ub->pos = ub->len + (size_t) offset;
break;
default:
return 1; /* CURL_SEEKFUNC_FAIL */
}
return 0; /* CURL_SEEKFUNC_OK */
}
#endif
static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
{
struct header_info *hi = (struct header_info *) user_data;
@@ -505,6 +484,9 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
val = NULL;
}
if (!strcasecmp("Content-Length", key))
hi->content_length = strtoul(val, NULL, 10);
out:
free(key);
free(val);
@@ -564,48 +546,38 @@ json_t *json_rpc_call(CURL *curl, const char *url,
int rc;
long http_rc;
struct data_buffer all_data = {0};
struct upload_buffer upload_data;
char *json_buf;
json_error_t err;
struct curl_slist *headers = NULL;
char len_hdr[64];
char curl_err_str[CURL_ERROR_SIZE] = { 0 };
long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30;
struct header_info hi = {0};
all_data.headers = &hi;
/* it is assumed that 'curl' is freshly [re]initialized at this pt */
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
if (opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_URL, url);
if (opt_cert)
curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
//
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
if (opt_cert) curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
curl_easy_setopt(curl, CURLOPT_ENCODING, "");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb);
curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data);
#if LIBCURL_VERSION_NUM >= 0x071200
curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb);
curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data);
#endif
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
if (opt_redirect)
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
if (opt_redirect) curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb);
curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi);
if (opt_proxy) {
if (opt_proxy)
{
curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy);
curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type);
}
if (userpass) {
if (userpass)
{
curl_easy_setopt(curl, CURLOPT_USERPWD, userpass);
curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
}
@@ -613,23 +585,16 @@ json_t *json_rpc_call(CURL *curl, const char *url,
if (flags & JSON_RPC_LONGPOLL)
curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
#endif
curl_easy_setopt(curl, CURLOPT_POST, 1);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, rpc_req);
if (opt_protocol)
applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req);
upload_data.buf = rpc_req;
upload_data.len = strlen(rpc_req);
upload_data.pos = 0;
sprintf(len_hdr, "Content-Length: %lu",
(unsigned long) upload_data.len);
headers = curl_slist_append(headers, "Content-Type: application/json");
headers = curl_slist_append(headers, len_hdr);
headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll reject-reason");
//headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/
//headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/
//headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr
//headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
@@ -786,18 +751,26 @@ err_out:
return cfg;
}
// Segwit BEGIN
void memrev(unsigned char *p, size_t len)
{
unsigned char c, *q;
for (q = p + len - 1; p < q; p++, q--) {
c = *p;
*p = *q;
*q = c;
if ( len == 32 )
{
__m128i *pv = (__m128i*)p;
__m128i t = mm128_bswap_128( pv[0] );
pv[0] = mm128_bswap_128( pv[1] );
pv[1] = t;
}
else
{
unsigned char c, *q;
for (q = p + len - 1; p < q; p++, q--)
{
c = *p;
*p = *q;
*q = c;
}
}
}
// Segwit END
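The 32-byte fast path above reverses the buffer by byte-swapping each 16-byte half and exchanging the halves. A scalar model of the same transformation, illustrative only:

   #include <string.h>
   static void memrev32_model( unsigned char p[32] )
   {
      unsigned char lo[16], hi[16];
      for ( int i = 0; i < 16; i++ ) { lo[i] = p[15 - i]; hi[i] = p[31 - i]; }
      memcpy( p,      hi, 16 );   // reversed high half becomes the new low half
      memcpy( p + 16, lo, 16 );   // reversed low half becomes the new high half
   }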
void cbin2hex(char *out, const char *in, size_t len)
{
@@ -832,32 +805,42 @@ char *bebin2hex(const unsigned char *p, size_t len)
return s;
}
bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
bool hex2bin( unsigned char *p, const char *hexstr, const size_t len )
{
char hex_byte[3];
char *ep;
if( hexstr == NULL ) return false;
hex_byte[2] = '\0';
while (*hexstr && len) {
if (!hexstr[1]) {
applog(LOG_ERR, "hex2bin str truncated");
return false;
}
hex_byte[0] = hexstr[0];
hex_byte[1] = hexstr[1];
*p = (unsigned char) strtol(hex_byte, &ep, 16);
if (*ep) {
applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte);
return false;
}
p++;
hexstr += 2;
len--;
size_t hexstr_len = strlen( hexstr );
if( ( hexstr_len % 2 ) != 0 )
{
applog( LOG_ERR, "hex2bin string truncated" );
return false;
}
size_t bin_len = hexstr_len / 2;
if ( bin_len > len )
{
applog( LOG_ERR, "hex2bin buffer too small" );
return false;
}
return(!len) ? true : false;
/* return (len == 0 && *hexstr == 0) ? true : false; */
memset( p, 0, len );
size_t i = 0;
while ( i < hexstr_len )
{
char c = hexstr[i];
unsigned char nibble;
if ( c >= '0' && c <= '9' ) nibble = (c - '0');
else if ( c >= 'A' && c <= 'F' ) nibble = ( 10 + (c - 'A') );
else if ( c >= 'a' && c <= 'f' ) nibble = ( 10 + (c - 'a') );
else
{
applog( LOG_ERR, "hex2bin invalid hex" );
return false;
}
p[(i / 2)] |= (nibble << ( (1 - (i % 2) ) * 4) );
i++;
}
return true;
}
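A short usage sketch of the reworked function (the hex string here is made up): the output buffer is zero-filled first, odd-length input is rejected, and input longer than the buffer fails instead of silently truncating:

   unsigned char ntime[4];
   if ( !hex2bin( ntime, "64a0c2f1", sizeof(ntime) ) )
      applog( LOG_ERR, "bad ntime" );
   // on success ntime holds { 0x64, 0xa0, 0xc2, 0xf1 }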
int varint_encode(unsigned char *p, uint64_t n)
@@ -1339,6 +1322,43 @@ inline bool valid_hash( const void *hash, const void *target )
#endif
inline double nbits_to_diff( uint32_t nbits )
{
long double diff;
uint32_t shift = nbits & 0xff;
uint32_t bits = bswap_32( nbits ) & 0x00ffffff;
int shift_off = (int)shift - 29;
// diff = (2**16 - 1) / ( 256**shift_off * bits )
// With uint128 a byte shift is good for 16 <= shift <= 41. As unlikely
// as an out-of-range shift may be, check just in case.
if ( shift_off >= -13 && shift_off <= 12 )
{ // fast
if ( shift_off == 0 )
diff = (long double)0xffff / (long double)bits;
else if ( shift_off < 0 ) // shift < 29
diff = (long double)( (uint128_t)0xffff << ( (-shift_off) *8 ) )
/ (long double)bits;
else // ( shift_off > 0 ) // shift > 29
diff = (long double)0xffff
/ (long double)( (uint128_t)bits << ( shift_off*8 ) );
}
else
{ // slow
int m;
diff = (long double)0xffff / (long double)bits; // base ratio, scaled below
for ( m = shift; m < 29; m++ ) diff *= 256.0;
for ( m = 29; m < shift; m++ ) diff /= 256.0;
}
if ( opt_debug )
applog( LOG_INFO, "nbits %08x: shift %u(%d), bits %06x, diff %8g",
nbits, shift, shift_off, bits, (double)diff );
return (double)diff;
}
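A worked example consistent with the shift/bswap logic above: the difficulty-1 compact target 1d00ffff corresponds to an argument of 0xffff001d (the header bytes read as a little-endian word), so shift = 0x1d = 29, shift_off = 0, bits = 0x00ffff, and diff = 0xffff / 0xffff = 1.0. As a hypothetical call:

   double d1 = nbits_to_diff( 0xffff001dU );   // -> 1.0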
#ifdef WIN32
#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK)
#else
@@ -1371,7 +1391,7 @@ static bool send_line( struct stratum_ctx *sctx, char *s )
{
if ( rc != CURLE_AGAIN )
#else
n = send(sock, s + sent, len, 0);
n = send( sctx->sock, s + sent, len, 0);
if ( n < 0 )
{
if ( !socket_blocks() )
@@ -1379,8 +1399,8 @@ static bool send_line( struct stratum_ctx *sctx, char *s )
return false;
n = 0;
}
sent += n;
len -= n;
sent += n;
len -= n;
}
return true;
@@ -1507,7 +1527,8 @@ out:
return sret;
}
#if LIBCURL_VERSION_NUM >= 0x071101
#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
//#if LIBCURL_VERSION_NUM >= 0x071101
static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose,
struct curl_sockaddr *addr)
{
@@ -1575,7 +1596,8 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
#if LIBCURL_VERSION_NUM >= 0x070f06
curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb);
#endif
#if LIBCURL_VERSION_NUM >= 0x071101
#if LIBCURL_VERSION_NUM >= 0x071101 && LIBCURL_VERSION_NUM < 0x072d00
//#if LIBCURL_VERSION_NUM >= 0x071101
curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb);
curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock);
#endif
@@ -1589,7 +1611,10 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
return false;
}
#if LIBCURL_VERSION_NUM < 0x071101
#if LIBCURL_VERSION_NUM >= 0x072d00
curl_easy_getinfo(curl, CURLINFO_ACTIVESOCKET, &sctx->sock);
#elif LIBCURL_VERSION_NUM < 0x071101
//#if LIBCURL_VERSION_NUM < 0x071101
/* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */
curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock);
#endif
@@ -1885,7 +1910,8 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
// find 0xffff tag
p = (uint8_t*) sctx->job.coinbase + 32;
m = p + 128;
m = p + sctx->job.coinbase_size - 32 - 2;
// m = p + 128;
while (*p != 0xff && p < m) p++;
while (*p == 0xff && p < m) p++;
if (*(p-1) == 0xff && *(p-2) == 0xff) {
@@ -1992,23 +2018,41 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
}
}
if ( merkle_count )
merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
for ( i = 0; i < merkle_count; i++ )
{
const char *s = json_string_value( json_array_get( merkle_arr, i ) );
if ( !s || strlen(s) != 64 )
{
while ( i-- ) free( merkle[i] );
free( merkle );
applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
goto out;
}
merkle[i] = (uchar*) malloc( 32 );
hex2bin( merkle[i], s, 32 );
}
pthread_mutex_lock( &sctx->work_lock );
pthread_mutex_lock( &sctx->work_lock );
if ( merkle_count )
{
if ( merkle_count > sctx->job.merkle_buf_size )
{
for ( i = 0; i < sctx->job.merkle_count; i++ )
free( sctx->job.merkle[i] );
free( sctx->job.merkle );
merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
for ( i = 0; i < merkle_count; i++ )
merkle[i] = (uchar*) malloc( 32 );
sctx->job.merkle_buf_size = merkle_count;
sctx->job.merkle = merkle;
}
for ( i = 0; i < merkle_count; i++ )
{
const char *s = json_string_value( json_array_get( merkle_arr, i ) );
if ( !s || strlen(s) != 64 )
{
for ( int j = 0; j < sctx->job.merkle_buf_size; j++ )
free( sctx->job.merkle[j] );
free( sctx->job.merkle );
sctx->job.merkle_count =
sctx->job.merkle_buf_size = 0;
pthread_mutex_unlock( &sctx->work_lock );
applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
goto out;
}
hex2bin( sctx->job.merkle[i], s, 32 );
}
}
sctx->job.merkle_count = merkle_count;
coinb1_size = strlen( coinb1 ) / 2;
coinb2_size = strlen( coinb2 ) / 2;
@@ -2041,18 +2085,9 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
}
sctx->block_height = getblocheight( sctx );
for ( i = 0; i < sctx->job.merkle_count; i++ )
free( sctx->job.merkle[i] );
free( sctx->job.merkle );
sctx->job.merkle = merkle;
sctx->job.merkle_count = merkle_count;
hex2bin( sctx->job.nbits, nbits, 4 );
hex2bin( sctx->job.ntime, stime, 4 );
sctx->job.clean = clean;
sctx->job.diff = sctx->next_diff;
pthread_mutex_unlock( &sctx->work_lock );
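The notify path above now reuses the per-branch buffers between jobs and only reallocates when the branch count grows past merkle_buf_size, all under work_lock. A condensed, hypothetical model of that reuse pattern with made-up names (allocation failures omitted for brevity):

   #include <stdlib.h>
   typedef struct { unsigned char **branch; int cap; } merkle_buf_t;
   // Grow the array of 32-byte branch buffers only when a job needs more
   // entries than are already allocated; otherwise decode into them in place.
   static void merkle_buf_fit( merkle_buf_t *m, int needed )
   {
      if ( needed <= m->cap ) return;                       // reuse as-is
      for ( int i = 0; i < m->cap; i++ ) free( m->branch[i] );
      free( m->branch );
      m->branch = malloc( needed * sizeof(unsigned char*) );
      for ( int i = 0; i < needed; i++ ) m->branch[i] = malloc( 32 );
      m->cap = needed;
   }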

View File

@@ -17,7 +17,9 @@ export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
# Support for Windows 7 CPU groups, AES sometimes not included in -march
export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
#export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
export DEFAULT_CFLAGS="-maes -O3 -Wall"
export DEFAULT_CFLAGS_OLD="-O3 -Wall"
# make link to local gmp header file.
@@ -127,7 +129,7 @@ make clean || echo clean
# Native with CPU groups enabled
make clean || echo clean
rm -f config.status
CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe