Compare commits

...

6 Commits

Author SHA1 Message Date
Jay D Dee   b2331375a3   v3.9.2.5   2019-06-13 11:20:27 -04:00
Jay D Dee   7fec680835   v3.9.2.4   2019-06-07 23:30:38 -04:00
Jay D Dee   1b0a5aadf6   v3.9.2.3   2019-06-05 12:20:04 -04:00
Jay D Dee   0a3c52810e   v3.9.2.2   2019-06-04 17:14:03 -04:00
Jay D Dee   4d4386a374   v3.9.2.1   2019-06-04 16:56:44 -04:00
Jay D Dee   ce259b915a   v3.9.2     2019-06-03 21:36:33 -04:00
103 changed files with 7941 additions and 8882 deletions

View File

@@ -68,7 +68,8 @@ cpuminer_SOURCES = \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
algo/bmw/sph_bmw.c \
algo/bmw/bmw-hash-4way.c \
algo/bmw/bmw256-hash-4way.c \
algo/bmw/bmw512-hash-4way.c \
algo/bmw/bmw256.c \
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
@@ -162,10 +163,13 @@ cpuminer_SOURCES = \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha2-hash-4way.c \
algo/sha/sha256_hash_11way.c \
algo/sha/sha2.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \
algo/sha/sha256q-4way.c \
algo/sha/sha256q.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \
@@ -262,7 +266,7 @@ cpuminer_SOURCES = \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower.c \
algo/yespower/sha256.c \
algo/yespower/sha256_p.c \
algo/yespower/yespower-opt.c
disable_flags =

View File

@@ -12,7 +12,7 @@ the software, don't use it.
Choose the exe that best matches your CPU's features or use trial and
error to find the fastest one that doesn't crash. Pay attention to
the features listed at cpuminer startup to ensure you are mining at
optimum speed using all the available features.
optimum speed using the best available features.
Architecture names and compile options used are only provided for Intel
Core series. Even the newest Pentium and Celeron CPUs are often missing
@@ -22,8 +22,6 @@ AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Changes in v3.8.4 may have improved compatibility with some of these CPUs.
Exe name Compile flags Arch name

View File

@@ -33,11 +33,39 @@ Requirements
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple is not supported.
64 bit Linux or Windows operating system. Apple and Android are not supported.
Change Log
----------
v3.9.2.5
Fixed 2 regressions: hodl AES detection, x16r invalid shares with AVX2.
More restructuring.
v3.9.2.4
Yet another affinity fix. Hopefully the last one.
v3.9.2.3
Another cpu-affinity fix.
Disabled test code that fails to compile on some CPUs with limited
AVX512 capabilities.
v3.9.2.2
Fixed some day one cpu-affinity issues.
v3.9.2
Added sha256q algo.
Yespower now uses openssl SHA256, but no observable hash rate increase
on Ryzen.
Ongoing rearchitecting.
Lyra2z now hashes 8-way on CPUs with AVX2.
Lyra2 (all including phi2) now runs optimized code with SSE2.
v3.9.1.1
Fixed lyra2v3 AVX and below.
@@ -45,7 +73,7 @@ Fixed lyra2v3 AVX and below.
Compiling on Windows using Cygwin now works. Simply use "./build.sh"
just like on Linux. It isn't portable, therefore the binaries package will
continue to use the existing procedure.
The Cygwin procedfure will be documented in more detail later and will
The Cygwin procedure will be documented in more detail later and will
include a list of packages that need to be installed.
v3.9.1

View File

@@ -210,6 +210,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
case ALGO_SKEIN: register_skein_algo ( gate ); break;
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
@@ -344,9 +345,9 @@ const char* const algo_alias_map[][2] =
{ NULL, NULL }
};
// if arg is a valid alias for a known algo it is updated with the proper name.
// No validation of the algo or alias is done. It is the responsibility of the
// calling function to validate the algo after return.
// if arg is a valid alias for a known algo it is updated with the proper
// name. No validation of the algo or alias is done. It is the responsibility
// of the calling function to validate the algo after return.
void get_algo_alias( char** algo_or_alias )
{
int i;
@@ -361,3 +362,22 @@ void get_algo_alias( char** algo_or_alias )
#undef ALIAS
#undef PROPER
// only for parallel when there are lanes.
bool submit_solution( struct work *work, void *hash,
struct thr_info *thr, int lane )
{
work_set_target_ratio( work, hash );
if ( submit_work( thr, work ) )
{
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr->id, lane );
return true;
}
else
applog( LOG_WARNING, "Failed to submit share." );
return false;
}

View File

@@ -2,8 +2,7 @@
#include <stdbool.h>
#include <stdint.h>
#include "miner.h"
#include "avxdefs.h"
#include "interleave.h"
#include "simd-utils.h"
/////////////////////////////
////
@@ -196,8 +195,9 @@ void four_way_not_tested();
int null_scanhash();
// The one and only, a callback for scanhash.
bool submit_solution( struct work *work, void *hash,
struct thr_info *thr, int lane );
bool submit_work( struct thr_info *thr, const struct work *work_in );
// displays warning

View File

@@ -45,7 +45,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_blake256 256

View File

@@ -16,7 +16,7 @@
#if defined(__SSE4_2__)
#include "avxdefs.h"
#include "simd-utils.h"
#include <stddef.h>
#include <stdint.h>

View File

@@ -41,15 +41,18 @@ extern "C"{
#endif
#include <stddef.h>
#ifdef __AVX2__
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_bmw256 256
#define SPH_SIZE_bmw512 512
#if defined(__SSE2__)
// BMW-256 4 way 32
typedef struct {
__m128i buf[64];
__m128i H[16];
@@ -59,6 +62,60 @@ typedef struct {
typedef bmw_4way_small_context bmw256_4way_context;
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif // __SSE2__
#if defined(__AVX2__)
// BMW-256 8 way 32
typedef struct {
__m256i buf[64];
__m256i H[16];
size_t ptr;
uint32_t bit_count; // assume bit_count fits in 32 bits
} bmw_8way_small_context __attribute__ ((aligned (64)));
typedef bmw_8way_small_context bmw256_8way_context;
void bmw256_8way_init( bmw256_8way_context *ctx );
void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len );
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
#endif
#if defined(__SSE2__)
// BMW-512 2 way 64
typedef struct {
__m128i buf[16];
__m128i H[16];
size_t ptr;
uint64_t bit_count;
} bmw_2way_big_context __attribute__ ((aligned (64)));
typedef bmw_2way_big_context bmw512_2way_context;
void bmw512_2way_init( bmw512_2way_context *ctx );
void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len );
void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );
#endif // __SSE2__
#if defined(__AVX2__)
// BMW-512 4 way 64
typedef struct {
__m256i buf[16];
__m256i H[16];
@@ -68,14 +125,6 @@ typedef struct {
typedef bmw_4way_big_context bmw512_4way_context;
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void bmw512_4way_init(void *cc);
@@ -86,10 +135,10 @@ void bmw512_4way_close(void *cc, void *dst);
void bmw512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#endif // __AVX2__
#ifdef __cplusplus
}
#endif
#endif
#endif // BMW_HASH_H__
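
The split header above now declares separate 4-way, 8-way and 2-way BMW contexts that hash interleaved lanes in one pass. A minimal sketch of how the 4-way BMW-256 context is typically driven; the include name and the exact signatures of the interleave helpers are assumptions based on their uses elsewhere in this diff.

```c
// Hedged sketch: hashing four independent 80-byte inputs with the 4-way
// BMW-256 context declared above. Include name and interleave-helper
// signatures are assumed from their uses elsewhere in this diff.
#include <stdint.h>
#include "bmw-hash-4way.h"   // header shown above (name assumed)
#include "simd-utils.h"

void bmw256_4way_example( uint32_t *out0, uint32_t *out1,
                          uint32_t *out2, uint32_t *out3,
                          uint32_t *in0, uint32_t *in1,
                          uint32_t *in2, uint32_t *in3 )
{
   uint32_t vin [20*4] __attribute__ ((aligned (64))); // 4 interleaved 80-byte inputs
   uint32_t vout[ 8*4] __attribute__ ((aligned (64))); // 4 interleaved 32-byte digests
   bmw256_4way_context ctx;

   mm128_intrlv_4x32( vin, in0, in1, in2, in3, 640 );  // 640 bits per lane
   bmw256_4way_init( &ctx );
   bmw256_4way( &ctx, vin, 80 );                       // length is bytes per lane
   bmw256_4way_close( &ctx, vout );
   mm128_dintrlv_4x32( out0, out1, out2, out3, vout, 256 );
}
```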

File diff suppressed because it is too large.

algo/bmw/bmw512-hash-4way.c (new file, 1109 additions)

File diff suppressed because it is too large.

View File

@@ -4,7 +4,7 @@
#if defined(__AVX2__)
#include <stdint.h>
#include "avxdefs.h"
#include "simd-utils.h"
// 2x128, 2 way parallel SSE2

View File

@@ -13,7 +13,7 @@
#include <stdbool.h>
#include <unistd.h>
#include <memory.h>
#include "avxdefs.h"
#include "simd-utils.h"
#include <stdio.h>
// The result of hashing 10 rounds of initial data which is params and

View File

@@ -11,6 +11,8 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
#define SPH_FUGUE_NOCOPY 1
static const sph_u32 IV224[] = {
SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c),
SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215),

View File

@@ -12,7 +12,7 @@
#include <memory.h>
#include "hash-groestl.h"
#include "miner.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifndef NO_AES_NI

View File

@@ -9,7 +9,7 @@
#include <memory.h>
#include "hash-groestl256.h"
#include "miner.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifndef NO_AES_NI

View File

@@ -40,7 +40,7 @@
#if defined (__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -69,7 +69,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_haval256_5 256

View File

@@ -156,7 +156,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
bool register_hodl_algo( algo_gate_t* gate )
{
#if defined(__AES__)
#if !defined(__AES__)
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif

View File

@@ -44,7 +44,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_jh256 256

View File

@@ -44,7 +44,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_keccak256 256

View File

@@ -24,7 +24,7 @@
#if defined(__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
0UL, 0UL, 0UL, 0xffffffffUL )

View File

@@ -24,7 +24,7 @@
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
#include "avxdefs.h"
#include "simd-utils.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224

View File

@@ -20,7 +20,7 @@
#include <string.h>
#include <emmintrin.h>
#include "avxdefs.h"
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#define MULT2(a0,a1) do \

View File

@@ -44,10 +44,11 @@ void allium_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash32 );
mm256_reinterleave_4x64( vhash64, vhash32, 256 );
mm256_rintrlv_4x32_4x64( vhash64, vhash32, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
@@ -67,73 +68,64 @@ void allium_4way_hash( void *state, const void *input )
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
}
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &allium_4way_ctx.blake );
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
allium_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr, lane );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif
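
The scanhash rewrites in this release (allium above, and the lyra2h/lyra2z/lyra2rev2 variants below) all converge on the same lane-based skeleton: byte-swap and interleave the 80-byte block header once, set the four nonces as one vector each pass, then route any qualifying lane through submit_solution(). A minimal sketch of that skeleton follows, with hash_4way() as a placeholder for the algo-specific hash; helper names and signatures are taken from the surrounding diffs.

```c
// Hedged sketch of the common 4-way scanhash skeleton used in this release.
// hash_4way() is a placeholder for the algo-specific 4-way hash; helper
// names and signatures are assumed from their uses in the surrounding diffs.
#include <immintrin.h>
#include "algo-gate-api.h"
#include "simd-utils.h"

void hash_4way( void *state, const void *input );   // placeholder

int scanhash_example_4way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*4]   __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t Htarg = ptarget[7];
   uint32_t n = first_nonce;
   __m128i *noncev = (__m128i*)vdata + 19;    // word 19 of every lane, aligned
   thr_id = mythr->id;                        // thr_id arg is deprecated

   mm128_bswap_intrlv80_4x32( vdata, pdata ); // bswap + interleave header once
   do
   {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      hash_4way( hash, vdata );               // algo-specific 4-way hash
      pdata[19] = n;
      for ( int lane = 0; lane < 4; lane++ )
         if ( (hash+(lane<<3))[7] <= Htarg
              && fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
         {
            pdata[19] = n + lane;
            submit_solution( work, hash+(lane<<3), mythr, lane );
         }
      n += 4;
   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
   return 0;
}
```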

View File

@@ -70,7 +70,7 @@ void allium_hash(void *state, const void *input)
}
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
@@ -80,6 +80,7 @@ int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x3ffff;

View File

@@ -1,6 +1,43 @@
#include "lyra2-gate.h"
// huge pages
//
// Use MAP_PRIVATE instead
// In register algo:
// replace thread safe whole matrix with a char**
// alloc huge pages matrixsize * threads
// make pointers for each thread, creating an
// array[thread][matrix].
// Each thread can create its own matrix pointer:
// my_matrix = the matrix + ( thread_id * matrix_size )
//
// Compiler version check?
// Fallback?
//
// create a generic utility to map & unmap huge pages.
// ptr = malloc_huge( size );
// Yespower wrapper checks for 64 byte alignment, seems unnecessary as
// it should be aligned to the page boundary. It may be desirable to
// have the matrix size rounded up if necessary to something bigger
// than 64 bytes, say 4 kbytes, a small page size.
// Define some constants for individual parameters and matrix size for
// each algo. Use the parameter constants where appropriate.
// Convert algos that don't yet do so to use dynamic allocation.
// Alloc huge pages globally. If ok each thread will create a pointer to
// its chunk. If fail each thread will use _mm_alloc for itself.
// BLOCK_LEN_BYTES is 768.
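
The notes above call for a generic utility to map and unmap huge pages with a ptr = malloc_huge( size ) interface. A minimal Linux sketch of what that could look like, assuming MAP_HUGETLB is available and falling back to ordinary 64-byte-aligned allocation when the mapping fails; the names and the fallback behaviour are assumptions drawn from the note, not code in this commit.

```c
// Hedged sketch of the proposed malloc_huge()/free_huge() utility.
// Linux-only; MAP_HUGETLB availability and the _mm_malloc fallback
// are assumptions, the commit only describes the idea above.
#include <sys/mman.h>
#include <stdbool.h>
#include <stddef.h>
#include <mm_malloc.h>

static void* malloc_huge( size_t size, bool *is_huge )
{
#if defined(MAP_HUGETLB)
   // size should already be rounded up to a multiple of the huge page size
   void *p = mmap( NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0 );
   if ( p != MAP_FAILED ) { *is_huge = true; return p; }  // page aligned
#endif
   *is_huge = false;
   return _mm_malloc( size, 64 );   // fall back to ordinary aligned memory
}

static void free_huge( void *p, size_t size, bool is_huge )
{
   if ( is_huge ) munmap( p, size );
   else           _mm_free( p );
}
```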
#define LYRA2REV3_NROWS 4
#define LYRA2REV3_NCOLS 4
/*
#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
(LYRA2REV3_NROWS)*8)
*/
#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)<<4)
__thread uint64_t* l2v3_wholeMatrix;
bool lyra2rev3_thread_init()

View File

@@ -5,7 +5,9 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__)
//#if defined(__AVX2__)
#if defined(__SSE2__)
#define LYRA2REV3_4WAY
#endif
@@ -43,14 +45,14 @@ bool register_lyra2rev2_algo( algo_gate_t* gate );
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_4way_ctx();
#else
void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_ctx();
#endif
@@ -61,7 +63,7 @@ bool init_lyra2rev2_ctx();
#define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)
// #define LYRA2Z_8WAY
#define LYRA2Z_8WAY
#endif
@@ -71,21 +73,21 @@ bool init_lyra2rev2_ctx();
void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_8way_thread_init();
#elif defined(LYRA2Z_4WAY)
void lyra2z_4way_hash( void *state, const void *input );
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_4way_thread_init();
#else
void lyra2z_hash( void *state, const void *input );
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_thread_init();
#endif
@@ -102,14 +104,14 @@ bool lyra2z_thread_init();
void lyra2h_4way_hash( void *state, const void *input );
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2h_4way_thread_init();
#else
void lyra2h_hash( void *state, const void *input );
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2h_thread_init();
#endif
@@ -126,14 +128,14 @@ bool register_allium_algo( algo_gate_t* gate );
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_4way_ctx();
#else
void allium_hash( void *state, const void *input );
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_ctx();
#endif
@@ -146,7 +148,7 @@ bool register_phi2_algo( algo_gate_t* gate );
void phi2_hash( void *state, const void *input );
int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_phi2_ctx();
#endif // LYRA2_GATE_H__

View File

@@ -236,7 +236,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
/*
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
@@ -566,7 +566,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
#if defined(__AVX2__)
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
#elif defined(__SSE4_2__)
#elif defined(__SSE2__)
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
#else
memset( wholeMatrix, 0, i );

View File

@@ -36,66 +36,53 @@ void lyra2h_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+32, 32, hash1, 32, hash1,
32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+64, 32, hash2, 32, hash2,
32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+96, 32, hash3, 32, hash3,
32, 16, 16, 16 );
}
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep= vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 20; i++ )
be32enc( &edata[i], pdata[i] );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
lyra2h_4way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( &edata[19], n );
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -36,7 +36,7 @@ void lyra2h_hash( void *state, const void *input )
}
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -45,6 +45,7 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
ptarget[7] = 0x0000ff;

View File

@@ -6,7 +6,7 @@
#include "algo/keccak/sph_keccak.h"
#include "lyra2.h"
#include "algo-gate-api.h"
#include "avxdefs.h"
#include "simd-utils.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#endif
@@ -81,8 +81,8 @@ void lyra2re_hash(void *state, const void *input)
memcpy(state, hashA, 32);
}
int scanhash_lyra2re(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -91,6 +91,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );

View File

@@ -42,10 +42,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
mm256_reinterleave_4x64( vhash64, vhash, 256 );
mm256_rintrlv_4x32_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
@@ -60,10 +62,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
@@ -74,60 +78,55 @@ void lyra2rev2_4way_hash( void *state, const void *input )
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -73,7 +73,7 @@ void lyra2rev2_hash( void *state, const void *input )
}
int scanhash_lyra2rev2(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -82,6 +82,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;

View File

@@ -35,7 +35,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
@@ -55,10 +55,9 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -66,7 +65,6 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
@@ -74,22 +72,13 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
int num_found = 0;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
// Need big endian data
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -100,22 +89,14 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -36,66 +36,51 @@ void lyra2z_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
LYRA2Z( lyra2z_4way_matrix, state , 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+64, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+96, 32, hash3, 32, hash3, 32, 8, 8, 8 );
}
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 20; i++ )
be32enc( &edata[i], pdata[i] );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
lyra2z_4way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2z_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif
@@ -134,8 +119,8 @@ void lyra2z_8way_hash( void *state, const void *input )
blake256_8way( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
mm256_deinterleave_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
mm256_dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
@@ -150,64 +135,49 @@ void lyra2z_8way_hash( void *state, const void *input )
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash1, 32 );
memcpy( state+160, hash2, 32 );
memcpy( state+192, hash3, 32 );
memcpy( state+224, hash1, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 152; // 19*8
__m256i *noncev = (__m256i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_8x32( vdata, pdata );
lyra2z_8way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( noncep+4, n+4 );
be32enc( noncep+5, n+5 );
be32enc( noncep+6, n+6 );
be32enc( noncep+7, n+7 );
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
lyra2z_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}

View File

@@ -3,7 +3,7 @@
#include "lyra2-gate.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "avxdefs.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z_matrix;
@@ -44,7 +44,7 @@ void lyra2z_hash( void *state, const void *input )
}
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -53,6 +53,7 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
ptarget[7] = 0x0000ff;

View File

@@ -1,7 +1,7 @@
#include <memory.h>
#include "algo-gate-api.h"
#include "lyra2.h"
#include "avxdefs.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z330_wholeMatrix;
@@ -16,39 +16,46 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
}
int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
if (opt_benchmark)
ptarget[7] = 0x0000ff;
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
if (opt_benchmark)
ptarget[7] = 0x0000ff;
do {
be32enc(&endiandata[19], nonce);
lyra2z330_hash( hash, endiandata, work->height );
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
do
{
be32enc(&endiandata[19], nonce);
lyra2z330_hash( hash, endiandata, work->height );
if ( hash[7] <= Htarg && fulltest(hash, ptarget) && !opt_benchmark )
{
work_set_target_ratio(work, hash);
pdata[19] = nonce;
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d",
accepted_share_count + rejected_share_count + 1,
mythr->id );
else
applog( LOG_WARNING, "Failed to submit share." );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void lyra2z330_set_target( struct work* work, double job_diff )

View File

@@ -50,11 +50,11 @@ void phi2_hash(void *state, const void *input)
unsigned char _ALIGN(128) hashA[64];
unsigned char _ALIGN(128) hashB[64];
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
phi2_has_roots ? 144 : 80 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
phi2_has_roots ? 144 : 80 );
LYRA2RE( &hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8 );
LYRA2RE( &hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8 );
@@ -63,17 +63,17 @@ void phi2_hash(void *state, const void *input)
sph_jh512_close( &ctx.jh, (void*)hash );
if ( hash[0] & 1 )
{
sph_gost512( &ctx.gost, (const void*)hash, 64 );
{
sph_gost512( &ctx.gost, (const void*)hash, 64 );
sph_gost512_close( &ctx.gost, (void*)hash );
}
else
{
else
{
#if defined(__AES__)
update_final_echo ( &ctx.echo1, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo2, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo1, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo2, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512( &ctx.echo1, (const void*)hash, 64 );
sph_echo512_close( &ctx.echo1, (void*)hash );
@@ -92,42 +92,50 @@ void phi2_hash(void *state, const void *input)
memcpy(state, hash, 32);
}
int scanhash_phi2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[36];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[36];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
if(opt_benchmark){
ptarget[7] = 0x00ff;
}
if(opt_benchmark){
ptarget[7] = 0x00ff;
}
phi2_has_roots = false;
for ( int i=0; i < 36; i++ )
{
be32enc(&endiandata[i], pdata[i]);
if (i >= 20 && pdata[i]) phi2_has_roots = true;
}
phi2_has_roots = false;
for (int i=0; i < 36; i++) {
be32enc(&endiandata[i], pdata[i]);
if (i >= 20 && pdata[i]) phi2_has_roots = true;
}
do {
be32enc( &endiandata[19], n );
phi2_hash( hash, endiandata );
do {
be32enc(&endiandata[19], n);
phi2_hash(hash, endiandata);
if (hash[7] < Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
{
pdata[19] = n;
work_set_target_ratio( work, hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
accepted_share_count + rejected_share_count + 1,
thr_id );
else
applog( LOG_WARNING, "Failed to submit share." );
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 1;
}
n++;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
} while ( n < max_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -51,7 +51,7 @@ inline void initState( uint64_t State[/*16*/] )
state[3] = _mm256_set_epi64x( blake2b_IV[7], blake2b_IV[6],
blake2b_IV[5], blake2b_IV[4] );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
@@ -137,7 +137,7 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
//Squeezes remaining bytes
memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
const int len_m128i = len / 16;
const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
@@ -205,7 +205,7 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;
@@ -273,7 +273,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;
@@ -355,7 +355,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
@@ -494,7 +494,7 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
@@ -694,7 +694,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* in = (__m128i*)rowIn;
__m128i* inout = (__m128i*)rowInOut;
@@ -713,9 +713,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
__m128i* state = (__m128i*)State;
// For the last round in this function not optimized for AVX
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
// uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
// uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
// uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
for ( i = 0; i < nCols; i++ )
{
@@ -750,6 +750,28 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
out[4] = _mm_xor_si128( state[4], in[4] );
out[5] = _mm_xor_si128( state[5], in[5] );
__m128i t0, t1;
t0 = _mm_srli_si128( state[0], 8 );
t1 = _mm_srli_si128( state[1], 8 );
inout[0] = _mm_xor_si128( inout[0],
_mm_or_si128( _mm_slli_si128( state[0], 8 ),
_mm_srli_si128( state[5], 8 ) ) );
inout[1] = _mm_xor_si128( inout[1],
_mm_or_si128( _mm_slli_si128( state[1], 8 ), t0 ) );
t0 = _mm_srli_si128( state[2], 8 );
inout[2] = _mm_xor_si128( inout[2],
_mm_or_si128( _mm_slli_si128( state[2], 8 ), t1 ) );
t1 = _mm_srli_si128( state[3], 8 );
inout[3] = _mm_xor_si128( inout[3],
_mm_or_si128( _mm_slli_si128( state[3], 8 ), t0 ) );
t0 = _mm_srli_si128( state[4], 8 );
inout[4] = _mm_xor_si128( inout[4],
_mm_or_si128( _mm_slli_si128( state[4], 8 ), t1 ) );
inout[5] = _mm_xor_si128( inout[5],
_mm_or_si128( _mm_slli_si128( state[5], 8 ), t0 ) );
/*
ptrWordInOut[0] ^= State[11];
ptrWordInOut[1] ^= State[0];
ptrWordInOut[2] ^= State[1];
@@ -768,7 +790,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
ptrWordIn += BLOCK_LEN_INT64;
//Output: goes to previous column
ptrWordOut -= BLOCK_LEN_INT64;
*/
inout += BLOCK_LEN_M128I;
in += BLOCK_LEN_M128I;
out -= BLOCK_LEN_M128I;
@@ -930,7 +952,7 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined(__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)rowIn;

View File

@@ -23,7 +23,7 @@
#define SPONGE_H_
#include <stdint.h>
#include "avxdefs.h"
#include "simd-utils.h"
#if defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(32)))
@@ -59,7 +59,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a), 32 ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
a = _mm256_add_epi64( a, b ); \
@@ -91,7 +91,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
#elif defined(__SSE4_2__)
#elif defined(__SSE2__)
// process 2 columns in parallel
// returns void, all args updated
@@ -132,7 +132,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
#endif // AVX2 else SSE4_2
#endif // AVX2 else SSE2
// Scalar
//Blake2b's G function

View File

@@ -6,7 +6,7 @@
#if defined(__SSE4_2__)
#include "avxdefs.h"
#include "simd-utils.h"
typedef struct
{

View File

@@ -42,7 +42,7 @@
#include <stddef.h>
#include "sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#if defined(__SSE2__)
//#if defined(__SSE4_2__)
@@ -61,6 +61,26 @@ void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
/*
// SHA-256 7 way hybrid
// Combines SSE, MMX and scalar data to do 4 + 2 + 1 parallel.
typedef struct {
__m128i bufx[64>>2];
__m128i valx[8];
__m64 bufy[64>>2];
__m64 valy[8];
uint32_t bufz[64>>2];
uint32_t valz[8];
uint32_t count_high, count_low;
} sha256_7way_context;
void sha256_7way_init( sha256_7way_context *ctx );
void sha256_7way( sha256_7way_context *ctx, const void *datax,
void *datay, void *dataz, size_t len );
void sha256_7way_close( sha256_7way_context *ctx, void *dstx, void *dstyx,
void *dstz );
*/
#if defined (__AVX2__)
// SHA-256 8 way
@@ -89,6 +109,24 @@ void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
void sha512_4way_close( sha512_4way_context *sc, void *dst );
#endif
#endif
#endif
// SHA-256 11 way hybrid
// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
typedef struct {
__m256i bufx[64>>2];
__m256i valx[8];
__m64 bufy[64>>2];
__m64 valy[8];
uint32_t bufz[64>>2];
uint32_t valz[8];
uint32_t count_high, count_low;
} sha256_11way_context;
void sha256_11way_init( sha256_11way_context *ctx );
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
const void *datay, const void *dataz, size_t len );
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
void *dstz );
#endif // __AVX2__
#endif // __SSE2__
#endif // SHA256_4WAY_H__

View File

@@ -0,0 +1,538 @@
#if 0
#include <stddef.h>
#include <string.h>
#include "sha2-hash-4way.h"
#if defined(__AVX2__)
// naming convention for variables and macros
// VARx: AVX2 8 way 32 bit
// VARy: MMX 2 way 32 bit
// VARz: scalar integer 32 bit
static const uint32_t H256[8] =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
static const uint32_t K256[64] =
{
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#define CHx(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define CHy(X, Y, Z) \
_mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z )
#define CHz(X, Y, Z) ((( (Y) ^ (Z) ) & (X) ) ^ (Z) )
#define MAJx(X, Y, Z) \
_mm256_or_si256( _mm256_and_si256( X, Y ), \
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
#define MAJy(X, Y, Z) \
_mm_or_si64( _mm_and_si64( X, Y ), \
_mm_and_si64( _mm_or_si64( X, Y ), Z ) )
#define MAJz(X, Y, Z) ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) )
#define BSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,2), mm256_ror_32(x,13) ), _mm256_srli_epi32(x,22) )
#define BSG2_0y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) )
#define BSG2_0z(x) ( u32_ror_32(x,2) ^ u32_ror_32(x,13) ^ ((x)>>22) )
#define BSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,6), mm256_ror_32(x,11) ), _mm256_srli_epi32(x,25) )
#define BSG2_1y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) )
#define BSG2_1z(x) ( u32_ror_32(x,6) ^ u32_ror_32(x,11) ^ ((x)>>25) )
#define SSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) )
#define SSG2_0y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )
#define SSG2_0z(x) (( u32_ror_32(x,7) ^ u32_ror_32(x,18) ) ^ ((x)>>3) )
#define SSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) )
#define SSG2_1y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )
#define SSG2_1z(x) ( u32_ror_32(x,17) ^ u32_ror_32(x,19) ^ ((x)>>10) )
#define SHA2x_MEXP( a, b, c, d ) \
_mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] )
#define SHA2y_MEXP( a, b, c, d ) \
_mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] )
#define SHA2z_MEXP( a, b, c, d ) \
( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] )
#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
do { \
__m256i T1x, T2x; \
__m64 T1y, T2y; \
uint32_t T1z, T2z; \
T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
_mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
_mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \
T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
_mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
_mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \
T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \
T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
Dx = _mm256_add_epi32( Dx, T1x ); \
Dy = _mm_add_pi32( Dy, T1y ); \
Dz = Dz + T1z; \
Hx = _mm256_add_epi32( T1x, T2x ); \
Hy = _mm_add_pi32( T1y, T2y ); \
Hz = T1z + T2z; \
} while (0)
void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
uint32_t *inz, uint32_t rz[8] )
{
__m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
__m256i Wx[16];
__m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy;
__m64 Wy[16];
uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz;
uint32_t Wz[16];
Wx[ 0] = mm256_bswap_32( inx[ 0] );
Wy[ 0] = mm64_bswap_32( iny[ 0] );
Wz[ 0] = bswap_32( inz[ 0] );
Wx[ 1] = mm256_bswap_32( inx[ 1] );
Wy[ 1] = mm64_bswap_32( iny[ 1] );
Wz[ 1] = bswap_32( inz[ 1] );
Wx[ 2] = mm256_bswap_32( inx[ 2] );
Wy[ 2] = mm64_bswap_32( iny[ 2] );
Wz[ 2] = bswap_32( inz[ 2] );
Wx[ 3] = mm256_bswap_32( inx[ 3] );
Wy[ 3] = mm64_bswap_32( iny[ 3] );
Wz[ 3] = bswap_32( inz[ 3] );
Wx[ 4] = mm256_bswap_32( inx[ 4] );
Wy[ 4] = mm64_bswap_32( iny[ 4] );
Wz[ 4] = bswap_32( inz[ 4] );
Wx[ 5] = mm256_bswap_32( inx[ 5] );
Wy[ 5] = mm64_bswap_32( iny[ 5] );
Wz[ 5] = bswap_32( inz[ 5] );
Wx[ 6] = mm256_bswap_32( inx[ 6] );
Wy[ 6] = mm64_bswap_32( iny[ 6] );
Wz[ 6] = bswap_32( inz[ 6] );
Wx[ 7] = mm256_bswap_32( inx[ 7] );
Wy[ 7] = mm64_bswap_32( iny[ 7] );
Wz[ 7] = bswap_32( inz[ 7] );
Wx[ 8] = mm256_bswap_32( inx[ 8] );
Wy[ 8] = mm64_bswap_32( iny[ 8] );
Wz[ 8] = bswap_32( inz[ 8] );
Wx[ 9] = mm256_bswap_32( inx[ 9] );
Wy[ 9] = mm64_bswap_32( iny[ 9] );
Wz[ 9] = bswap_32( inz[ 9] );
Wx[10] = mm256_bswap_32( inx[10] );
Wy[10] = mm64_bswap_32( iny[10] );
Wz[10] = bswap_32( inz[10] );
Wx[11] = mm256_bswap_32( inx[11] );
Wy[11] = mm64_bswap_32( iny[11] );
Wz[11] = bswap_32( inz[11] );
Wx[12] = mm256_bswap_32( inx[12] );
Wy[12] = mm64_bswap_32( iny[12] );
Wz[12] = bswap_32( inz[12] );
Wx[13] = mm256_bswap_32( inx[13] );
Wy[13] = mm64_bswap_32( iny[13] );
Wz[13] = bswap_32( inz[13] );
Wx[14] = mm256_bswap_32( inx[14] );
Wy[14] = mm64_bswap_32( iny[14] );
Wz[14] = bswap_32( inz[14] );
Wx[15] = mm256_bswap_32( inx[15] );
Wy[15] = mm64_bswap_32( iny[15] );
Wz[15] = bswap_32( inz[15] );
Ax = rx[0]; Ay = ry[0]; Az = rz[0];
Bx = rx[1]; By = ry[1]; Bz = rz[1];
Cx = rx[2]; Cy = ry[2]; Cz = rz[2];
Dx = rx[3]; Dy = ry[3]; Dz = rz[3];
Ex = rx[4]; Ey = ry[4]; Ez = rz[4];
Fx = rx[5]; Fy = ry[5]; Fz = rz[5];
Gx = rx[6]; Gy = ry[6]; Gz = rz[6];
Hx = rx[7]; Hy = ry[7]; Hz = rz[7];
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, 0 );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, 0 );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, 0 );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, 0 );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, 0 );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, 0 );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, 0 );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, 0 );
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, 0 );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, 0 );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, 0 );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, 0 );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, 0 );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, 0 );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, 0 );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
Wx[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
Wy[ 0] = SHA2y_MEXP( 14, 9, 1, 0 );
Wz[ 0] = SHA2z_MEXP( 14, 9, 1, 0 );
Wx[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
Wy[ 1] = SHA2y_MEXP( 15, 10, 2, 1 );
Wz[ 1] = SHA2z_MEXP( 15, 10, 2, 1 );
Wx[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
Wy[ 2] = SHA2y_MEXP( 0, 11, 3, 2 );
Wz[ 2] = SHA2z_MEXP( 0, 11, 3, 2 );
Wx[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
Wy[ 3] = SHA2y_MEXP( 1, 12, 4, 3 );
Wz[ 3] = SHA2z_MEXP( 1, 12, 4, 3 );
Wx[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
Wy[ 4] = SHA2y_MEXP( 2, 13, 5, 4 );
Wz[ 4] = SHA2z_MEXP( 2, 13, 5, 4 );
Wx[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
Wy[ 5] = SHA2y_MEXP( 3, 14, 6, 5 );
Wz[ 5] = SHA2z_MEXP( 3, 14, 6, 5 );
Wx[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
Wy[ 6] = SHA2y_MEXP( 4, 15, 7, 6 );
Wz[ 6] = SHA2z_MEXP( 4, 15, 7, 6 );
Wx[ 7] = SHA2x_MEXP( 5, 0, 8, 7);
Wy[ 7] = SHA2y_MEXP( 5, 0, 8, 7);
Wz[ 7] = SHA2z_MEXP( 5, 0, 8, 7);
Wx[ 8] = SHA2x_MEXP( 6, 1, 9, 8);
Wy[ 8] = SHA2y_MEXP( 6, 1, 9, 8);
Wz[ 8] = SHA2z_MEXP( 6, 1, 9, 8);
Wx[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
Wy[ 9] = SHA2y_MEXP( 7, 2, 10, 9);
Wz[ 9] = SHA2z_MEXP( 7, 2, 10, 9);
Wx[10] = SHA2x_MEXP( 8, 3, 11, 10 );
Wy[10] = SHA2y_MEXP( 8, 3, 11, 10);
Wz[10] = SHA2z_MEXP( 8, 3, 11, 10);
Wx[11] = SHA2x_MEXP( 9, 4, 12, 11);
Wy[11] = SHA2y_MEXP( 9, 4, 12, 11);
Wz[11] = SHA2z_MEXP( 9, 4, 12, 11 );
Wx[12] = SHA2x_MEXP( 10, 5, 13, 12 );
Wy[12] = SHA2y_MEXP( 10, 5, 13, 12 );
Wz[12] = SHA2z_MEXP( 10, 5, 13, 12 );
Wx[13] = SHA2x_MEXP( 11, 6, 14, 13 );
Wy[13] = SHA2y_MEXP( 11, 6, 14, 13 );
Wz[13] = SHA2z_MEXP( 11, 6, 14, 13 );
Wx[14] = SHA2x_MEXP( 12, 7, 15, 14 );
Wy[14] = SHA2y_MEXP( 12, 7, 15, 14 );
Wz[14] = SHA2z_MEXP( 12, 7, 15, 14 );
Wx[15] = SHA2x_MEXP( 13, 8, 0, 15 );
Wy[15] = SHA2y_MEXP( 13, 8, 0, 15 );
Wz[15] = SHA2z_MEXP( 13, 8, 0, 15 );
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, j );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, j );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, j );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, j );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, j );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, j );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, j );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, j );
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, j );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, j );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j );
}
rx[0] = _mm256_add_epi32( rx[0], Ax );
ry[0] = _mm_add_pi32( ry[0], Ay );
rz[0] = rz[0]+ Az;
rx[1] = _mm256_add_epi32( rx[1], Bx );
ry[1] = _mm_add_pi32( ry[1], By );
rz[1] = rz[1]+ Bz;
rx[2] = _mm256_add_epi32( rx[2], Cx );
ry[2] = _mm_add_pi32( ry[2], Cy );
rz[2] = rz[2]+ Cz;
rx[3] = _mm256_add_epi32( rx[3], Dx );
ry[3] = _mm_add_pi32( ry[3], Dy );
rz[3] = rz[3]+ Dz;
rx[4] = _mm256_add_epi32( rx[4], Ex );
ry[4] = _mm_add_pi32( ry[4], Ey );
rz[4] = rz[4]+ Ez;
rx[5] = _mm256_add_epi32( rx[5], Fx );
ry[5] = _mm_add_pi32( ry[5], Fy );
rz[5] = rz[5]+ Fz;
rx[6] = _mm256_add_epi32( rx[6], Gx );
ry[6] = _mm_add_pi32( ry[6], Gy );
rz[6] = rz[6]+ Gz;
rx[7] = _mm256_add_epi32( rx[7], Hx );
ry[7] = _mm_add_pi32( ry[7], Hy );
rz[7] = rz[7]+ Hz;
}
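One call to sha256_11way_round therefore advances eleven independent messages at once: eight in the __m256i lanes, two in the __m64 lanes and one scalar. A small sketch, reusing the bswap helpers already used in this file, of how a caller could assign eleven consecutive nonces to those lanes (scanhash_sha256t_11way further down does exactly this inline; the helper itself is hypothetical):
// Hypothetical helper, mirroring the nonce assignment in scanhash_sha256t_11way.
static inline void set_11way_nonces( __m256i *noncex, __m64 *noncey,
                                     uint32_t *noncez, uint32_t n )
{
   *noncex = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
                                               n+3, n+2, n+1, n ) );  // lanes 0..7
   *noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );               // lanes 8..9
   *noncez = bswap_32( n+10 );                                        // lane 10
}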
void sha256_11way_init( sha256_11way_context *ctx )
{
ctx->count_high = ctx->count_low = 0;
ctx->valx[0] = _mm256_set1_epi32( H256[0] );
ctx->valy[0] = _mm_set1_pi32( H256[0] );
ctx->valx[1] = _mm256_set1_epi32( H256[1] );
ctx->valy[1] = _mm_set1_pi32( H256[1] );
ctx->valx[2] = _mm256_set1_epi32( H256[2] );
ctx->valy[2] = _mm_set1_pi32( H256[2] );
ctx->valx[3] = _mm256_set1_epi32( H256[3] );
ctx->valy[3] = _mm_set1_pi32( H256[3] );
ctx->valx[4] = _mm256_set1_epi32( H256[4] );
ctx->valy[4] = _mm_set1_pi32( H256[4] );
ctx->valx[5] = _mm256_set1_epi32( H256[5] );
ctx->valy[5] = _mm_set1_pi32( H256[5] );
ctx->valx[6] = _mm256_set1_epi32( H256[6] );
ctx->valy[6] = _mm_set1_pi32( H256[6] );
ctx->valx[7] = _mm256_set1_epi32( H256[7] );
ctx->valy[7] = _mm_set1_pi32( H256[7] );
memcpy( ctx->valz, H256, 32 );
}
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
const void *datay, const void *dataz, size_t len )
{
__m256i *vdatax = (__m256i*) datax;
__m64 *vdatay = (__m64*) datay;
uint32_t *idataz = (uint32_t*)dataz;
size_t ptr;
const int buf_size = 64;
ptr = (unsigned)ctx->count_low & (buf_size - 1U);
while ( len > 0 )
{
size_t clen;
uint32_t clow, clow2;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 );
memcpy_m64( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
memcpy ( ctx->bufz + ptr, idataz + ptr, clen );
ptr += clen;
len -= clen;
if ( ptr == buf_size )
{
sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy,
ctx->bufz, ctx->valz );
ptr = 0;
}
clow = ctx->count_low;
clow2 = clow + clen;
ctx->count_low = clow2;
if ( clow2 < clow )
ctx->count_high++;
}
}
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
void *dstz)
{
unsigned ptr, u;
uint32_t low, high;
const int buf_size = 64;
const int pad = buf_size - 8;
ptr = (unsigned)ctx->count_low & (buf_size - 1U);
ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 );
ctx->bufz[ ptr>>2 ] = 0x80;
ptr += 4;
if ( ptr > pad )
{
memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
memset_zero_m64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
memset( ctx->bufz + (ptr>>2), 0, (buf_size - ptr) >> 2 );
sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy,
ctx->bufz, ctx->valz );
memset_zero_256( ctx->bufx, pad >> 2 );
memset_zero_m64( ctx->bufy, pad >> 2 );
memset( ctx->bufz, 0, pad >> 2 );
}
else
{
memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 );
memset_zero_m64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 );
memset( ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 );
}
low = ctx->count_low;
high = (ctx->count_high << 3) | (low >> 29);
low = low << 3;
ctx->bufx[ pad >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( high ) );
ctx->bufy[ pad >> 2 ] =
mm64_bswap_32( _mm_set1_pi32( high ) );
ctx->bufz[ pad >> 2 ] =
bswap_32( high );
ctx->bufx[ ( pad+4 ) >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( low ) );
ctx->bufy[ ( pad+4 ) >> 2 ] =
mm64_bswap_32( _mm_set1_pi32( low ) );
ctx->bufz[ ( pad+4 ) >> 2 ] =
bswap_32( low );
sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy,
ctx->bufz, ctx->valz );
for ( u = 0; u < 8; u ++ )
{
casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] );
casti_m64 ( dsty, u ) = mm64_bswap_32( ctx->valy[u] );
((uint32_t*)dstz)[u] = bswap_32( ctx->valz[u] );
}
}
#endif
#endif // 0

188
algo/sha/sha256q-4way.c Normal file
View File

@@ -0,0 +1,188 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha2-hash-4way.h"
#if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
void sha256q_8way_hash( void* output, const void* input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
sha256_8way_context ctx;
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
sha256_8way( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}
int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
// Need big endian data
mm256_bswap_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
pdata[19] = n;
sha256q_8way_hash( hash, vdata );
uint32_t *hash7 = &(hash[7<<3]);
for ( int lane = 0; lane < 8; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
#if defined(SHA256T_4WAY)
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
void sha256q_4way_hash( void* output, const void* input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
sha256_4way_context ctx;
memcpy( &ctx, &sha256_ctx4, sizeof ctx );
sha256_4way( &ctx, input + (64<<2), 16 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
}
int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
mm128_bswap_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way( &sha256_ctx4, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
pdata[19] = n;
sha256q_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

113
algo/sha/sha256q.c Normal file
View File

@@ -0,0 +1,113 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <openssl/sha.h>
static __thread SHA256_CTX sha256q_ctx __attribute__ ((aligned (64)));
void sha256q_midstate( const void* input )
{
SHA256_Init( &sha256q_ctx );
SHA256_Update( &sha256q_ctx, input, 64 );
}
void sha256q_hash( void* output, const void* input )
{
uint32_t _ALIGN(64) hash[16];
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
SHA256_CTX ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx );
SHA256_Update( &ctx, input + midlen, tail );
SHA256_Final( (unsigned char*)hash, &ctx );
SHA256_Init( &ctx );
SHA256_Update( &ctx, hash, 32 );
SHA256_Final( (unsigned char*)hash, &ctx );
SHA256_Init( &ctx );
SHA256_Update( &ctx, hash, 32 );
SHA256_Final( (unsigned char*)hash, &ctx );
SHA256_Init( &ctx );
SHA256_Update( &ctx, hash, 32 );
SHA256_Final( (unsigned char*)hash, &ctx );
memcpy( output, hash, 32 );
}
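The midstate trick above caches the SHA-256 state after the first 64 bytes of the 80-byte header, so each nonce only pays for the 16-byte tail plus padding in the first pass. Functionally sha256q is just four chained SHA-256 calls; a reference sketch without the midstate optimization, using the OpenSSL one-shot API (sha256q_reference is illustrative, not part of the source):
// Reference only: four chained SHA-256 passes over an 80-byte header.
static void sha256q_reference( void *output, const void *input )
{
   unsigned char hash[32];
   SHA256( (const unsigned char*)input, 80, hash );  // pass 1: full header
   SHA256( hash, 32, hash );                         // pass 2
   SHA256( hash, 32, hash );                         // pass 3
   SHA256( hash, 32, hash );                         // pass 4
   memcpy( output, hash, 32 );
}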
int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
#ifdef _MSC_VER
uint32_t __declspec(align(32)) hash64[8];
#else
uint32_t hash64[8] __attribute__((aligned(32)));
#endif
uint32_t endiandata[32];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need bigendian data...
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
sha256q_midstate( endiandata );
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
sha256q_hash( hash64, endiandata );
if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) )
{
work_set_target_ratio( work, hash64 );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
accepted_share_count + rejected_share_count + 1,
thr_id );
else
applog( LOG_WARNING, "Failed to submit share." );
*hashes_done = n - first_nonce + 1;
}
} while ( n < max_nonce && !work_restart[thr_id].restart );
break;
}
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
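All of the scanhash loops above pair an htmax threshold with a 32-bit mask so that most candidate hashes can be rejected with a single AND on hash word 7 before the full 256-bit fulltest runs. For practical targets the pairing is roughly equivalent to this small sketch (select_mask is illustrative only):
// Pick the cheapest pre-filter mask for a given high target word. A hash can
// only beat the target if the bits of hash[7] above the magnitude of Htarg are
// all zero, so ( hash[7] & mask ) == 0 is a necessary condition and a cheap early-out.
static inline uint32_t select_mask( uint32_t Htarg )
{
   if ( Htarg == 0 )      return 0xFFFFFFFF;
   if ( Htarg <= 0xF )    return 0xFFFFFFF0;
   if ( Htarg <= 0xFF )   return 0xFFFFFF00;
   if ( Htarg <= 0xFFF )  return 0xFFFFF000;
   if ( Htarg <= 0xFFFF ) return 0xFFFF0000;
   return 0;              // large target: no cheap filter, run fulltest on everything
}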

View File

@@ -5,6 +5,137 @@
#include <stdio.h>
#include "sha2-hash-4way.h"
#if defined(SHA256T_11WAY)
static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64)));
void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
const void *inpy, const void*inpz )
{
uint32_t hashx[8*8] __attribute__ ((aligned (64)));
uint32_t hashy[8*2] __attribute__ ((aligned (64)));
uint32_t hashz[8] __attribute__ ((aligned (64)));
sha256_11way_context ctx;
const void *inpx64 = inpx+(64<<3);
const void *inpy64 = inpy+(64<<1);
const void *inpz64 = inpz+ 64;
memcpy( &ctx, &sha256_ctx11, sizeof ctx );
sha256_11way_update( &ctx, inpx64, inpy64, inpz64, 16 );
sha256_11way_close( &ctx, hashx, hashy, hashz );
sha256_11way_init( &ctx );
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
sha256_11way_close( &ctx, hashx, hashy, hashz );
sha256_11way_init( &ctx );
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
sha256_11way_close( &ctx, outx, outy, outz );
}
int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t datax[20*8] __attribute__ ((aligned (64)));
uint32_t datay[20*2] __attribute__ ((aligned (32)));
uint32_t dataz[20] __attribute__ ((aligned (32)));
uint32_t hashx[8*8] __attribute__ ((aligned (32)));
uint32_t hashy[8*2] __attribute__ ((aligned (32)));
uint32_t hashz[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncex = (__m256i*) datax + 19;
__m64 *noncey = (__m64*) datay + 19;
uint32_t *noncez = (uint32_t*)dataz + 19;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
int i;
const uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
// Use dataz (scalar) to stage bswapped data for the vectors.
casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_intrlv_8x32( datax, dataz, dataz, dataz, dataz,
dataz, dataz, dataz, dataz, 640 );
mm64_interleave_2x32( datay, dataz, dataz, 640 );
sha256_11way_init( &sha256_ctx11 );
sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncex = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
*noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
*noncez = bswap_32( n+10 );
pdata[19] = n;
sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );
if ( opt_benchmark ) { n += 11; continue; }
hash7 = &(hashx[7<<3]);
for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
{
// deinterleave hash for lane
mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + i;
submit_solution( work, lane_hash, mythr, i );
}
}
hash7 = &(hashy[7<<1]);
for ( i = 0; i < 2; i++ ) if ( !( hash7[ i ] & mask ) )
{
mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + 8 + i;
submit_solution( work, lane_hash, mythr, i+8 );
}
}
if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
{
pdata[19] = n+10;
submit_solution( work, hashz, mythr, 10 );
}
n += 11;
} while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
#if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
@@ -25,15 +156,15 @@ void sha256t_8way_hash( void* output, const void* input )
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t edata[20] __attribute__ ((aligned (32)));;
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -56,12 +187,7 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
0 };
// Need big endian data
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
@@ -70,40 +196,25 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
pdata[19] = n;
*noncev = mm256_bswap_32( _mm256_set_epi32(
n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
pdata[19] = n;
sha256t_8way_hash( hash, vdata );
uint32_t *hash7 = &(hash[7<<3]);
for ( int lane = 0; lane < 8; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
@@ -130,7 +241,6 @@ void sha256t_4way_hash( void* output, const void* input )
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
}
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -138,9 +248,8 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t edata[20] __attribute__ ((aligned (32)));;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -155,20 +264,14 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way( &sha256_ctx4, vdata, 64 );
@@ -177,7 +280,7 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
pdata[19] = n;
pdata[19] = n;
sha256t_4way_hash( hash, vdata );
@@ -185,26 +288,16 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( !( hash7[ lane ] & mask ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 4;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -3,15 +3,15 @@
bool register_sha256t_algo( algo_gate_t* gate )
{
#if defined(SHA256T_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_8way;
gate->hash = (void*)&sha256t_8way_hash;
#elif defined(SHA256T_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_4way;
gate->hash = (void*)&sha256t_4way_hash;
#else
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t;
gate->hash = (void*)&sha256t_hash;
#endif
@@ -19,3 +19,19 @@ bool register_sha256t_algo( algo_gate_t* gate )
return true;
}
bool register_sha256q_algo( algo_gate_t* gate )
{
#if defined(SHA256T_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q_4way;
gate->hash = (void*)&sha256q_4way_hash;
#else
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q;
gate->hash = (void*)&sha256q_hash;
#endif
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}
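Both registration functions follow the same algo_gate pattern: select the widest implementation that was compiled in, publish its scanhash and hash entry points, and declare which instruction-set optimizations it can exploit. A hedged sketch of that pattern for a hypothetical new algo (all myalgo names are placeholders, not part of the source):
// Illustrative only: the registration pattern used by register_sha256t_algo
// and register_sha256q_algo above.
bool register_myalgo_algo( algo_gate_t* gate )
{
#if defined(MYALGO_4WAY)
   gate->optimizations = SSE2_OPT | AVX2_OPT;          // vectorized path
   gate->scanhash      = (void*)&scanhash_myalgo_4way;
   gate->hash          = (void*)&myalgo_4way_hash;
#else
   gate->optimizations = SSE2_OPT;                     // scalar fallback
   gate->scanhash      = (void*)&scanhash_myalgo;
   gate->hash          = (void*)&myalgo_hash;
#endif
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
}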

View File

@@ -6,7 +6,6 @@
// Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#define SHA256T_4WAY
#endif
@@ -15,26 +14,35 @@
#endif
#endif
bool register_blake2s_algo( algo_gate_t* gate );
bool register_sha256t_algo( algo_gate_t* gate );
bool register_sha256q_algo( algo_gate_t* gate );
#if defined(SHA256T_8WAY)
void sha256t_8way_hash( void *output, const void *input );
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_8way_hash( void *output, const void *input );
int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#elif defined(SHA256T_4WAY)
#if defined(SHA256T_4WAY)
void sha256t_4way_hash( void *output, const void *input );
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha256q_4way_hash( void *output, const void *input );
int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void sha256t_hash( void *output, const void *input );
int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void sha256q_hash( void *output, const void *input );
int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -70,8 +70,11 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
};
// we need bigendian data...
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
sha256t_midstate( endiandata );
@@ -87,7 +90,13 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
return true;
work_set_target_ratio( work, hash64 );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
accepted_share_count + rejected_share_count + 1,
thr_id );
else
applog( LOG_WARNING, "Failed to submit share." );
}
} while ( n < max_nonce && !work_restart[thr_id].restart );
break;

View File

@@ -40,7 +40,7 @@
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -346,7 +346,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
memcpy( buf + ptr, data, clen );
data = (const unsigned char *)data + clen;
ptr += clen;
len -= clen >> 1;
len -= (clen >> 1);
if ( ptr == sizeof ctx->buf )
{
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
@@ -365,16 +365,8 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
}
uint32_t vp = ptr>>5;
// Terminating byte then zero pad
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
// Zero pad full vectors up to count
for ( ; vp < 6; vp++ )
casti_m256i( buf, vp ) = m256_zero;
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
// Count is misaligned to 16 bits and straddles a vector.
// Count is misaligned to 16 bits and straddles 2 vectors.
// Use u32 overlay to stage then u16 to load buf.
union
{
@@ -387,6 +379,18 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
if ( vp == 0 ) // empty buf, xevan.
{
casti_m256i( buf, 0 ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
memset_zero_256( (__m256i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
}
else // half full buf, everyone else.
{
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
memset_zero_256( (__m256i*)buf + vp, 6 - vp );
}
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
count.u16[0], 0,0,0,0,0,0,0 );
casti_m256i( buf, 7 ) = _mm256_set_epi16(

View File

@@ -3,7 +3,7 @@
#if defined(__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
typedef struct {
unsigned char buf[128<<1];

View File

@@ -36,7 +36,7 @@
#ifdef __AES__
#include "sph_shavite.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -5,7 +5,7 @@
#if defined(__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
typedef struct {
uint32_t A[ 32*2 ] __attribute__((aligned(64)));

View File

@@ -17,13 +17,13 @@ void skeinhash_4way( void *state, const void *input )
skein512_4way( &ctx_skein, input, 80 );
skein512_4way_close( &ctx_skein, vhash64 );
mm256_reinterleave_4x32( vhash32, vhash64, 512 );
mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhash32, 64 );
sha256_4way_close( &ctx_sha256, state );
mm128_deinterleave_4x32( state, state+32, state+64, state+96,
mm128_dintrlv_4x32( state, state+32, state+64, state+96,
vhash32, 256 );
}
@@ -48,7 +48,7 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
swab32_array( edata, pdata, 20 );
mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep = vdata + 73; // 9*8 + 1

View File

@@ -49,7 +49,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
// Output size in bits
#define SPH_SIZE_skein256 256

View File

@@ -59,7 +59,7 @@
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C" {

View File

@@ -52,7 +52,7 @@
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
/**
* Output size (in bits) for WHIRLPOOL.

View File

@@ -49,10 +49,10 @@ void polytimos_4way_hash( void *output, const void *input )
// Need to convert from 64 bit interleaved to 32 bit interleaved.
uint32_t vhash32[16*4];
mm256_reinterleave_4x32( vhash32, vhash, 512 );
mm256_rintrlv_4x64_4x32( vhash32, vhash, 512 );
shabal512_4way( &ctx.shabal, vhash32, 64 );
shabal512_4way_close( &ctx.shabal, vhash32 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
@@ -66,13 +66,13 @@ void polytimos_4way_hash( void *output, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );

View File

@@ -2,7 +2,7 @@
#define X16R_GATE_H__ 1
#include "algo-gate-api.h"
#include "avxdefs.h"
#include "simd-utils.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)

View File

@@ -299,7 +299,7 @@ extern void hmq1725hash(void *state, const void *input)
}
int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
@@ -307,6 +307,7 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
//const uint32_t Htarg = ptarget[7];
//we need bigendian data...

View File

@@ -25,7 +25,8 @@
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
typedef struct {
union _sonoa_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
@@ -43,31 +44,10 @@ typedef struct {
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
} sonoa_4way_ctx_holder;
sonoa_4way_ctx_holder sonoa_4way_ctx __attribute__ ((aligned (64)));
void init_sonoa_4way_ctx()
{
blake512_4way_init( &sonoa_4way_ctx.blake );
bmw512_4way_init( &sonoa_4way_ctx.bmw );
init_groestl( &sonoa_4way_ctx.groestl, 64 );
skein512_4way_init( &sonoa_4way_ctx.skein );
jh512_4way_init( &sonoa_4way_ctx.jh );
keccak512_4way_init( &sonoa_4way_ctx.keccak );
luffa_2way_init( &sonoa_4way_ctx.luffa, 512 );
cube_2way_init( &sonoa_4way_ctx.cube, 512, 16, 32 );
shavite512_2way_init( &sonoa_4way_ctx.shavite );
simd_2way_init( &sonoa_4way_ctx.simd, 512 );
init_echo( &sonoa_4way_ctx.echo, 512 );
hamsi512_4way_init( &sonoa_4way_ctx.hamsi );
sph_fugue512_init( &sonoa_4way_ctx.fugue );
shabal512_4way_init( &sonoa_4way_ctx.shabal );
sph_whirlpool_init( &sonoa_4way_ctx.whirlpool );
sha512_4way_init( &sonoa_4way_ctx.sha512 );
haval256_5_4way_init( &sonoa_4way_ctx.haval );
};
typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
void sonoa_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -77,19 +57,21 @@ void sonoa_4way_hash( void *state, const void *input )
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) );
sonoa_4way_context_overlay ctx;
// 1
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
@@ -98,38 +80,46 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
@@ -144,13 +134,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 2
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -161,7 +151,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -175,7 +165,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -197,8 +187,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -213,87 +203,90 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
// 3
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
@@ -307,13 +300,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 4
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -324,7 +317,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -338,7 +331,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -360,8 +353,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -376,13 +369,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -397,18 +390,19 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm256_reinterleave_4x64( vhashB, vhash, 512 );
mm256_rintrlv_4x32_4x64( vhashB, vhash, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhashB, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -423,8 +417,8 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_2x128( vhashA, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
mm256_intrlv_2x128( vhashA, hash0, hash1, 512 );
mm256_intrlv_2x128( vhashB, hash2, hash3, 512 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
@@ -432,19 +426,19 @@ void sonoa_4way_hash( void *state, const void *input )
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
// 5
mm256_reinterleave_2x128_4x64( vhash, vhashA, vhashB, 512 );
mm256_rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_reinterleave_4x32( vhashB, vhash, 512 );
mm256_rintrlv_4x64_4x32( vhashB, vhash, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -455,7 +449,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -469,7 +463,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -491,8 +485,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -507,13 +501,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -528,14 +522,15 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
@@ -550,13 +545,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 6
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -567,7 +562,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -581,7 +576,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -603,8 +598,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -619,13 +614,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -640,13 +635,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -661,12 +656,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -683,13 +679,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 7
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -700,7 +696,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -714,7 +710,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -736,8 +732,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -752,13 +748,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -773,13 +769,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -794,17 +790,17 @@ void sonoa_4way_hash( void *state, const void *input )
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x32( vhashB, vhash, 512 );
mm256_rintrlv_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
}
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -814,7 +810,6 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -828,19 +823,13 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFFF000, 0xFFFF0000, 0 };
// Need big endian data
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_interleave_blend_32( mm256_bswap_32(
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
*noncev );
sonoa_4way_hash( hash, vdata );
@@ -849,17 +838,10 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE,
"Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
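
Note: the mm256_intrlv_4x64 / mm256_dintrlv_4x64 calls above are the renamed interleave helpers that pack four independent hash lanes into one buffer, so each 256-bit AVX2 vector holds the same word index from all four lanes. A plain-C sketch of the assumed layout (the real routines do this with AVX2 loads and stores; the bit-length argument matches the calls shown above):

#include <stdint.h>

// Plain-C reference of the assumed 4x64 interleave: lane j's i-th 64-bit
// word is stored at dst[ 4*i + j ], so consecutive 256-bit vectors hold
// word 0 of all four lanes, then word 1, and so on.
static void intrlv_4x64_ref( uint64_t *dst, const uint64_t *s0,
        const uint64_t *s1, const uint64_t *s2, const uint64_t *s3,
        int bit_len )
{
   for ( int i = 0; i < bit_len/64; i++ )
   {
      dst[ 4*i     ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}

// The matching deinterleave simply reverses the indexing.
static void dintrlv_4x64_ref( uint64_t *d0, uint64_t *d1, uint64_t *d2,
        uint64_t *d3, const uint64_t *src, int bit_len )
{
   for ( int i = 0; i < bit_len/64; i++ )
   {
      d0[i] = src[ 4*i     ];
      d1[i] = src[ 4*i + 1 ];
      d2[i] = src[ 4*i + 2 ];
      d3[i] = src[ 4*i + 3 ];
   }
}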

View File

@@ -3,7 +3,7 @@
bool register_sonoa_algo( algo_gate_t* gate )
{
#if defined (SONOA_4WAY)
init_sonoa_4way_ctx();
// init_sonoa_4way_ctx();
gate->scanhash = (void*)&scanhash_sonoa_4way;
gate->hash = (void*)&sonoa_4way_hash;
#else

View File

@@ -17,7 +17,7 @@ void sonoa_4way_hash( void *state, const void *input );
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_sonoa_4way_ctx();
//void init_sonoa_4way_ctx();
#endif

View File

@@ -14,7 +14,6 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -25,7 +24,6 @@
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
//typedef struct {
union _x17_4way_context_overlay
{
blake512_4way_context blake;
@@ -48,30 +46,6 @@ union _x17_4way_context_overlay
};
typedef union _x17_4way_context_overlay x17_4way_context_overlay;
/*
x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64)));
void init_x17_4way_ctx()
{
blake512_4way_init( &x17_4way_ctx.blake );
bmw512_4way_init( &x17_4way_ctx.bmw );
init_groestl( &x17_4way_ctx.groestl, 64 );
skein512_4way_init( &x17_4way_ctx.skein );
jh512_4way_init( &x17_4way_ctx.jh );
keccak512_4way_init( &x17_4way_ctx.keccak );
luffa_2way_init( &x17_4way_ctx.luffa, 512 );
cube_2way_init( &x17_4way_ctx.cube, 512, 16, 32 );
shavite512_2way_init( &x17_4way_ctx.shavite );
simd_2way_init( &x17_4way_ctx.simd, 512 );
init_echo( &x17_4way_ctx.echo, 512 );
hamsi512_4way_init( &x17_4way_ctx.hamsi );
sph_fugue512_init( &x17_4way_ctx.fugue );
shabal512_4way_init( &x17_4way_ctx.shabal );
sph_whirlpool_init( &x17_4way_ctx.whirlpool );
sha512_4way_init( &x17_4way_ctx.sha512 );
haval256_5_4way_init( &x17_4way_ctx.haval );
};
*/
void x17_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -82,7 +56,6 @@ void x17_4way_hash( void *state, const void *input )
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
x17_4way_context_overlay ctx;
// memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) );
// 1 Blake parallel 4 way 64 bit
blake512_4way_init( &ctx.blake );
@@ -95,7 +68,7 @@ void x17_4way_hash( void *state, const void *input )
bmw512_4way_close( &ctx.bmw, vhash );
// Serialize
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
init_groestl( &ctx.groestl, 64 );
@@ -108,7 +81,7 @@ void x17_4way_hash( void *state, const void *input )
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallellize
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein parallel 4 way 64 bit
skein512_4way_init( &ctx.skein );
@@ -126,7 +99,7 @@ void x17_4way_hash( void *state, const void *input )
keccak512_4way_close( &ctx.keccak, vhash );
// 7 Luffa parallel 2 way 128 bit
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -151,8 +124,8 @@ void x17_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
// 11 Echo serial
init_echo( &ctx.echo, 512 );
@@ -169,13 +142,13 @@ void x17_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4 way 64 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue serial
sph_fugue512_init( &ctx.fugue );
@@ -192,13 +165,13 @@ void x17_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 4 way 32 bit
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool serial
sph_whirlpool_init( &ctx.whirlpool );
@@ -215,19 +188,18 @@ void x17_4way_hash( void *state, const void *input )
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 16 SHA512 parallel 64 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
mm256_reinterleave_4x32( vhashB, vhash, 512 );
mm256_rintrlv_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
}
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -237,7 +209,6 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -251,38 +222,24 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFFF000, 0xFFFF0000, 0 };
// Need big endian data
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
uint32_t mask = masks[ m ];
do
{
*noncev = mm256_interleave_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
*noncev );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x17_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE,
"Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
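
Note: mm256_bswap_intrlv80_4x64 replaces the explicit bswap-then-interleave sequence shown in the removed lines. The assumed behaviour, sketched in plain C, is to convert the 20 header words to big endian and broadcast the resulting 10 64-bit words into all four lanes of vdata; the per-lane nonces are then patched in each iteration by the *noncev blend inside the search loop.

#include <stdint.h>

static uint32_t bswap_32_ref( uint32_t x )
{
   return ( x >> 24 ) | ( ( x >> 8 ) & 0x0000ff00 )
        | ( ( x << 8 ) & 0x00ff0000 ) | ( x << 24 );
}

// Byte-swap the 20 32-bit header words, then interleave the resulting
// 10 64-bit words into 4 identical lanes (4x64 layout, 640 bits total).
static void bswap_intrlv80_4x64_ref( void *dst, const void *src )
{
   union { uint32_t u32[20]; uint64_t u64[10]; } be;
   const uint32_t *s = (const uint32_t*)src;
   uint64_t *d = (uint64_t*)dst;

   for ( int i = 0; i < 20; i++ ) be.u32[i] = bswap_32_ref( s[i] );
   for ( int i = 0; i < 10; i++ )
      d[ 4*i ] = d[ 4*i + 1 ] = d[ 4*i + 2 ] = d[ 4*i + 3 ] = be.u64[i];
}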

View File

@@ -12,8 +12,9 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -24,16 +25,17 @@
#include "algo/sha/sha2-hash-4way.h"
#include "algo/haval/haval-hash-4way.h"
typedef struct {
blake512_4way_context blake;
union _xevan_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
@@ -42,39 +44,8 @@ typedef struct {
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
} xevan_4way_ctx_holder;
xevan_4way_ctx_holder xevan_4way_ctx __attribute__ ((aligned (64)));
static __thread blake512_4way_context xevan_blake_4way_mid
__attribute__ ((aligned (64)));
void init_xevan_4way_ctx()
{
blake512_4way_init(&xevan_4way_ctx.blake);
bmw512_4way_init( &xevan_4way_ctx.bmw );
init_groestl( &xevan_4way_ctx.groestl, 64 );
skein512_4way_init(&xevan_4way_ctx.skein);
jh512_4way_init(&xevan_4way_ctx.jh);
keccak512_4way_init(&xevan_4way_ctx.keccak);
luffa_2way_init( &xevan_4way_ctx.luffa, 512 );
cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &xevan_4way_ctx.shavite );
simd_2way_init( &xevan_4way_ctx.simd, 512 );
init_echo( &xevan_4way_ctx.echo, 512 );
hamsi512_4way_init( &xevan_4way_ctx.hamsi );
sph_fugue512_init( &xevan_4way_ctx.fugue );
shabal512_4way_init( &xevan_4way_ctx.shabal );
sph_whirlpool_init( &xevan_4way_ctx.whirlpool );
sha512_4way_init( &xevan_4way_ctx.sha512 );
haval256_5_4way_init( &xevan_4way_ctx.haval );
};
void xevan_4way_blake512_midstate( const void* input )
{
memcpy( &xevan_blake_4way_mid, &xevan_4way_ctx.blake,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &xevan_blake_4way_mid, input, 64 );
}
typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;
void xevan_4way_hash( void *output, const void *input )
{
@@ -83,343 +54,320 @@ void xevan_4way_hash( void *output, const void *input )
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
uint64_t vhash32[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashA[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<2] __attribute__ ((aligned (64)));
const int dataLen = 128;
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
xevan_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));
// parallel way
memcpy( &ctx.blake, &xevan_blake_4way_mid,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &ctx.blake, input + (midlen<<2), tail );
// parallel 4 way
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close(&ctx.blake, vhash);
memset( &vhash[8<<2], 0, 64<<2 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
// Parallel
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
// Parallel 4way 32 bit
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
// Serial
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 );
haval256_5_4way( &ctx.haval, vhash32, dataLen );
haval256_5_4way_close( &ctx.haval, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, vhashA );
mm256_rintrlv_4x32_4x64( vhash, vhashA, dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, dataLen );
blake512_4way_close(&ctx.blake, vhash);
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 );
haval256_5_4way( &ctx.haval, vhash32, dataLen );
mm256_rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, output );
}
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
xevan_4way_blake512_midstate( vdata );
mm256_bswap_intrlv80_4x64( vdata, pdata );
do {
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), *noncev );
xevan_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif
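
Note: the rewritten xevan_4way_hash keeps all four lanes vectorized through Luffa, CubeHash, SHAvite and SIMD by re-interleaving straight from the 4x64 layout into two 2x128 buffers, lanes 0/1 in vhashA and lanes 2/3 in vhashB, instead of dropping back to serial per-lane calls. A plain-C sketch of the assumed mm256_rintrlv_4x64_2x128 semantics:

#include <stdint.h>

// Assumed semantics: src is in 4x64 layout (word i of lane j at 4*i + j).
// Lanes 0/1 are rewritten into dstA and lanes 2/3 into dstB, each pair
// interleaved in 128-bit chunks: lane0 chunk0, lane1 chunk0, lane0 chunk1...
static void rintrlv_4x64_2x128_ref( uint64_t *dstA, uint64_t *dstB,
                                    const uint64_t *src, int bit_len )
{
   for ( int i = 0; i < bit_len/64; i += 2 )  // 2 qwords per 128-bit chunk
   {
      dstA[ 2*i     ] = src[ 4*i         ];   // lane 0, words i, i+1
      dstA[ 2*i + 1 ] = src[ 4*(i+1)     ];
      dstA[ 2*i + 2 ] = src[ 4*i     + 1 ];   // lane 1, words i, i+1
      dstA[ 2*i + 3 ] = src[ 4*(i+1) + 1 ];

      dstB[ 2*i     ] = src[ 4*i     + 2 ];   // lane 2, words i, i+1
      dstB[ 2*i + 1 ] = src[ 4*(i+1) + 2 ];
      dstB[ 2*i + 2 ] = src[ 4*i     + 3 ];   // lane 3, words i, i+1
      dstB[ 2*i + 3 ] = src[ 4*(i+1) + 3 ];
   }
}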

View File

@@ -8,7 +8,7 @@ void xevan_set_target( struct work* work, double job_diff )
bool register_xevan_algo( algo_gate_t* gate )
{
#if defined (XEVAN_4WAY)
init_xevan_4way_ctx();
// init_xevan_4way_ctx();
gate->scanhash = (void*)&scanhash_xevan_4way;
gate->hash = (void*)&xevan_4way_hash;
#else

View File

@@ -15,16 +15,16 @@ bool register_xevan_algo( algo_gate_t* gate );
void xevan_4way_hash( void *state, const void *input );
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_xevan_4way_ctx();
//void init_xevan_4way_ctx();
#endif
void xevan_hash( void *state, const void *input );
int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_xevan_ctx();

View File

@@ -230,13 +230,14 @@ void xevan_hash(void *output, const void *input)
memcpy(output, hash, 32);
}
int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
@@ -248,8 +249,7 @@ int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
xevan_blake512_midstate( endiandata );
xevan_blake512_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
xevan_hash(hash, endiandata);
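
Note: the scalar path keeps the blake512 midstate optimisation that the 4-way rewrite above drops: the first 64 header bytes never change while scanning nonces, so they are absorbed once, and only a context copy plus the tail bytes (which contain the nonce) are redone per nonce. A generic sketch of the idea using the sph_blake512 API that the scalar code links against; the function and variable names are illustrative, the include path follows the repo's algo/ layout, and xevan's own zero-extension of the input is omitted.

#include <string.h>
#include "algo/blake/sph_blake.h"

static sph_blake512_context blake_mid;   // state after the constant 64 bytes

static void blake512_midstate_init( const void *header80 )
{
   sph_blake512_init( &blake_mid );
   sph_blake512( &blake_mid, header80, 64 );
}

static void blake512_with_midstate( void *hash64, const void *header80 )
{
   sph_blake512_context ctx;
   memcpy( &ctx, &blake_mid, sizeof ctx );                  // reuse midstate
   sph_blake512( &ctx, (const char*)header80 + 64, 16 );    // tail incl. nonce
   sph_blake512_close( &ctx, hash64 );
}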

View File

@@ -290,7 +290,7 @@ SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx)
/* Initialize an HMAC-SHA256 operation with the given key. */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen)
{
unsigned char pad[64];
unsigned char khash[32];
@@ -326,7 +326,7 @@ HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
/* Add bytes to the HMAC-SHA256 operation. */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
@@ -335,7 +335,7 @@ HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx)
HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx)
{
unsigned char ihash[32];
@@ -361,7 +361,7 @@ void
PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX PShctx, hctx;
HMAC_SHA256_CTX_Y PShctx, hctx;
uint8_t _ALIGN(128) T[32];
uint8_t _ALIGN(128) U[32];
uint8_t ivec[4];
@@ -370,8 +370,8 @@ PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
int k;
/* Compute HMAC state after processing P and S. */
HMAC_SHA256_Init(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update(&PShctx, salt, saltlen);
HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update_Y(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
@@ -379,18 +379,18 @@ PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
HMAC_SHA256_Update(&hctx, ivec, 4);
HMAC_SHA256_Final(U, &hctx);
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y));
HMAC_SHA256_Update_Y(&hctx, ivec, 4);
HMAC_SHA256_Final_Y(U, &hctx);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
HMAC_SHA256_Init(&hctx, passwd, passwdlen);
HMAC_SHA256_Update(&hctx, U, 32);
HMAC_SHA256_Final(U, &hctx);
HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen);
HMAC_SHA256_Update_Y(&hctx, U, 32);
HMAC_SHA256_Final_Y(U, &hctx);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)

View File

@@ -49,14 +49,14 @@ typedef struct HMAC_SHA256Context {
typedef struct HMAC_SHA256Context {
SHA256_CTX ictx;
SHA256_CTX octx;
} HMAC_SHA256_CTX;
} HMAC_SHA256_CTX_Y;
void SHA256_Init_Y(SHA256_CTX_Y *);
void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t);
void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *);
void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
void HMAC_SHA256_Final(unsigned char [32], HMAC_SHA256_CTX *);
void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *);
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
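
Note: the rename gives the yescrypt-local HMAC code a _Y suffix and a _Y context type, matching the SHA256_*_Y functions above and avoiding a symbol clash with the identically named HMAC_SHA256_* functions in the yespower SHA-256 file removed later in this diff. Usage is unchanged apart from the names; a minimal example against these declarations (key and message bytes are arbitrary):

#include "sha256_Y.h"

// HMAC-SHA256 with the renamed yescrypt wrappers.
static void hmac_y_example( unsigned char digest[32] )
{
   static const unsigned char key[] = "key";
   static const unsigned char msg[] = "message";

   HMAC_SHA256_CTX_Y ctx;
   HMAC_SHA256_Init_Y( &ctx, key, sizeof(key) - 1 );
   HMAC_SHA256_Update_Y( &ctx, msg, sizeof(msg) - 1 );
   HMAC_SHA256_Final_Y( digest, &ctx );
}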

View File

@@ -1354,14 +1354,14 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
if ((t || flags) && buflen == sizeof(sha256)) {
/* Compute ClientKey */
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init(&ctx, buf, buflen);
HMAC_SHA256_CTX_Y ctx;
HMAC_SHA256_Init_Y(&ctx, buf, buflen);
if ( yescrypt_client_key )
HMAC_SHA256_Update( &ctx, (uint8_t*)yescrypt_client_key,
HMAC_SHA256_Update_Y( &ctx, (uint8_t*)yescrypt_client_key,
yescrypt_client_key_len );
else
HMAC_SHA256_Update( &ctx, salt, saltlen );
HMAC_SHA256_Final(sha256, &ctx);
HMAC_SHA256_Update_Y( &ctx, salt, saltlen );
HMAC_SHA256_Final_Y(sha256, &ctx);
}
/* Compute StoredKey */
{

View File

@@ -383,7 +383,7 @@ void yescrypthash(void *output, const void *input)
}
int scanhash_yescrypt( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -393,6 +393,7 @@ int scanhash_yescrypt( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
for (int k = 0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);

View File

@@ -1,646 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* Copyright 2016-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "insecure_memzero.h"
#include "sysendian.h"
#include "sha256.h"
#ifdef __ICC
/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
#define restrict
#elif __STDC_VERSION__ >= 199901L
/* Have restrict */
#elif defined(__GNUC__)
#define restrict __restrict
#else
#define restrict
#endif
/*
* Encode a length len*2 vector of (uint32_t) into a length len*8 vector of
* (uint8_t) in big-endian form.
*/
static void
be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
{
/* Encode vector, two words at a time. */
do {
be32enc(&dst[0], src[0]);
be32enc(&dst[4], src[1]);
src += 2;
dst += 8;
} while (--len);
}
/*
* Decode a big-endian length len*8 vector of (uint8_t) into a length
* len*2 vector of (uint32_t).
*/
static void
be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
{
/* Decode vector, two words at a time. */
do {
dst[0] = be32dec(&src[0]);
dst[1] = be32dec(&src[4]);
src += 8;
dst += 2;
} while (--len);
}
/* SHA256 round constants. */
static const uint32_t Krnd[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
h += S1(e) + Ch(e, f, g) + k; \
d += h; \
h += S0(a) + Maj(a, b, c);
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, ii) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i + ii] + Krnd[i + ii])
/* Message schedule computation */
#define MSCH(W, ii, i) \
W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii]
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
static void
SHA256_Transform(uint32_t state[static restrict 8],
const uint8_t block[static restrict 64],
uint32_t W[static restrict 64], uint32_t S[static restrict 8])
{
int i;
/* 1. Prepare the first part of the message schedule W. */
be32dec_vect(W, block, 8);
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
for (i = 0; i < 64; i += 16) {
RNDr(S, W, 0, i);
RNDr(S, W, 1, i);
RNDr(S, W, 2, i);
RNDr(S, W, 3, i);
RNDr(S, W, 4, i);
RNDr(S, W, 5, i);
RNDr(S, W, 6, i);
RNDr(S, W, 7, i);
RNDr(S, W, 8, i);
RNDr(S, W, 9, i);
RNDr(S, W, 10, i);
RNDr(S, W, 11, i);
RNDr(S, W, 12, i);
RNDr(S, W, 13, i);
RNDr(S, W, 14, i);
RNDr(S, W, 15, i);
if (i == 48)
break;
MSCH(W, 0, i);
MSCH(W, 1, i);
MSCH(W, 2, i);
MSCH(W, 3, i);
MSCH(W, 4, i);
MSCH(W, 5, i);
MSCH(W, 6, i);
MSCH(W, 7, i);
MSCH(W, 8, i);
MSCH(W, 9, i);
MSCH(W, 10, i);
MSCH(W, 11, i);
MSCH(W, 12, i);
MSCH(W, 13, i);
MSCH(W, 14, i);
MSCH(W, 15, i);
}
/* 4. Mix local working variables into global state. */
state[0] += S[0];
state[1] += S[1];
state[2] += S[2];
state[3] += S[3];
state[4] += S[4];
state[5] += S[5];
state[6] += S[6];
state[7] += S[7];
}
static const uint8_t PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* Add padding and terminating bit-count. */
static void
SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
{
size_t r;
/* Figure out how many bytes we have buffered. */
r = (ctx->count >> 3) & 0x3f;
/* Pad to 56 mod 64, transforming if we finish a block en route. */
if (r < 56) {
/* Pad to 56 mod 64. */
memcpy(&ctx->buf[r], PAD, 56 - r);
} else {
/* Finish the current block and mix. */
memcpy(&ctx->buf[r], PAD, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
/* The start of the final block is all zeroes. */
memset(&ctx->buf[0], 0, 56);
}
/* Add the terminating bit-count. */
be64enc(&ctx->buf[56], ctx->count);
/* Mix in the final block. */
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
}
/* Magic initialization constants. */
static const uint32_t initial_state[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void
SHA256_Init(SHA256_CTX * ctx)
{
/* Zero bits processed so far. */
ctx->count = 0;
/* Initialize state. */
memcpy(ctx->state, initial_state, sizeof(initial_state));
}
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
static void
_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
uint32_t r;
const uint8_t * src = in;
/* Return immediately if we have nothing to do. */
if (len == 0)
return;
/* Number of bytes left in the buffer from previous updates. */
r = (ctx->count >> 3) & 0x3f;
/* Update number of bits. */
ctx->count += (uint64_t)(len) << 3;
/* Handle the case where we don't need to perform any transforms. */
if (len < 64 - r) {
memcpy(&ctx->buf[r], src, len);
return;
}
/* Finish the current block. */
memcpy(&ctx->buf[r], src, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
src += 64 - r;
len -= 64 - r;
/* Perform complete blocks. */
while (len >= 64) {
SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]);
src += 64;
len -= 64;
}
/* Copy left over data into buffer. */
memcpy(ctx->buf, src, len);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72])
{
/* Add padding. */
SHA256_Pad(ctx, tmp32);
/* Write the hash. */
be32enc_vect(digest, ctx->state, 4);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Final(digest, ctx, tmp32);
/* Clear the context state. */
insecure_memzero(ctx, sizeof(SHA256_CTX));
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void
SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
{
SHA256_CTX ctx;
uint32_t tmp32[72];
SHA256_Init(&ctx);
_SHA256_Update(&ctx, in, len, tmp32);
_SHA256_Final(digest, &ctx, tmp32);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(SHA256_CTX));
insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
static void
_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
uint8_t khash[static restrict 32])
{
const uint8_t * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
SHA256_Init(&ctx->ictx);
_SHA256_Update(&ctx->ictx, K, Klen, tmp32);
_SHA256_Final(khash, &ctx->ictx, tmp32);
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
SHA256_Init(&ctx->ictx);
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
_SHA256_Update(&ctx->ictx, pad, 64, tmp32);
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
SHA256_Init(&ctx->octx);
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
_SHA256_Update(&ctx->octx, pad, 64, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
{
uint32_t tmp32[72];
uint8_t pad[64];
uint8_t khash[32];
/* Call the real function. */
_HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(khash, 32);
insecure_memzero(pad, 64);
}
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
static void
_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
/* Feed data to the inner SHA256 operation. */
_SHA256_Update(&ctx->ictx, in, len, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_HMAC_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
{
/* Finish the inner SHA256 operation. */
_SHA256_Final(ihash, &ctx->ictx, tmp32);
/* Feed the inner hash to the outer SHA256 operation. */
_SHA256_Update(&ctx->octx, ihash, 32, tmp32);
/* Finish the outer SHA256 operation. */
_SHA256_Final(digest, &ctx->octx, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
{
uint32_t tmp32[72];
uint8_t ihash[32];
/* Call the real function. */
_HMAC_SHA256_Final(digest, ctx, tmp32, ihash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(ihash, 32);
}
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
uint8_t digest[32])
{
HMAC_SHA256_CTX ctx;
uint32_t tmp32[72];
uint8_t tmp8[96];
_HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]);
_HMAC_SHA256_Update(&ctx, in, len, tmp32);
_HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(tmp8, 96);
}
/* Add padding and terminating bit-count, but don't invoke Transform yet. */
static int
SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
uint32_t tmp32[static restrict 72])
{
uint32_t r;
r = (ctx->count >> 3) & 0x3f;
if (r >= 56)
return -1;
/*
* Convert length to a vector of bytes -- we do this now rather
* than later because the length will change after we pad.
*/
be64enc(len, ctx->count);
/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
_SHA256_Update(ctx, PAD, 56 - r, tmp32);
/* Add the terminating bit-count. */
ctx->buf[63] = len[7];
_SHA256_Update(ctx, len, 7, tmp32);
return 0;
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX Phctx, PShctx, hctx;
uint32_t tmp32[72];
union {
uint8_t tmp8[96];
uint32_t state[8];
} u;
size_t i;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Sanity-check. */
assert(dkLen <= 32 * (size_t)(UINT32_MAX));
if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
uint32_t oldcount;
uint8_t * ivecp;
/* Compute HMAC state after processing P and S. */
_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
/* Prepare ictx padding. */
oldcount = hctx.ictx.count & (0x3f << 3);
_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
goto generic; /* Can't happen due to saltlen check */
ivecp = hctx.ictx.buf + (oldcount >> 3);
/* Prepare octx padding. */
hctx.octx.count += 32 << 3;
SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivecp, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(u.state, hctx.ictx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.ictx.buf,
&tmp32[0], &tmp32[64]);
be32enc_vect(hctx.octx.buf, u.state, 4);
memcpy(u.state, hctx.octx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.octx.buf,
&tmp32[0], &tmp32[64]);
be32enc_vect(&buf[i * 32], u.state, 4);
}
goto cleanup;
}
generic:
/* Compute HMAC state after processing P. */
_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
/* Compute HMAC state after processing P and S. */
memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
if (c > 1) {
/* T_i = U_1 ... */
memcpy(U, T, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean the stack. */
insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(U, 32);
insecure_memzero(T, 32);
cleanup:
insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(&u, sizeof(u));
}
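/*
 * Illustrative sketch (not part of the original file): deriving a 64-byte
 * key with the routine above.  The password, salt and iteration count are
 * hypothetical placeholders.
 */
static void
example_pbkdf2_sha256(uint8_t dk[64])
{
    static const uint8_t passwd[] = "hypothetical password";
    static const uint8_t salt[] = "hypothetical salt";

    /* dk receives PBKDF2-HMAC-SHA256(passwd, salt, c = 4096, dkLen = 64). */
    PBKDF2_SHA256(passwd, sizeof(passwd) - 1, salt, sizeof(salt) - 1,
        4096, dk, 64);
}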

View File

@@ -1,680 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* Copyright 2016-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "insecure_memzero.h"
#include "sysendian.h"
#include "sha256.h"
#include "avxdefs.h"
#ifdef __ICC
/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
#define restrict
#elif __STDC_VERSION__ >= 199901L
/* Have restrict */
#elif defined(__GNUC__)
#define restrict __restrict
#else
#define restrict
#endif
/*
* Encode a length len*2 vector of (uint32_t) into a length len*8 vector of
* (uint8_t) in big-endian form.
*/
static void
be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
{
/* Encode vector, two words at a time. */
do {
be32enc(&dst[0], src[0]);
be32enc(&dst[4], src[1]);
src += 2;
dst += 8;
} while (--len);
}
/*
* Decode a big-endian length len*8 vector of (uint8_t) into a length
* len*2 vector of (uint32_t).
*/
static void
be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
{
/* Decode vector, two words at a time. */
do {
dst[0] = be32dec(&src[0]);
dst[1] = be32dec(&src[4]);
src += 8;
dst += 2;
} while (--len);
}
/* SHA256 round constants. */
static const uint32_t Krnd[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
#if 0 //defined(__SHA__)
// ABEF = _mm_sha256rnds2_epu32( CDGH, ABEF, k )
//_mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k)
// b = { ABEF } a = { CDGH }
//
//a = _mm_set_epi32( S[(66 - i) % 8], S[(67 - i) % 8],
// S[(70 - i) % 8], S[(71 - i) % 8] );
//b = _mm_set_epi32( S[(64 - i) % 8], S[(65 - i) % 8],
// S[(68 - i) % 8], S[(69 - i) % 8] );
//k = _mm_set1_epi32( W[i + ii] + Krnd[i + ii] )
// _mm_sha256rnds2_epu32(a,b,k)
#define RNDr( S, W, i, ii ) do \
{ \
uint32_t abef[4]; \
__m128i ABEF = _mm_set_epi32( S[(66 - i) % 8], S[(67 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8] ); \
__m128i CDGH = _mm_set_epi32( S[(64 - i) % 8], S[(65 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8] ); \
__m128i K = _mm_set1_epi32( W[i + ii] + Krnd[i + ii] ); \
casti_m128i( abef, 0 ) = _mm_sha256rnds2_epu32( CDGH, ABEF, K ); \
S[(66 - i) % 8] = abef[3]; \
S[(67 - i) % 8] = abef[2]; \
S[(64 - i) % 8] = abef[1]; \
S[(65 - i) % 8] = abef[0]; \
} while(0)
#else
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
h += S1(e) + Ch(e, f, g) + k; \
d += h; \
h += S0(a) + Maj(a, b, c);
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, ii) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i + ii] + Krnd[i + ii])
#endif
/* Message schedule computation */
#define MSCH(W, ii, i) \
W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii]
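/*
 * Equivalent plain-loop form (a sketch, not part of the original file):
 * MSCH extends the 16 input words to the full 64-entry schedule in place,
 * 16 words ahead of the rounds currently being mixed.  A straight loop
 * would express the same recurrence as
 *
 *     for (t = 16; t < 64; t++)
 *         W[t] = s1(W[t - 2]) + W[t - 7] + s0(W[t - 15]) + W[t - 16];
 *
 * which is how the non-interleaved SHA256_Transform_p() later in this diff
 * computes it.
 */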
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
static void
SHA256_Transform(uint32_t state[static restrict 8],
const uint8_t block[static restrict 64],
uint32_t W[static restrict 64], uint32_t S[static restrict 8])
{
int i;
/* 1. Prepare the first part of the message schedule W. */
be32dec_vect(W, block, 8);
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
for (i = 0; i < 64; i += 16) {
RNDr(S, W, 0, i);
RNDr(S, W, 1, i);
RNDr(S, W, 2, i);
RNDr(S, W, 3, i);
RNDr(S, W, 4, i);
RNDr(S, W, 5, i);
RNDr(S, W, 6, i);
RNDr(S, W, 7, i);
RNDr(S, W, 8, i);
RNDr(S, W, 9, i);
RNDr(S, W, 10, i);
RNDr(S, W, 11, i);
RNDr(S, W, 12, i);
RNDr(S, W, 13, i);
RNDr(S, W, 14, i);
RNDr(S, W, 15, i);
if (i == 48)
break;
MSCH(W, 0, i);
MSCH(W, 1, i);
MSCH(W, 2, i);
MSCH(W, 3, i);
MSCH(W, 4, i);
MSCH(W, 5, i);
MSCH(W, 6, i);
MSCH(W, 7, i);
MSCH(W, 8, i);
MSCH(W, 9, i);
MSCH(W, 10, i);
MSCH(W, 11, i);
MSCH(W, 12, i);
MSCH(W, 13, i);
MSCH(W, 14, i);
MSCH(W, 15, i);
}
/* 4. Mix local working variables into global state. */
state[0] += S[0];
state[1] += S[1];
state[2] += S[2];
state[3] += S[3];
state[4] += S[4];
state[5] += S[5];
state[6] += S[6];
state[7] += S[7];
}
static const uint8_t PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* Add padding and terminating bit-count. */
static void
SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
{
size_t r;
/* Figure out how many bytes we have buffered. */
r = (ctx->count >> 3) & 0x3f;
/* Pad to 56 mod 64, transforming if we finish a block en route. */
if (r < 56) {
/* Pad to 56 mod 64. */
memcpy(&ctx->buf[r], PAD, 56 - r);
} else {
/* Finish the current block and mix. */
memcpy(&ctx->buf[r], PAD, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
/* The start of the final block is all zeroes. */
memset(&ctx->buf[0], 0, 56);
}
/* Add the terminating bit-count. */
be64enc(&ctx->buf[56], ctx->count);
/* Mix in the final block. */
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
}
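/*
 * Illustrative sketch (not part of the original file): the number of PAD
 * bytes SHA256_Pad() appends ahead of the 8-byte bit count, for a message
 * of msglen bytes.
 */
static size_t
sha256_pad_len(uint64_t msglen)
{
    size_t r = (size_t)(msglen & 0x3f);  /* bytes buffered in the last block */

    /* Pad out to 56 mod 64: stays in the current block if r < 56,
     * otherwise spills into one extra block. */
    return (r < 56) ? (56 - r) : (120 - r);
}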
/* Magic initialization constants. */
static const uint32_t initial_state[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void
SHA256_Init(SHA256_CTX * ctx)
{
/* Zero bits processed so far. */
ctx->count = 0;
/* Initialize state. */
memcpy(ctx->state, initial_state, sizeof(initial_state));
}
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
static void
_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
uint32_t r;
const uint8_t * src = in;
/* Return immediately if we have nothing to do. */
if (len == 0)
return;
/* Number of bytes left in the buffer from previous updates. */
r = (ctx->count >> 3) & 0x3f;
/* Update number of bits. */
ctx->count += (uint64_t)(len) << 3;
/* Handle the case where we don't need to perform any transforms. */
if (len < 64 - r) {
memcpy(&ctx->buf[r], src, len);
return;
}
/* Finish the current block. */
memcpy(&ctx->buf[r], src, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
src += 64 - r;
len -= 64 - r;
/* Perform complete blocks. */
while (len >= 64) {
SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]);
src += 64;
len -= 64;
}
/* Copy left over data into buffer. */
memcpy(ctx->buf, src, len);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72])
{
/* Add padding. */
SHA256_Pad(ctx, tmp32);
/* Write the hash. */
be32enc_vect(digest, ctx->state, 4);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Final(digest, ctx, tmp32);
/* Clear the context state. */
insecure_memzero(ctx, sizeof(SHA256_CTX));
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void
SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
{
SHA256_CTX ctx;
uint32_t tmp32[72];
SHA256_Init(&ctx);
_SHA256_Update(&ctx, in, len, tmp32);
_SHA256_Final(digest, &ctx, tmp32);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(SHA256_CTX));
insecure_memzero(tmp32, 288);
}
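/*
 * Illustrative sketch (not part of the original file): the one-shot helper
 * above replaces the usual Init/Update/Final sequence for a single buffer.
 */
static void
example_sha256_buf(const void *data, size_t datalen, uint8_t digest[32])
{
    /* digest receives SHA256(data). */
    SHA256_Buf(data, datalen, digest);
}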
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
static void
_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
uint8_t khash[static restrict 32])
{
const uint8_t * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
SHA256_Init(&ctx->ictx);
_SHA256_Update(&ctx->ictx, K, Klen, tmp32);
_SHA256_Final(khash, &ctx->ictx, tmp32);
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
SHA256_Init(&ctx->ictx);
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
_SHA256_Update(&ctx->ictx, pad, 64, tmp32);
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
SHA256_Init(&ctx->octx);
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
_SHA256_Update(&ctx->octx, pad, 64, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
{
uint32_t tmp32[72];
uint8_t pad[64];
uint8_t khash[32];
/* Call the real function. */
_HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(khash, 32);
insecure_memzero(pad, 64);
}
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
static void
_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
/* Feed data to the inner SHA256 operation. */
_SHA256_Update(&ctx->ictx, in, len, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_HMAC_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
{
/* Finish the inner SHA256 operation. */
_SHA256_Final(ihash, &ctx->ictx, tmp32);
/* Feed the inner hash to the outer SHA256 operation. */
_SHA256_Update(&ctx->octx, ihash, 32, tmp32);
/* Finish the outer SHA256 operation. */
_SHA256_Final(digest, &ctx->octx, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
{
uint32_t tmp32[72];
uint8_t ihash[32];
/* Call the real function. */
_HMAC_SHA256_Final(digest, ctx, tmp32, ihash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(ihash, 32);
}
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
uint8_t digest[32])
{
HMAC_SHA256_CTX ctx;
uint32_t tmp32[72];
uint8_t tmp8[96];
_HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]);
_HMAC_SHA256_Update(&ctx, in, len, tmp32);
_HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(tmp8, 96);
}
/* Add padding and terminating bit-count, but don't invoke Transform yet. */
static int
SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
uint32_t tmp32[static restrict 72])
{
uint32_t r;
r = (ctx->count >> 3) & 0x3f;
if (r >= 56)
return -1;
/*
* Convert length to a vector of bytes -- we do this now rather
* than later because the length will change after we pad.
*/
be64enc(len, ctx->count);
/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
_SHA256_Update(ctx, PAD, 56 - r, tmp32);
/* Add the terminating bit-count. */
ctx->buf[63] = len[7];
_SHA256_Update(ctx, len, 7, tmp32);
return 0;
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX Phctx, PShctx, hctx;
uint32_t tmp32[72];
union {
uint8_t tmp8[96];
uint32_t state[8];
} u;
size_t i;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Sanity-check. */
assert(dkLen <= 32 * (size_t)(UINT32_MAX));
if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
uint32_t oldcount;
uint8_t * ivecp;
/* Compute HMAC state after processing P and S. */
_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
/* Prepare ictx padding. */
oldcount = hctx.ictx.count & (0x3f << 3);
_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
goto generic; /* Can't happen due to saltlen check */
ivecp = hctx.ictx.buf + (oldcount >> 3);
/* Prepare octx padding. */
hctx.octx.count += 32 << 3;
SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivecp, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(u.state, hctx.ictx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.ictx.buf,
&tmp32[0], &tmp32[64]);
be32enc_vect(hctx.octx.buf, u.state, 4);
memcpy(u.state, hctx.octx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.octx.buf,
&tmp32[0], &tmp32[64]);
be32enc_vect(&buf[i * 32], u.state, 4);
}
goto cleanup;
}
generic:
/* Compute HMAC state after processing P. */
_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
/* Compute HMAC state after processing P and S. */
memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
if (c > 1) {
/* T_i = U_1 ... */
memcpy(U, T, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean the stack. */
insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(U, 32);
insecure_memzero(T, 32);
cleanup:
insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(&u, sizeof(u));
}
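/*
 * Illustrative sketch (not part of the original file): for c == 1 each
 * 32-byte output block of PBKDF2 reduces to a single PRF call,
 * T_i = U_1 = HMAC-SHA256(passwd, salt || INT(i + 1)), which is what the
 * fast path above computes with two raw SHA256_Transform() calls per
 * block.  The helper below assumes saltlen <= 64 purely for illustration.
 */
static void
pbkdf2_sha256_c1_block(const uint8_t *passwd, size_t passwdlen,
    const uint8_t *salt, size_t saltlen, uint32_t i, uint8_t block[32])
{
    uint8_t msg[64 + 4];

    memcpy(msg, salt, saltlen);
    be32enc(&msg[saltlen], i + 1);  /* INT(i + 1), big-endian */
    HMAC_SHA256_Buf(passwd, passwdlen, msg, saltlen + 4, block);
}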

View File

@@ -1,672 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* Copyright 2016-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "insecure_memzero.h"
#include "sysendian.h"
#include "sha256.h"
#ifdef __ICC
/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
#define restrict
#elif __STDC_VERSION__ >= 199901L
/* Have restrict */
#elif defined(__GNUC__)
#define restrict __restrict
#else
#define restrict
#endif
/*
* Encode a length len*2 vector of (uint32_t) into a length len*8 vector of
* (uint8_t) in big-endian form.
*/
static void
be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
{
/* Encode vector, two words at a time. */
do {
be32enc(&dst[0], src[0]);
be32enc(&dst[4], src[1]);
src += 2;
dst += 8;
} while (--len);
}
/*
* Decode a big-endian length len*8 vector of (uint8_t) into a length
* len*2 vector of (uint32_t).
*/
static void
be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
{
/* Decode vector, two words at a time. */
do {
dst[0] = be32dec(&src[0]);
dst[1] = be32dec(&src[4]);
src += 8;
dst += 2;
} while (--len);
}
#if 0
/* SHA256 round constants. */
static const uint32_t Krnd[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
h += S1(e) + Ch(e, f, g) + k; \
d += h; \
h += S0(a) + Maj(a, b, c);
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, ii) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i + ii] + Krnd[i + ii])
/* Message schedule computation */
#define MSCH(W, ii, i) \
W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii]
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
static void
SHA256_Transform(uint32_t state[static restrict 8],
const uint8_t block[static restrict 64],
uint32_t W[static restrict 64], uint32_t S[static restrict 8])
{
int i;
/* 1. Prepare the first part of the message schedule W. */
be32dec_vect(W, block, 8);
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
for (i = 0; i < 64; i += 16) {
RNDr(S, W, 0, i);
RNDr(S, W, 1, i);
RNDr(S, W, 2, i);
RNDr(S, W, 3, i);
RNDr(S, W, 4, i);
RNDr(S, W, 5, i);
RNDr(S, W, 6, i);
RNDr(S, W, 7, i);
RNDr(S, W, 8, i);
RNDr(S, W, 9, i);
RNDr(S, W, 10, i);
RNDr(S, W, 11, i);
RNDr(S, W, 12, i);
RNDr(S, W, 13, i);
RNDr(S, W, 14, i);
RNDr(S, W, 15, i);
if (i == 48)
break;
MSCH(W, 0, i);
MSCH(W, 1, i);
MSCH(W, 2, i);
MSCH(W, 3, i);
MSCH(W, 4, i);
MSCH(W, 5, i);
MSCH(W, 6, i);
MSCH(W, 7, i);
MSCH(W, 8, i);
MSCH(W, 9, i);
MSCH(W, 10, i);
MSCH(W, 11, i);
MSCH(W, 12, i);
MSCH(W, 13, i);
MSCH(W, 14, i);
MSCH(W, 15, i);
}
/* 4. Mix local working variables into global state. */
state[0] += S[0];
state[1] += S[1];
state[2] += S[2];
state[3] += S[3];
state[4] += S[4];
state[5] += S[5];
state[6] += S[6];
state[7] += S[7];
}
#endif
static const uint8_t PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* Add padding and terminating bit-count. */
static void
SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
{
size_t r;
/* Figure out how many bytes we have buffered. */
r = (ctx->count >> 3) & 0x3f;
/* Pad to 56 mod 64, transforming if we finish a block en route. */
if (r < 56) {
/* Pad to 56 mod 64. */
memcpy(&ctx->buf[r], PAD, 56 - r);
} else {
/* Finish the current block and mix. */
memcpy(&ctx->buf[r], PAD, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
/* The start of the final block is all zeroes. */
memset(&ctx->buf[0], 0, 56);
}
/* Add the terminating bit-count. */
be64enc(&ctx->buf[56], ctx->count);
/* Mix in the final block. */
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
}
#if 0
/* Magic initialization constants. */
static const uint32_t initial_state[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void
SHA256_Init(SHA256_CTX * ctx)
{
/* Zero bits processed so far. */
ctx->count = 0;
/* Initialize state. */
memcpy(ctx->state, initial_state, sizeof(initial_state));
}
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
static void
_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
uint32_t r;
const uint8_t * src = in;
/* Return immediately if we have nothing to do. */
if (len == 0)
return;
/* Number of bytes left in the buffer from previous updates. */
r = (ctx->count >> 3) & 0x3f;
/* Update number of bits. */
ctx->count += (uint64_t)(len) << 3;
/* Handle the case where we don't need to perform any transforms. */
if (len < 64 - r) {
memcpy(&ctx->buf[r], src, len);
return;
}
/* Finish the current block. */
memcpy(&ctx->buf[r], src, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
src += 64 - r;
len -= 64 - r;
/* Perform complete blocks. */
while (len >= 64) {
SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]);
src += 64;
len -= 64;
}
/* Copy left over data into buffer. */
memcpy(ctx->buf, src, len);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72])
{
/* Add padding. */
SHA256_Pad(ctx, tmp32);
/* Write the hash. */
be32enc_vect(digest, ctx->state, 4);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Final(digest, ctx, tmp32);
/* Clear the context state. */
insecure_memzero(ctx, sizeof(SHA256_CTX));
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
#endif
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void
SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
{
SHA256_CTX ctx;
uint32_t tmp32[72];
SHA256_Init(&ctx);
SHA256_Update(&ctx, in, len);
SHA256_Final(digest, &ctx);
// _SHA256_Update(&ctx, in, len, tmp32);
// _SHA256_Final(digest, &ctx, tmp32);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(SHA256_CTX));
insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
static void
_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
uint8_t khash[static restrict 32])
{
const uint8_t * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
SHA256_Init(&ctx->ictx);
SHA256_Update(&ctx->ictx, K, Klen);
SHA256_Final(khash, &ctx->ictx);
// _SHA256_Update(&ctx->ictx, K, Klen, tmp32);
// _SHA256_Final(khash, &ctx->ictx, tmp32);
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
SHA256_Init(&ctx->ictx);
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
SHA256_Update(&ctx->ictx, pad, 64);
// _SHA256_Update(&ctx->ictx, pad, 64, tmp32);
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
SHA256_Init(&ctx->octx);
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
SHA256_Update(&ctx->octx, pad, 64);
// _SHA256_Update(&ctx->octx, pad, 64, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
{
uint32_t tmp32[72];
uint8_t pad[64];
uint8_t khash[32];
/* Call the real function. */
_HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(khash, 32);
insecure_memzero(pad, 64);
}
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
static void
_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
/* Feed data to the inner SHA256 operation. */
SHA256_Update(&ctx->ictx, in, len);
// _SHA256_Update(&ctx->ictx, in, len, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_HMAC_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
{
/* Finish the inner SHA256 operation. */
SHA256_Final(ihash, &ctx->ictx);
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update(&ctx->octx, ihash, 32);
/* Finish the outer SHA256 operation. */
SHA256_Final(digest, &ctx->octx);
// _SHA256_Final(ihash, &ctx->ictx, tmp32);
// _SHA256_Update(&ctx->octx, ihash, 32, tmp32);
// _SHA256_Final(digest, &ctx->octx, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
{
uint32_t tmp32[72];
uint8_t ihash[32];
/* Call the real function. */
_HMAC_SHA256_Final(digest, ctx, tmp32, ihash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(ihash, 32);
}
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
uint8_t digest[32])
{
HMAC_SHA256_CTX ctx;
uint32_t tmp32[72];
uint8_t tmp8[96];
_HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]);
_HMAC_SHA256_Update(&ctx, in, len, tmp32);
_HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(tmp8, 96);
}
/* Add padding and terminating bit-count, but don't invoke Transform yet. */
static int
SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
uint32_t tmp32[static restrict 72])
{
uint32_t r;
r = (ctx->count >> 3) & 0x3f;
if (r >= 56)
return -1;
/*
* Convert length to a vector of bytes -- we do this now rather
* than later because the length will change after we pad.
*/
be64enc(len, ctx->count);
/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
SHA256_Update(ctx, PAD, 56 - r);
/* Add the terminating bit-count. */
ctx->buf[63] = len[7];
SHA256_Update(ctx, len, 7);
/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
// _SHA256_Update(ctx, PAD, 56 - r, tmp32);
/* Add the terminating bit-count. */
// ctx->buf[63] = len[7];
// _SHA256_Update(ctx, len, 7, tmp32);
return 0;
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX Phctx, PShctx, hctx;
uint32_t tmp32[72];
union {
uint8_t tmp8[96];
uint32_t state[8];
} u;
size_t i;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Sanity-check. */
assert(dkLen <= 32 * (size_t)(UINT32_MAX));
if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
uint32_t oldcount;
uint8_t * ivecp;
/* Compute HMAC state after processing P and S. */
_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
/* Prepare ictx padding. */
oldcount = hctx.ictx.count & (0x3f << 3);
_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
goto generic; /* Can't happen due to saltlen check */
ivecp = hctx.ictx.buf + (oldcount >> 3);
/* Prepare octx padding. */
hctx.octx.count += 32 << 3;
SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivecp, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(u.state, hctx.ictx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.ictx.buf );
be32enc_vect(hctx.octx.buf, u.state, 4);
memcpy(u.state, hctx.octx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.octx.buf );
// SHA256_Transform(u.state, hctx.ictx.buf,
// &tmp32[0], &tmp32[64]);
// be32enc_vect(hctx.octx.buf, u.state, 4);
// memcpy(u.state, hctx.octx.state, sizeof(u.state));
// SHA256_Transform(u.state, hctx.octx.buf,
// &tmp32[0], &tmp32[64]);
be32enc_vect(&buf[i * 32], u.state, 4);
}
goto cleanup;
}
generic:
/* Compute HMAC state after processing P. */
_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
/* Compute HMAC state after processing P and S. */
memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
if (c > 1) {
/* T_i = U_1 ... */
memcpy(U, T, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean the stack. */
insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(U, 32);
insecure_memzero(T, 32);
cleanup:
insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(&u, sizeof(u));
}

View File

@@ -1,129 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _SHA256_H_
#define _SHA256_H_
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Use #defines in order to avoid namespace collisions with anyone else's
* SHA256 code (e.g., the code in OpenSSL).
*/
#define SHA256_Init libcperciva_SHA256_Init
#define SHA256_Update libcperciva_SHA256_Update
#define SHA256_Final libcperciva_SHA256_Final
#define SHA256_Buf libcperciva_SHA256_Buf
#define SHA256_CTX libcperciva_SHA256_CTX
#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init
#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update
#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final
#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf
#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX
/* Context structure for SHA256 operations. */
typedef struct {
uint32_t state[8];
uint64_t count;
uint8_t buf[64];
} SHA256_CTX;
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void SHA256_Init(SHA256_CTX *);
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
void SHA256_Update(SHA256_CTX *, const void *, size_t);
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void SHA256_Final(uint8_t[32], SHA256_CTX *);
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void SHA256_Buf(const void *, size_t, uint8_t[32]);
/* Context structure for HMAC-SHA256 operations. */
typedef struct {
SHA256_CTX ictx;
SHA256_CTX octx;
} HMAC_SHA256_CTX;
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *);
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]);
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
#ifdef __cplusplus
}
#endif
#endif /* !_SHA256_H_ */
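/*
 * Illustrative sketch (not part of the original header): incremental
 * hashing through the context API declared above, in a caller that
 * includes this header; the result equals a single SHA256_Buf() call
 * over the concatenated input.
 */
static void
hash_two_parts(const void *a, size_t alen, const void *b, size_t blen,
    uint8_t digest[32])
{
    SHA256_CTX ctx;

    SHA256_Init(&ctx);
    SHA256_Update(&ctx, a, alen);
    SHA256_Update(&ctx, b, blen);
    SHA256_Final(digest, &ctx);
}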

View File

@@ -1,134 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _SHA256_H_
#define _SHA256_H_
#include <stddef.h>
#include <stdint.h>
#include <openssl/sha.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Use #defines in order to avoid namespace collisions with anyone else's
* SHA256 code (e.g., the code in OpenSSL).
*/
/*
#define SHA256_Init libcperciva_SHA256_Init
#define SHA256_Update libcperciva_SHA256_Update
#define SHA256_Final libcperciva_SHA256_Final
#define SHA256_CTX libcperciva_SHA256_CTX
*/
#define SHA256_Buf libcperciva_SHA256_Buf
#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init
#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update
#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final
#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf
#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX
#if 0
/* Context structure for SHA256 operations. */
typedef struct {
uint32_t state[8];
uint64_t count;
uint8_t buf[64];
} SHA256_CTX;
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void SHA256_Init(SHA256_CTX *);
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
void SHA256_Update(SHA256_CTX *, const void *, size_t);
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void SHA256_Final(uint8_t[32], SHA256_CTX *);
#endif
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void SHA256_Buf(const void *, size_t, uint8_t[32]);
/* Context structure for HMAC-SHA256 operations. */
typedef struct {
SHA256_CTX ictx;
SHA256_CTX octx;
} HMAC_SHA256_CTX;
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *);
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]);
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
#ifdef __cplusplus
}
#endif
#endif /* !_SHA256_H_ */

218
algo/yespower/sha256_p.c Normal file
View File

@@ -0,0 +1,218 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "sysendian.h"
#include "sha256_p.h"
#include "compat.h"
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1;
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, k) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + k)
/*
static unsigned char PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
*/
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
{
SHA256_CTX ctx;
SHA256_Init( &ctx );
SHA256_Update( &ctx, in, len );
SHA256_Final( digest, &ctx );
}
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
uint8_t digest[32])
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init( &ctx, K, Klen );
HMAC_SHA256_Update( &ctx, in, len );
HMAC_SHA256_Final( digest, &ctx );
}
/* Initialize an HMAC-SHA256 operation with the given key. */
void
HMAC_SHA256_Init( HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen )
{
unsigned char pad[64];
unsigned char khash[32];
const unsigned char * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
SHA256_Init( &ctx->ictx );
SHA256_Update( &ctx->ictx, K, Klen );
SHA256_Final( khash, &ctx->ictx );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
SHA256_Init( &ctx->ictx );
memset( pad, 0x36, 64 );
for ( i = 0; i < Klen; i++ )
pad[i] ^= K[i];
SHA256_Update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
SHA256_Init( &ctx->octx );
memset(pad, 0x5c, 64);
for ( i = 0; i < Klen; i++ )
pad[i] ^= K[i];
SHA256_Update( &ctx->octx, pad, 64 );
/* Clean the stack. */
//memset(khash, 0, 32);
}
/* Add bytes to the HMAC-SHA256 operation. */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
SHA256_Update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx )
{
unsigned char ihash[32];
/* Finish the inner SHA256 operation. */
SHA256_Final( ihash, &ctx->ictx );
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
SHA256_Final( digest, &ctx->octx );
/* Clean the stack. */
//memset(ihash, 0, 32);
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX PShctx, hctx;
uint8_t _ALIGN(128) T[32];
uint8_t _ALIGN(128) U[32];
uint8_t ivec[4];
size_t i, clen;
uint64_t j;
int k;
/* Compute HMAC state after processing P and S. */
HMAC_SHA256_Init(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
HMAC_SHA256_Update(&hctx, ivec, 4);
HMAC_SHA256_Final(U, &hctx);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
HMAC_SHA256_Init(&hctx, passwd, passwdlen);
HMAC_SHA256_Update(&hctx, U, 32);
HMAC_SHA256_Final(U, &hctx);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
//memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y));
}

View File

@@ -1,496 +0,0 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "sysendian.h"
#include "sha256_p.h"
#include "compat.h"
/*
* Encode a length len/4 vector of (uint32_t) into a length len vector of
* (unsigned char) in big-endian form. Assumes len is a multiple of 4.
*/
static void
be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
{
size_t i;
for (i = 0; i < len / 4; i++)
be32enc(dst + i * 4, src[i]);
}
/*
* Decode a big-endian length len vector of (unsigned char) into a length
* len/4 vector of (uint32_t). Assumes len is a multiple of 4.
*/
static void
be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
{
size_t i;
for (i = 0; i < len / 4; i++)
dst[i] = be32dec(src + i * 4);
}
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1;
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, k) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + k)
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
static void
SHA256_Transform_p(uint32_t * state, const unsigned char block[64])
{
uint32_t _ALIGN(128) W[64], S[8];
uint32_t t0, t1;
int i;
/* 1. Prepare message schedule W. */
be32dec_vect(W, block, 64);
for (i = 16; i < 64; i++)
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
RNDr(S, W, 0, 0x428a2f98);
RNDr(S, W, 1, 0x71374491);
RNDr(S, W, 2, 0xb5c0fbcf);
RNDr(S, W, 3, 0xe9b5dba5);
RNDr(S, W, 4, 0x3956c25b);
RNDr(S, W, 5, 0x59f111f1);
RNDr(S, W, 6, 0x923f82a4);
RNDr(S, W, 7, 0xab1c5ed5);
RNDr(S, W, 8, 0xd807aa98);
RNDr(S, W, 9, 0x12835b01);
RNDr(S, W, 10, 0x243185be);
RNDr(S, W, 11, 0x550c7dc3);
RNDr(S, W, 12, 0x72be5d74);
RNDr(S, W, 13, 0x80deb1fe);
RNDr(S, W, 14, 0x9bdc06a7);
RNDr(S, W, 15, 0xc19bf174);
RNDr(S, W, 16, 0xe49b69c1);
RNDr(S, W, 17, 0xefbe4786);
RNDr(S, W, 18, 0x0fc19dc6);
RNDr(S, W, 19, 0x240ca1cc);
RNDr(S, W, 20, 0x2de92c6f);
RNDr(S, W, 21, 0x4a7484aa);
RNDr(S, W, 22, 0x5cb0a9dc);
RNDr(S, W, 23, 0x76f988da);
RNDr(S, W, 24, 0x983e5152);
RNDr(S, W, 25, 0xa831c66d);
RNDr(S, W, 26, 0xb00327c8);
RNDr(S, W, 27, 0xbf597fc7);
RNDr(S, W, 28, 0xc6e00bf3);
RNDr(S, W, 29, 0xd5a79147);
RNDr(S, W, 30, 0x06ca6351);
RNDr(S, W, 31, 0x14292967);
RNDr(S, W, 32, 0x27b70a85);
RNDr(S, W, 33, 0x2e1b2138);
RNDr(S, W, 34, 0x4d2c6dfc);
RNDr(S, W, 35, 0x53380d13);
RNDr(S, W, 36, 0x650a7354);
RNDr(S, W, 37, 0x766a0abb);
RNDr(S, W, 38, 0x81c2c92e);
RNDr(S, W, 39, 0x92722c85);
RNDr(S, W, 40, 0xa2bfe8a1);
RNDr(S, W, 41, 0xa81a664b);
RNDr(S, W, 42, 0xc24b8b70);
RNDr(S, W, 43, 0xc76c51a3);
RNDr(S, W, 44, 0xd192e819);
RNDr(S, W, 45, 0xd6990624);
RNDr(S, W, 46, 0xf40e3585);
RNDr(S, W, 47, 0x106aa070);
RNDr(S, W, 48, 0x19a4c116);
RNDr(S, W, 49, 0x1e376c08);
RNDr(S, W, 50, 0x2748774c);
RNDr(S, W, 51, 0x34b0bcb5);
RNDr(S, W, 52, 0x391c0cb3);
RNDr(S, W, 53, 0x4ed8aa4a);
RNDr(S, W, 54, 0x5b9cca4f);
RNDr(S, W, 55, 0x682e6ff3);
RNDr(S, W, 56, 0x748f82ee);
RNDr(S, W, 57, 0x78a5636f);
RNDr(S, W, 58, 0x84c87814);
RNDr(S, W, 59, 0x8cc70208);
RNDr(S, W, 60, 0x90befffa);
RNDr(S, W, 61, 0xa4506ceb);
RNDr(S, W, 62, 0xbef9a3f7);
RNDr(S, W, 63, 0xc67178f2);
/* 4. Mix local working variables into global state */
for (i = 0; i < 8; i++)
state[i] += S[i];
#if 0
/* Clean the stack. */
memset(W, 0, 256);
memset(S, 0, 32);
t0 = t1 = 0;
#endif
}
static unsigned char PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
// only called by SHA256_Final_p
/* Add padding and terminating bit-count. */
static void
SHA256_Pad_p(SHA256_CTX_p * ctx)
{
unsigned char len[8];
uint32_t r, plen;
/*
* Convert length to a vector of bytes -- we do this now rather
* than later because the length will change after we pad.
*/
be32enc_vect(len, ctx->count, 8);
/* Add 1--64 bytes so that the resulting length is 56 mod 64 */
r = (ctx->count[1] >> 3) & 0x3f;
plen = (r < 56) ? (56 - r) : (120 - r);
SHA256_Update_p(ctx, PAD, (size_t)plen);
/* Add the terminating bit-count */
SHA256_Update_p(ctx, len, 8);
}
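/*
 * Example of the padding arithmetic: after a 3 byte message r = 3, so
 * plen = 56 - 3 = 53 pad bytes are appended followed by the 8 byte bit
 * count, filling exactly one 64 byte block.  After a 60 byte message
 * r = 60, so plen = 120 - 60 = 60 and the trailer spills into a second
 * block ( 60 + 60 + 8 = 128 bytes ).
 */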
/* SHA-256 initialization. Begins a SHA-256 operation. */
void
SHA256_Init_p(SHA256_CTX_p * ctx)
{
/* Zero bits processed so far */
ctx->count[0] = ctx->count[1] = 0;
/* Magic initialization constants */
ctx->state[0] = 0x6A09E667;
ctx->state[1] = 0xBB67AE85;
ctx->state[2] = 0x3C6EF372;
ctx->state[3] = 0xA54FF53A;
ctx->state[4] = 0x510E527F;
ctx->state[5] = 0x9B05688C;
ctx->state[6] = 0x1F83D9AB;
ctx->state[7] = 0x5BE0CD19;
}
/* Add bytes into the hash */
void
SHA256_Update_p(SHA256_CTX_p * ctx, const void *in, size_t len)
{
uint32_t bitlen[2];
uint32_t r;
const unsigned char *src = in;
/* Number of bytes left in the buffer from previous updates */
r = (ctx->count[1] >> 3) & 0x3f;
/* Convert the length into a number of bits */
bitlen[1] = ((uint32_t)len) << 3;
bitlen[0] = (uint32_t)(len >> 29);
/* Update number of bits */
if ((ctx->count[1] += bitlen[1]) < bitlen[1])
ctx->count[0]++;
ctx->count[0] += bitlen[0];
/* Handle the case where we don't need to perform any transforms */
if (len < 64 - r) {
memcpy(&ctx->buf[r], src, len);
return;
}
/* Finish the current block */
memcpy(&ctx->buf[r], src, 64 - r);
SHA256_Transform_p(ctx->state, ctx->buf);
src += 64 - r;
len -= 64 - r;
/* Perform complete blocks */
while (len >= 64) {
SHA256_Transform_p(ctx->state, src);
src += 64;
len -= 64;
}
/* Copy left over data into buffer */
memcpy(ctx->buf, src, len);
}
/*
* SHA-256 finalization. Pads the input data, exports the hash value,
* and clears the context state.
*/
void
SHA256_Final_p(unsigned char digest[32], SHA256_CTX_p * ctx)
{
/* Add padding */
SHA256_Pad_p(ctx);
/* Write the hash */
be32enc_vect(digest, ctx->state, 32);
/* Clear the context state */
memset((void *)ctx, 0, sizeof(*ctx));
}
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void
SHA256_Buf_p(const void * in, size_t len, uint8_t digest[32])
{
// SHA256_CTX_p ctx;
// uint32_t tmp32[72];
#if defined(__SHA__)
SHA256_CTX ctx;
SHA256_Init(&ctx);
SHA256_Update(&ctx, in, len);
SHA256_Final(digest, &ctx);
#else
SHA256_CTX_p ctx;
SHA256_Init_p(&ctx);
SHA256_Update_p(&ctx, in, len);
SHA256_Final_p(digest, &ctx);
#endif
/* Clean the stack. */
// insecure_memzero(&ctx, sizeof(SHA256_CTX));
// insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void
HMAC_SHA256_Buf_p(const void * K, size_t Klen, const void * in, size_t len,
uint8_t digest[32])
{
HMAC_SHA256_CTX_p ctx;
// uint32_t tmp32[72];
// uint8_t tmp8[96];
HMAC_SHA256_Init_p(&ctx, K, Klen);
HMAC_SHA256_Update_p(&ctx, in, len);
HMAC_SHA256_Final_p(digest, &ctx);
/* Clean the stack. */
// insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
// insecure_memzero(tmp32, 288);
// insecure_memzero(tmp8, 96);
}
/* Initialize an HMAC-SHA256 operation with the given key. */
void
HMAC_SHA256_Init_p(HMAC_SHA256_CTX_p * ctx, const void * _K, size_t Klen)
{
unsigned char pad[64];
unsigned char khash[32];
const unsigned char * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
#if defined(__SHA__)
SHA256_Init(&ctx->ictx);
SHA256_Update(&ctx->ictx, K, Klen);
SHA256_Final(khash, &ctx->ictx);
#else
SHA256_Init_p(&ctx->ictx);
SHA256_Update_p(&ctx->ictx, K, Klen);
SHA256_Final_p(khash, &ctx->ictx);
#endif
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
#if defined(__SHA__)
SHA256_Init(&ctx->ictx);
#else
SHA256_Init_p(&ctx->ictx);
#endif
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
#if defined(__SHA__)
SHA256_Update(&ctx->ictx, pad, 64);
#else
SHA256_Update_p(&ctx->ictx, pad, 64);
#endif
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
#if defined(__SHA__)
SHA256_Init(&ctx->octx);
#else
SHA256_Init_p(&ctx->octx);
#endif
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
#if defined(__SHA__)
SHA256_Update(&ctx->octx, pad, 64);
#else
SHA256_Update_p(&ctx->octx, pad, 64);
#endif
/* Clean the stack. */
//memset(khash, 0, 32);
}
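/*
 * Taken together the two contexts set up above implement the standard
 * construction
 *    HMAC(K, m) = SHA256( (K' ^ opad) || SHA256( (K' ^ ipad) || m ) )
 * where K' is the key zero padded to 64 bytes (or SHA256(K) when the key
 * is longer than one block), ipad is 0x36 repeated and opad is 0x5c
 * repeated.  _Update feeds m into the inner context and _Final chains the
 * inner digest through the outer context.
 */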
/* Add bytes to the HMAC-SHA256 operation. */
void
HMAC_SHA256_Update_p(HMAC_SHA256_CTX_p * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
#if defined(__SHA__)
SHA256_Update(&ctx->ictx, in, len);
#else
SHA256_Update_p(&ctx->ictx, in, len);
#endif
}
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final_p(unsigned char digest[32], HMAC_SHA256_CTX_p * ctx)
{
unsigned char ihash[32];
#if defined(__SHA__)
/* Finish the inner SHA256 operation. */
SHA256_Final(ihash, &ctx->ictx);
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update(&ctx->octx, ihash, 32);
/* Finish the outer SHA256 operation. */
SHA256_Final(digest, &ctx->octx);
#else
/* Finish the inner SHA256 operation. */
SHA256_Final_p(ihash, &ctx->ictx);
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update_p(&ctx->octx, ihash, 32);
/* Finish the outer SHA256 operation. */
SHA256_Final_p(digest, &ctx->octx);
#endif
/* Clean the stack. */
//memset(ihash, 0, 32);
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256_p(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX_p PShctx, hctx;
uint8_t _ALIGN(128) T[32];
uint8_t _ALIGN(128) U[32];
uint8_t ivec[4];
size_t i, clen;
uint64_t j;
int k;
/* Compute HMAC state after processing P and S. */
HMAC_SHA256_Init_p(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update_p(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_p));
HMAC_SHA256_Update_p(&hctx, ivec, 4);
HMAC_SHA256_Final_p(U, &hctx);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
HMAC_SHA256_Init_p(&hctx, passwd, passwdlen);
HMAC_SHA256_Update_p(&hctx, U, 32);
HMAC_SHA256_Final_p(U, &hctx);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
//memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y));
}
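/*
 * A minimal usage sketch (hypothetical values, for illustration only):
 * derive a 32 byte key from a password and salt with 4096 iterations.
 *
 *    uint8_t key[32];
 *    PBKDF2_SHA256_p( (const uint8_t*)"password", 8,
 *                     (const uint8_t*)"salt", 4, 4096, key, sizeof(key) );
 *
 * yespower itself calls PBKDF2 with c = 1 and a large dkLen to expand the
 * input, so the inner loop over j above never executes in that case.
 */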

View File

@@ -33,45 +33,24 @@
#include <stdint.h>
#include <openssl/sha.h>
typedef struct SHA256Context {
uint32_t state[8];
uint32_t count[2];
unsigned char buf[64];
} SHA256_CTX_p;
/*
typedef struct HMAC_SHA256Context {
SHA256_CTX_Y ictx;
SHA256_CTX_Y octx;
} HMAC_SHA256_CTX_Y;
*/
typedef struct HMAC_SHA256Context {
#if defined(__SHA__)
SHA256_CTX ictx;
SHA256_CTX octx;
#else
SHA256_CTX_p ictx;
SHA256_CTX_p octx;
#endif
} HMAC_SHA256_CTX_p;
} HMAC_SHA256_CTX;
void SHA256_Init_p(SHA256_CTX_p *);
void SHA256_Update_p(SHA256_CTX_p *, const void *, size_t);
void SHA256_Final_p(unsigned char [32], SHA256_CTX_p *);
void SHA256_Buf_p(const void * in, size_t len, uint8_t digest[32]);
void HMAC_SHA256_Init_p(HMAC_SHA256_CTX_p *, const void *, size_t);
void HMAC_SHA256_Update_p(HMAC_SHA256_CTX_p *, const void *, size_t);
void HMAC_SHA256_Final_p(unsigned char [32], HMAC_SHA256_CTX_p *);
void HMAC_SHA256_Buf_p(const void * K, size_t Klen, const void * in,
size_t len, uint8_t digest[32]);
void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] );
void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t );
void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t );
void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * );
void HMAC_SHA256_Buf( const void * K, size_t Klen, const void * in,
size_t len, uint8_t digest[32] );
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void PBKDF2_SHA256_p(const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
void PBKDF2_SHA256( const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
#endif /* !_SHA256_H_ */

View File

@@ -62,6 +62,7 @@
#warning "Note: building generic code for non-x86. That's OK."
#endif
*/
/*
* The SSE4 code version has fewer instructions than the generic SSE2 version,
* but all of the instructions are SIMD, thereby wasting the scalar execution
@@ -96,7 +97,7 @@
#include <string.h>
#include "insecure_memzero.h"
#include "sha256.h"
#include "sha256_p.h"
#include "sysendian.h"
#include "yespower.h"
@@ -528,7 +529,7 @@ static volatile uint64_t Smask2var = Smask2;
/* 64-bit without AVX. This relies on out-of-order execution and register
* renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g.,
* it runs great on Haswell. */
//#warning "Note: using x86-64 inline assembly for pwxform. That's great."
#warning "Note: using x86-64 inline assembly for pwxform. That's great."
#undef MAYBE_MEMORY_BARRIER
#define MAYBE_MEMORY_BARRIER \
__asm__("" : : : "memory");

File diff suppressed because it is too large

View File

@@ -51,7 +51,7 @@
#include <stdlib.h>
#include <string.h>
#include "sha256.h"
#include "sha256_p.h"
#include "sysendian.h"
#include "yespower.h"
@@ -534,11 +534,12 @@ int yespower(yespower_local_t *local,
if (pers) {
HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen,
return true;
(uint8_t *)sha256);
SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst);
}
} else {
HMAC_SHA256_Buf((uint8_t *)B + B_size - 64, 64,
HMAC_SHA256_Buf_P((uint8_t *)B + B_size - 64, 64,
sha256, sizeof(sha256), (uint8_t *)dst);
}

View File

@@ -38,7 +38,7 @@ void yespower_hash( const char *input, char *output, uint32_t len )
}
int scanhash_yespower( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -48,6 +48,7 @@ int scanhash_yespower( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
for (int k = 0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);

2023
avxdefs.h

File diff suppressed because it is too large

View File

@@ -100,9 +100,9 @@ rm -f config.status
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-native.exe
#mv cpuminer.exe cpuminer-native.exe
strip -s cpuminer
mv cpuminer cpuminer-native
#mv cpuminer cpuminer-native
make clean || echo done
#make clean || echo done

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.1.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.3.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.9.1.1'
PACKAGE_STRING='cpuminer-opt 3.9.1.1'
PACKAGE_VERSION='3.9.3'
PACKAGE_STRING='cpuminer-opt 3.9.3'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.9.1.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.9.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.9.1.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.9.3:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.9.1.1
cpuminer-opt configure 3.9.3
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.9.1.1, which was
It was created by cpuminer-opt $as_me 3.9.3, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.9.1.1'
VERSION='3.9.3'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.9.1.1, which was
This file was extended by cpuminer-opt $as_me 3.9.3, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.9.1.1
cpuminer-opt config.status 3.9.3
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.9.1.1])
AC_INIT([cpuminer-opt], [3.9.3])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -105,10 +105,12 @@ enum algos opt_algo = ALGO_NULL;
int opt_scrypt_n = 0;
int opt_pluck_n = 128;
int opt_n_threads = 0;
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
__int128_t opt_affinity = -1LL;
// Windows doesn't support 128 bit affinity mask.
#if defined(__linux) && defined(GCC_INT128)
#define AFFINITY_USES_UINT128 1
uint128_t opt_affinity = -1LL;
#else
int64_t opt_affinity = -1LL;
uint64_t opt_affinity = -1LL;
#endif
int opt_priority = 0;
int num_cpus = 1;
@@ -203,7 +205,8 @@ static inline void drop_policy(void)
#define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */
#endif
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
// Linux affinity can use int128.
#if AFFINITY_USES_UINT128
static void affine_to_cpu_mask( int id, unsigned __int128 mask )
#else
static void affine_to_cpu_mask( int id, unsigned long long mask )
@@ -216,7 +219,7 @@ static void affine_to_cpu_mask( int id, unsigned long long mask )
for ( uint8_t i = 0; i < ncpus; i++ )
{
// cpu mask
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
#if AFFINITY_USES_UINT128
if( ( mask & ( (unsigned __int128)1ULL << i ) ) ) CPU_SET( i, &set );
#else
if( (ncpus > 64) || ( mask & (1ULL << i) ) ) CPU_SET( i, &set );
@@ -237,6 +240,7 @@ static void affine_to_cpu_mask( int id, unsigned long long mask )
#elif defined(WIN32) /* Windows */
static inline void drop_policy(void) { }
// Windows CPU groups to manage more than 64 CPUs.
static void affine_to_cpu_mask( int id, unsigned long mask )
{
bool success;
@@ -263,7 +267,7 @@ static void affine_to_cpu_mask( int id, unsigned long mask )
break;
cpu -= cpus;
}
}
if (opt_debug)
applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)", id, cpu, group, (1ULL << cpu));
@@ -847,7 +851,8 @@ static int share_result( int result, struct work *work, const char *reason )
float rate;
char rate_s[8] = {0};
double sharediff = work ? work->sharediff : stratum.sharediff;
bool solved = result && (net_diff > 0.0 ) && ( sharediff >= net_diff );
bool solved = result && accepted_share_count && (net_diff > 0.0 )
&& ( sharediff >= net_diff );
char sol[32] = {0};
int i;
@@ -857,15 +862,17 @@ static int share_result( int result, struct work *work, const char *reason )
hashcount += thr_hashcount[i];
hashrate += thr_hashrates[i];
}
solved = result && ( (uint64_t)hashcount > 0 ) && (net_diff > 0.0 )
&& ( sharediff >= net_diff );
result ? accepted_share_count++ : rejected_share_count++;
if ( solved )
{
solved_block_count++;
if ( use_colors )
sprintf( sol, CL_GRN " Solved" CL_WHT " %d", solved_block_count );
sprintf( sol, CL_GRN " Solved: %d" CL_WHT, solved_block_count );
else
sprintf( sol, " Solved %d", solved_block_count );
sprintf( sol, ", Solved: %d", solved_block_count );
}
pthread_mutex_unlock(&stats_lock);
@@ -1839,26 +1846,42 @@ static void *miner_thread( void *userdata )
}
else
*/
if ( num_cpus > 1 )
{
if ( (opt_affinity == -1LL) && (opt_n_threads) > 1 )
{
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
thr_id, thr_id % num_cpus, ( 1ULL << (thr_id % num_cpus) ) );
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
affine_to_cpu_mask( thr_id,
(unsigned __int128)1LL << (thr_id % num_cpus) );
#if AFFINITY_USES_UINT128
// Default affinity
if ( (opt_affinity == i128_neg1 ) && opt_n_threads > 1 )
{
if ( opt_debug )
applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
thr_id, thr_id % num_cpus,
u128_hi64( (uint128_t)1ULL << (thr_id % num_cpus) ),
u128_lo64( (uint128_t)1ULL << (thr_id % num_cpus) ) );
affine_to_cpu_mask( thr_id, (uint128_t)1ULL << (thr_id % num_cpus) );
}
#else
affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
#endif
}
else if (opt_affinity != -1)
{
if ( (opt_affinity == -1LL) && opt_n_threads > 1 )
{
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu mask %x",
thr_id, opt_affinity);
affine_to_cpu_mask( thr_id, opt_affinity );
applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
thr_id, thr_id % num_cpus, 1LL << (thr_id % num_cpus)) ;
affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
}
#endif
else // Custom affinity
{
#if AFFINITY_USES_UINT128
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to mask %016llx %016llx",
thr_id, u128_hi64( opt_affinity ),
u128_lo64( opt_affinity ) );
#else
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to mask %016llx",
thr_id, opt_affinity );
#endif
affine_to_cpu_mask( thr_id, opt_affinity );
}
}
@@ -2894,13 +2917,21 @@ void parse_arg(int key, char *arg )
break;
case 1020:
p = strstr(arg, "0x");
if (p)
ul = strtoul(p, NULL, 16);
if ( p )
ul = strtoull( p, NULL, 16 );
else
ul = atol(arg);
if (ul > (1UL<<num_cpus)-1)
ul = -1;
opt_affinity = ul;
ul = atoll( arg );
// if ( ul > ( 1ULL << num_cpus ) - 1ULL )
// ul = -1LL;
#if AFFINITY_USES_UINT128
// replicate the low 64 bits to make a full 128 bit mask if there are more
// than 64 CPUs, otherwise zero extend the upper half.
opt_affinity = (uint128_t)ul;
if ( num_cpus > 64 )
opt_affinity = (opt_affinity << 64 ) | (uint128_t)ul;
#else
opt_affinity = ul;
#endif
break;
case 1021:
v = atoi(arg);
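// Sketch of the affinity mask handling shown above (assumed semantics,
// simplified from the hunk): the 64 bit value parsed from --cpu-affinity
// is zero extended into the 128 bit mask, or mirrored into both halves
// when more than 64 CPUs are present.
//
//    uint64_t ul = strtoull( arg, NULL, 0 );
//    uint128_t mask = (uint128_t)ul;              // zero extend
//    if ( num_cpus > 64 )
//       mask = ( mask << 64 ) | (uint128_t)ul;    // replicate low 64 bits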
@@ -3299,20 +3330,18 @@ int main(int argc, char *argv[])
}
if (!rpc_userpass)
{
{
rpc_userpass = (char*) malloc(strlen(rpc_user) + strlen(rpc_pass) + 2);
if (rpc_userpass)
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
else
return 1;
if (rpc_userpass)
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
else
return 1;
}
// All options must be set before starting the gate
if ( !register_algo_gate( opt_algo, &algo_gate ) )
exit(1);
// All options must be set before starting the gate
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);
if ( !check_cpu_capability() )
exit(1);
if ( !check_cpu_capability() ) exit(1);
pthread_mutex_init(&stats_lock, NULL);
pthread_mutex_init(&g_work_lock, NULL);
@@ -3325,7 +3354,7 @@ int main(int argc, char *argv[])
? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL)
: CURL_GLOBAL_ALL;
if (curl_global_init(flags))
{
{
applog(LOG_ERR, "CURL initialization failed");
return 1;
}
@@ -3384,6 +3413,8 @@ int main(int argc, char *argv[])
if ( num_cpus != opt_n_threads )
applog( LOG_INFO,"%u CPU cores available, %u miner threads selected.",
num_cpus, opt_n_threads );
// To be reviewed
if ( opt_affinity != -1 )
{
if ( num_cpus > 64 )

File diff suppressed because it is too large

View File

@@ -363,7 +363,7 @@ struct work {
char *job_id;
size_t xnonce2_len;
unsigned char *xnonce2;
uint32_t nonces[8];
uint32_t nonces[8]; // deprecated
} __attribute__ ((aligned (64)));
struct stratum_job {
@@ -538,6 +538,7 @@ enum algos {
ALGO_SCRYPTJANE,
ALGO_SHA256D,
ALGO_SHA256T,
ALGO_SHA256Q,
ALGO_SHAVITE3,
ALGO_SKEIN,
ALGO_SKEIN2,
@@ -625,6 +626,7 @@ static const char* const algo_names[] = {
"scryptjane",
"sha256d",
"sha256t",
"sha256q",
"shavite3",
"skein",
"skein2",
@@ -774,7 +776,8 @@ Options:\n\
scryptjane:nf\n\
sha256d Double SHA-256\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
shavite3 Shavite3\n\
sha256q Quad SHA-256, Pyrite (PYE)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Signatum (SIGT)\n\

183
simd-utils.h Normal file
View File

@@ -0,0 +1,183 @@
#if !defined(SIMD_UTILS_H__)
#define SIMD_UTILS_H__ 1
//////////////////////////////////////////////////////////////////////
//
// SIMD utilities
//
// Not to be confused with the hashing function of the same name. This
// is about Single Instruction Multiple Data programming using CPU
// features such as SSE and AVX.
//
// This header is the entry point to a suite of macros and functions
// to perform basic operations on vectors that are useful in crypto
// mining. Some of these functions have native CPU support for scalar
// data but not for vectors. The main categories are bit rotation
// and endian byte swapping.
//
// An attempt was made to make the names as similar as possible to
// Intel's intrinsic function format. Most variations are to avoid
// confusion with actual Intel intrinsics, brevity, and clarity.
//
// This suite supports some operations on regular 64 bit integers
// as well as 128 bit integers available on recent versions of Linux
// and GCC.
//
// It also supports various vector sizes on CPUs that meet the minimum
// requirements.
//
// The minimum for any real work is a 64 bit CPU with SSE2,
// i.e. an Intel Core2.
//
// Following are the minimum requirements for each vector size. There
// is no significant 64 bit vectorization, therefore SSE2 is the practical
// minimum for using this code.
//
// MMX: 64 bit vectors
// SSE2: 128 bit vectors (64 bit CPUs only, such as Intel Core2)
// AVX2: 256 bit vectors (Starting with Intel Haswell and AMD Ryzen)
// AVX512: 512 bit vectors (still under development)
//
// Most functions are available at the stated levels but in rare cases
// a higher level feature may be required with no compatible alternative.
// Some SSE2 functions have versions optimized for higher feature levels
// such as SSSE3 or SSE4.1 that will be used automatically on capable
// CPUs.
//
// The vector size boundaries are respected to maintain compatibility.
// For example, an instruction introduced with AVX2 may improve 128 bit
// vector performance but will not be implemented. A CPU with AVX2 will
// tend to use 256 bit vectors. On a practical level AVX512 does introduce
// bit rotation instructions for 128 and 256 bit vectors in addition to
// its own 512 bit vectors. These will not be back ported to replace the
// SW implementations for the smaller vectors. This policy may be reviewed
// in the future once AVX512 is established.
//
// Strict alignment of data is required: 16 bytes for 128 bit vectors,
// 32 bytes for 256 bit vectors and 64 bytes for 512 bit vectors. 64 byte
// alignment is recommended in all cases for best cache alignment.
//
// Windows has problems with function vector arguments larger than
// 128 bits. Stack alignment is only guaranteed to 16 bytes. Always use
// pointers for larger vectors in function arguments. Macros can be
// used for larger value arguments.
//
// As noted above, the names are kept as similar as possible to Intel's
// intrinsic function format.
//
// The main differences are:
//
// - the leading underscore(s) "_" and the "i" are dropped from the
// prefix of vector instructions.
// - "mm64" and "mm128" used for 64 and 128 bit prefix respectively
// to avoid the ambiguity of "mm".
// - the element size does not include additional type specifiers
// like "epi".
// - some macros contain value args that are updated.
// - specialized shift and rotate functions that move elements around
// use the notation "1x32" to indicate the distance moved as units of
// the element size.
// - there is a subset of some functions for scalar data. They may have
// no prefix nor vec-size, just one size, the size of the data.
//
// Function names follow this pattern:
//
// prefix_op[esize]_[vsize]
//
// Prefix: usually the size of the largest vectors used. Following
// are some examples:
//
// u64: unsigned 64 bit integer function
// i128: signed 128 bit integer function
// m128: 128 bit vector identifier
// mm128: 128 bit vector function
//
// op: describes the operation of the function or names the data
// identifier.
//
// esize: optional, element size of operation
//
// vsize: optional, lane size used when a function operates on elements
// of vectors within lanes of a vector.
//
// Ex: mm256_ror1x64_128 rotates each 128 bit lane of a 256 bit vector
// right by 64 bits.
//
// Some random thoughts about macros and inline functions, the pros and
// cons, when to use them, etc:
//
// Macros are very convenient and efficient for statement functions.
// Macro args are substituted textually, so any modification of an arg is
// seen by the caller.
// Macros should not generally call regular functions unless it is for a
// special purpose such as overloading a function name.
// Statement function macros that return a value should not end in ";"
// Statement function macros that return a value and don't modify input args
// may be used in function arguments and expressions.
// Macro args used in expressions should be protected ex: (x)+1
// Macros force inlining, function inlining can be overridden by the compiler.
// Inline functions are preferred when multiple statements or local variables
// are needed.
// The compiler can't do any syntax checking or type checking of args, making
// macros difficult to debug.
// Although it is technically possible to access the caller's data without
// it being passed as arguments, it is good practice to always define
// arguments even if they have the same name.
//
// General guidelines for inline functions:
//
// Inline functions should not have loops; looping defeats the purpose of inlining.
// Inline functions should be short; the benefit is lost and the memory cost
// increases if the function is referenced often.
// Inline functions may call other functions, inlined or not. It is convenient
// for wrapper functions whether or not the wrapped function is itself inlined.
// Care should be taken when unrolling loops that contain calls to inlined
// functions that may be large.
// Large code blocks used only once may use function inlining to
// improve high level code readability without the penalty of function
// overhead.
//
///////////////////////////////////////////////////////
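//
// A short illustration of the naming pattern and of the macro vs inline
// guidance above.  The helpers below are written here purely as a sketch;
// the suite's own definitions may differ:
//
//    // mm128_ror_32( v, c ): prefix mm128 (128 bit vector function),
//    // op "ror" (rotate right), esize 32 (32 bit elements).
//    #define mm128_ror_32( v, c ) \
//       _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
//
//    // Scalar counterpart, a candidate for an inline function rather
//    // than a macro since the compiler may choose not to inline it.
//    static inline uint32_t u32_ror_32( uint32_t x, int c )
//    {  return ( x >> c ) | ( x << ( 32 - c ) );  }
//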
#include <inttypes.h>
#include <x86intrin.h>
#include <memory.h>
#include <stdbool.h>
// byteswap.h doesn't exist on Windows, find alternative
//#include <byteswap.h>
// Various types and overlays
#include "simd-utils/simd-types.h"
// 64 and 128 bit integers.
#include "simd-utils/simd-int.h"
#if defined(__MMX__)
// 64 bit vectors
#include "simd-utils/simd-mmx.h"
#include "simd-utils/intrlv-mmx.h"
#if defined(__SSE2__)
// 128 bit vectors
#include "simd-utils/simd-sse2.h"
#include "simd-utils/intrlv-sse2.h"
#if defined(__AVX2__)
// 256 bit vectors
#include "simd-utils/simd-avx2.h"
#include "simd-utils/intrlv-avx2.h"
// Skylake-X has all these
#if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// 512 bit vectors
#include "simd-utils/simd-avx512.h"
#include "simd-utils/intrlv-avx512.h"
#endif // AVX512
#endif // AVX2
#endif // SSE2
#endif // MMX
#endif // SIMD_UTILS_H__

726
simd-utils/intrlv-avx2.h Normal file
View File

@@ -0,0 +1,726 @@
#if !defined(INTRLV_AVX2_H__)
#define INTRLV_AVX2_H__ 1
#if defined(__AVX2__)
// Convenient short cuts for local use only
// Extract 64 bits from the low 128 bits of 256 bit vector.
#define extr64_cast128_256( a, n ) \
_mm_extract_epi64( _mm256_castsi256_si128( a ), n )
// Extract 32 bits from the low 128 bits of 256 bit vector.
#define extr32_cast128_256( a, n ) \
_mm_extract_epi32( _mm256_castsi256_si128( a ), n )
///////////////////////////////////////////////////////////
//
// AVX2 256 Bit Vectors
//
#define mm256_put_64( s0, s1, s2, s3) \
_mm256_set_epi64x( *((const uint64_t*)(s3)), *((const uint64_t*)(s2)), \
*((const uint64_t*)(s1)), *((const uint64_t*)(s0)) )
#define mm256_put_32( s00, s01, s02, s03, s04, s05, s06, s07 ) \
_mm256_set_epi32( *((const uint32_t*)(s07)), *((const uint32_t*)(s06)), \
*((const uint32_t*)(s05)), *((const uint32_t*)(s04)), \
*((const uint32_t*)(s03)), *((const uint32_t*)(s02)), \
*((const uint32_t*)(s01)), *((const uint32_t*)(s00)) )
#define mm256_get_64( s, i0, i1, i2, i3 ) \
_mm256_set_epi64x( ((const uint64_t*)(s))[i3], ((const uint64_t*)(s))[i2], \
((const uint64_t*)(s))[i1], ((const uint64_t*)(s))[i0] )
#define mm256_get_32( s, i0, i1, i2, i3, i4, i5, i6, i7 ) \
_mm256_set_epi32( ((const uint32_t*)(s))[i7], ((const uint32_t*)(s))[i6], \
((const uint32_t*)(s))[i5], ((const uint32_t*)(s))[i4], \
((const uint32_t*)(s))[i3], ((const uint32_t*)(s))[i2], \
((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
// Blend 2 vectors alternating hi & lo: { hi[n], lo[n-1], ... hi[1], lo[0] }
#define mm256_intrlv_blend_128( hi, lo ) \
_mm256_blend_epi32( hi, lo, 0x0f )
#define mm256_intrlv_blend_64( hi, lo ) \
_mm256_blend_epi32( hi, lo, 0x33 )
#define mm256_intrlv_blend_32( hi, lo ) \
_mm256_blend_epi32( hi, lo, 0x55 )
// Interleave 8x32_256
#define mm256_intrlv_8x32_256( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
do { \
__m128i s0hi = mm128_extr_hi128_256( s0 ); \
__m128i s1hi = mm128_extr_hi128_256( s1 ); \
__m128i s2hi = mm128_extr_hi128_256( s2 ); \
__m128i s3hi = mm128_extr_hi128_256( s3 ); \
__m128i s4hi = mm128_extr_hi128_256( s4 ); \
__m128i s5hi = mm128_extr_hi128_256( s5 ); \
__m128i s6hi = mm128_extr_hi128_256( s6 ); \
__m128i s7hi = mm128_extr_hi128_256( s7 ); \
casti_m256i( d,0 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,0), extr32_cast128_256(s6,0), \
extr32_cast128_256(s5,0), extr32_cast128_256(s4,0), \
extr32_cast128_256(s3,0), extr32_cast128_256(s2,0), \
extr32_cast128_256(s1,0), extr32_cast128_256(s0,0) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,1), extr32_cast128_256(s6,1), \
extr32_cast128_256(s5,1), extr32_cast128_256(s4,1), \
extr32_cast128_256(s3,1), extr32_cast128_256(s2,1), \
extr32_cast128_256(s1,1), extr32_cast128_256(s0,1) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,2), extr32_cast128_256(s6,2), \
extr32_cast128_256(s5,2), extr32_cast128_256(s4,2), \
extr32_cast128_256(s3,2), extr32_cast128_256(s2,2), \
extr32_cast128_256(s1,2), extr32_cast128_256(s0,2) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
extr32_cast128_256(s7,3), extr32_cast128_256(s6,3), \
extr32_cast128_256(s5,3), extr32_cast128_256(s4,3), \
extr32_cast128_256(s3,3), extr32_cast128_256(s2,3), \
extr32_cast128_256(s1,3), extr32_cast128_256(s0,3) ); \
casti_m256i( d,4 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,0), mm128_extr_32(s6hi,0), \
mm128_extr_32(s5hi,0), mm128_extr_32(s4hi,0), \
mm128_extr_32(s3hi,0), mm128_extr_32(s2hi,0), \
mm128_extr_32(s1hi,0), mm128_extr_32(s0hi,0) ); \
casti_m256i( d,5 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,1), mm128_extr_32(s6hi,1), \
mm128_extr_32(s5hi,1), mm128_extr_32(s4hi,1), \
mm128_extr_32(s3hi,1), mm128_extr_32(s2hi,1), \
mm128_extr_32(s1hi,1), mm128_extr_32(s0hi,1) ); \
casti_m256i( d,6 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,2), mm128_extr_32(s6hi,2), \
mm128_extr_32(s5hi,2), mm128_extr_32(s4hi,2), \
mm128_extr_32(s3hi,2), mm128_extr_32(s2hi,2), \
mm128_extr_32(s1hi,2), mm128_extr_32(s0hi,2) ); \
casti_m256i( d,7 ) = _mm256_set_epi32( \
mm128_extr_32(s7hi,3), mm128_extr_32(s6hi,3), \
mm128_extr_32(s5hi,3), mm128_extr_32(s4hi,3), \
mm128_extr_32(s3hi,3), mm128_extr_32(s2hi,3), \
mm128_extr_32(s1hi,3), mm128_extr_32(s0hi,3) ); \
} while(0)
#define mm256_intrlv_8x32_128( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
do { \
casti_m256i( d,0 ) = _mm256_set_epi32( \
mm128_extr_32(s7,0), mm128_extr_32(s6,0), \
mm128_extr_32(s5,0), mm128_extr_32(s4,0), \
mm128_extr_32(s3,0), mm128_extr_32(s2,0), \
mm128_extr_32(s1,0), mm128_extr_32(s0,0) ); \
casti_m256i( d,1 ) = _mm256_set_epi32( \
mm128_extr_32(s7,1), mm128_extr_32(s6,1), \
mm128_extr_32(s5,1), mm128_extr_32(s4,1), \
mm128_extr_32(s3,1), mm128_extr_32(s2,1), \
mm128_extr_32(s1,1), mm128_extr_32(s0,1) ); \
casti_m256i( d,2 ) = _mm256_set_epi32( \
mm128_extr_32(s7,2), mm128_extr_32(s6,2), \
mm128_extr_32(s5,2), mm128_extr_32(s4,2), \
mm128_extr_32(s3,2), mm128_extr_32(s2,2), \
mm128_extr_32(s1,2), mm128_extr_32(s0,2) ); \
casti_m256i( d,3 ) = _mm256_set_epi32( \
mm128_extr_32(s7,3), mm128_extr_32(s6,3), \
mm128_extr_32(s5,3), mm128_extr_32(s4,3), \
mm128_extr_32(s3,3), mm128_extr_32(s2,3), \
mm128_extr_32(s1,3), mm128_extr_32(s0,3) ); \
} while(0)
#define mm256_bswap_intrlv_8x32_256( d, src ) \
do { \
__m256i s0 = mm256_bswap_32( src ); \
__m128i s1 = _mm256_extracti128_si256( s0, 1 ); \
casti_m256i( d, 0 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 0 ) ); \
casti_m256i( d, 1 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 1 ) ); \
casti_m256i( d, 2 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 2 ) ); \
casti_m256i( d, 3 ) = _mm256_set1_epi32( _mm_extract_epi32( \
_mm256_castsi256_si128( s0 ), 3 ) ); \
casti_m256i( d, 4 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 0 ) ); \
casti_m256i( d, 5 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 1 ) ); \
casti_m256i( d, 6 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 2 ) ); \
casti_m256i( d, 7 ) = _mm256_set1_epi32( _mm_extract_epi32( s1, 3 ) ); \
} while(0)
#define mm256_bswap_intrlv_8x32_128( d, src ) \
do { \
__m128i ss = mm128_bswap_32( src ); \
casti_m256i( d, 0 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 0 ) ); \
casti_m256i( d, 1 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 1 ) ); \
casti_m256i( d, 2 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 2 ) ); \
casti_m256i( d, 3 ) = _mm256_set1_epi32( _mm_extract_epi32( ss, 3 ) ); \
} while(0)
#define mm256_dintrlv_8x32_256( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
do { \
__m256i s0 = casti_m256i(s,0); \
__m256i s1 = casti_m256i(s,1); \
__m256i s2 = casti_m256i(s,2); \
__m256i s3 = casti_m256i(s,3); \
__m256i s4 = casti_m256i(s,4); \
__m256i s5 = casti_m256i(s,5); \
__m256i s6 = casti_m256i(s,6); \
__m256i s7 = casti_m256i(s,7); \
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
__m128i s4hi = _mm256_extracti128_si256( s4, 1 ); \
__m128i s5hi = _mm256_extracti128_si256( s5, 1 ); \
__m128i s6hi = _mm256_extracti128_si256( s6, 1 ); \
__m128i s7hi = _mm256_extracti128_si256( s7, 1 ); \
d0 = _mm256_set_epi32( \
extr32_cast128_256( s7, 0 ), extr32_cast128_256( s6, 0 ), \
extr32_cast128_256( s5, 0 ), extr32_cast128_256( s4, 0 ), \
extr32_cast128_256( s3, 0 ), extr32_cast128_256( s2, 0 ), \
extr32_cast128_256( s1, 0 ), extr32_cast128_256( s0, 0 ) );\
d1 = _mm256_set_epi32( \
extr32_cast128_256( s7, 1 ), extr32_cast128_256( s6, 1 ), \
extr32_cast128_256( s5, 1 ), extr32_cast128_256( s4, 1 ), \
extr32_cast128_256( s3, 1 ), extr32_cast128_256( s2, 1 ), \
extr32_cast128_256( s1, 1 ), extr32_cast128_256( s0, 1 ) );\
d2 = _mm256_set_epi32( \
extr32_cast128_256( s7, 2 ), extr32_cast128_256( s6, 2 ), \
extr32_cast128_256( s5, 2 ), extr32_cast128_256( s4, 2 ), \
extr32_cast128_256( s3, 2 ), extr32_cast128_256( s2, 2 ), \
extr32_cast128_256( s1, 2 ), extr32_cast128_256( s0, 2 ) );\
d3 = _mm256_set_epi32( \
extr32_cast128_256( s7, 3 ), extr32_cast128_256( s6, 3 ), \
extr32_cast128_256( s5, 3 ), extr32_cast128_256( s4, 3 ), \
extr32_cast128_256( s3, 3 ), extr32_cast128_256( s2, 3 ), \
extr32_cast128_256( s1, 3 ), extr32_cast128_256( s0, 3 ) );\
d4 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 0 ), _mm_extract_epi32( s6hi, 0 ), \
_mm_extract_epi32( s5hi, 0 ), _mm_extract_epi32( s4hi, 0 ), \
_mm_extract_epi32( s3hi, 0 ), _mm_extract_epi32( s2hi, 0 ), \
_mm_extract_epi32( s1hi, 0 ), _mm_extract_epi32( s0hi, 0 ) ); \
d5 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 1 ), _mm_extract_epi32( s6hi, 1 ), \
_mm_extract_epi32( s5hi, 1 ), _mm_extract_epi32( s4hi, 1 ), \
_mm_extract_epi32( s3hi, 1 ), _mm_extract_epi32( s2hi, 1 ), \
_mm_extract_epi32( s1hi, 1 ), _mm_extract_epi32( s0hi, 1 ) ); \
d6 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 2 ), _mm_extract_epi32( s6hi, 2 ), \
_mm_extract_epi32( s5hi, 2 ), _mm_extract_epi32( s4hi, 2 ), \
_mm_extract_epi32( s3hi, 2 ), _mm_extract_epi32( s2hi, 2 ), \
_mm_extract_epi32( s1hi, 2 ), _mm_extract_epi32( s0hi, 2 ) ); \
d7 = _mm256_set_epi32( \
_mm_extract_epi32( s7hi, 3 ), _mm_extract_epi32( s6hi, 3 ), \
_mm_extract_epi32( s5hi, 3 ), _mm_extract_epi32( s4hi, 3 ), \
_mm_extract_epi32( s3hi, 3 ), _mm_extract_epi32( s2hi, 3 ), \
_mm_extract_epi32( s1hi, 3 ), _mm_extract_epi32( s0hi, 3 ) ); \
} while(0)
#define mm128_dintrlv_8x32_128( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
do { \
__m128i s0 = casti_m128i(s,0); \
__m128i s1 = casti_m128i(s,1); \
__m128i s2 = casti_m128i(s,2); \
__m128i s3 = casti_m128i(s,3); \
__m128i s4 = casti_m128i(s,4); \
__m128i s5 = casti_m128i(s,5); \
__m128i s6 = casti_m128i(s,6); \
__m128i s7 = casti_m128i(s,7); \
/* Word w of lane n sits at 32 bit index w*8 + n of the source, so each */ \
/* lane gathers one element from every second source vector. */ \
d0 = _mm_set_epi32( \
_mm_extract_epi32( s6, 0 ), _mm_extract_epi32( s4, 0 ), \
_mm_extract_epi32( s2, 0 ), _mm_extract_epi32( s0, 0 ) ); \
d1 = _mm_set_epi32( \
_mm_extract_epi32( s6, 1 ), _mm_extract_epi32( s4, 1 ), \
_mm_extract_epi32( s2, 1 ), _mm_extract_epi32( s0, 1 ) ); \
d2 = _mm_set_epi32( \
_mm_extract_epi32( s6, 2 ), _mm_extract_epi32( s4, 2 ), \
_mm_extract_epi32( s2, 2 ), _mm_extract_epi32( s0, 2 ) ); \
d3 = _mm_set_epi32( \
_mm_extract_epi32( s6, 3 ), _mm_extract_epi32( s4, 3 ), \
_mm_extract_epi32( s2, 3 ), _mm_extract_epi32( s0, 3 ) ); \
d4 = _mm_set_epi32( \
_mm_extract_epi32( s7, 0 ), _mm_extract_epi32( s5, 0 ), \
_mm_extract_epi32( s3, 0 ), _mm_extract_epi32( s1, 0 ) ); \
d5 = _mm_set_epi32( \
_mm_extract_epi32( s7, 1 ), _mm_extract_epi32( s5, 1 ), \
_mm_extract_epi32( s3, 1 ), _mm_extract_epi32( s1, 1 ) ); \
d6 = _mm_set_epi32( \
_mm_extract_epi32( s7, 2 ), _mm_extract_epi32( s5, 2 ), \
_mm_extract_epi32( s3, 2 ), _mm_extract_epi32( s1, 2 ) ); \
d7 = _mm_set_epi32( \
_mm_extract_epi32( s7, 3 ), _mm_extract_epi32( s5, 3 ), \
_mm_extract_epi32( s3, 3 ), _mm_extract_epi32( s1, 3 ) ); \
} while(0)
#define mm256_intrlv_4x64_256( d, s0, s1, s2, s3 ) \
do { \
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); \
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); \
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); \
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); \
casti_m256i( d,0 ) = _mm256_set_epi64x( \
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ), \
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi64x( \
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ), \
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) ); \
casti_m256i( d,2 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3hi,0 ), _mm_extract_epi64( s2hi,0 ), \
_mm_extract_epi64( s1hi,0 ), _mm_extract_epi64( s0hi,0 ) ); \
casti_m256i( d,3 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3hi,1 ), _mm_extract_epi64( s2hi,1 ), \
_mm_extract_epi64( s1hi,1 ), _mm_extract_epi64( s0hi,1 ) ); \
} while(0)
#define mm256_intrlv_4x64_128( d, s0, s1, s2, s3 ) \
do { \
casti_m256i( d,0 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3, 0 ), _mm_extract_epi64( s2, 0 ), \
_mm_extract_epi64( s1, 0 ), _mm_extract_epi64( s0, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set_epi64x( \
_mm_extract_epi64( s3, 1 ), _mm_extract_epi64( s2, 1 ), \
_mm_extract_epi64( s1, 1 ), _mm_extract_epi64( s0, 1 ) ); \
} while(0)
#define mm256_bswap_intrlv_4x64_256( d, src ) \
do { \
__m256i s0 = mm256_bswap_32( src ); \
__m128i s1 = _mm256_extracti128_si256( s0, 1 ); \
casti_m256i( d,0 ) = _mm256_set1_epi64x( _mm_extract_epi64( \
_mm256_castsi256_si128( s0 ), 0 ) ); \
casti_m256i( d,1 ) = _mm256_set1_epi64x( _mm_extract_epi64( \
_mm256_castsi256_si128( s0 ), 1 ) ); \
casti_m256i( d,2 ) = _mm256_set1_epi64x( _mm_extract_epi64( s1, 0 ) ); \
casti_m256i( d,3 ) = _mm256_set1_epi64x( _mm_extract_epi64( s1, 1 ) ); \
} while(0)
#define mm256_bswap_intrlv_4x64_128( d, src ) \
do { \
__m128i ss = mm128_bswap_32( src ); \
casti_m256i( d,0 ) = _mm256_set1_epi64x( _mm_extract_epi64( ss, 0 ) ); \
casti_m256i( d,1 ) = _mm256_set1_epi64x( _mm_extract_epi64( ss, 1 ) ); \
} while(0)
// 4 lanes of 256 bits using 64 bit interleaving (standard final hash size)
static inline void mm256_dintrlv_4x64_256( void *d0, void *d1, void *d2,
void *d3, const int n, const void *src )
{
__m256i s0 = *( (__m256i*) src ); // s[0][1:0]
__m256i s1 = *( (__m256i*)(src+32) ); // s[1][1:0]
__m256i s2 = *( (__m256i*)(src+64) ); // s[2][1:0]
__m256i s3 = *( (__m256i*)(src+96) ); // s[3][1:0]
__m128i s0hi = _mm256_extracti128_si256( s0, 1 ); // s[0][3:2]
__m128i s1hi = _mm256_extracti128_si256( s1, 1 ); // s[1][3:2]
__m128i s2hi = _mm256_extracti128_si256( s2, 1 ); // s[2][3:2]
__m128i s3hi = _mm256_extracti128_si256( s3, 1 ); // s[3][3:2]
casti_m256i( d0,n ) = _mm256_set_epi64x(
extr64_cast128_256( s3, 0 ), extr64_cast128_256( s2, 0 ),
extr64_cast128_256( s1, 0 ), extr64_cast128_256( s0, 0 ) );
casti_m256i( d1,n ) = _mm256_set_epi64x(
extr64_cast128_256( s3, 1 ), extr64_cast128_256( s2, 1 ),
extr64_cast128_256( s1, 1 ), extr64_cast128_256( s0, 1 ) );
casti_m256i( d2,n ) = _mm256_set_epi64x(
_mm_extract_epi64( s3hi, 0 ), _mm_extract_epi64( s2hi, 0 ),
_mm_extract_epi64( s1hi, 0 ), _mm_extract_epi64( s0hi, 0 ) );
casti_m256i( d3,n ) = _mm256_set_epi64x(
_mm_extract_epi64( s3hi, 1 ), _mm_extract_epi64( s2hi, 1 ),
_mm_extract_epi64( s1hi, 1 ), _mm_extract_epi64( s0hi, 1 ) );
}
// quarter avx2 block, 16 bytes * 4 lanes
// 4 lanes of 128 bits using 64 bit interleaving
// Used for last 16 bytes of 80 byte input, only used for testing.
static inline void mm128_dintrlv_4x64_128( void *d0, void *d1, void *d2,
void *d3, const int n, const void *src )
{
__m256i s0 = *( (__m256i*) src );
__m256i s1 = *( (__m256i*)(src+32) );
__m128i s0hi = _mm256_extracti128_si256( s0, 1 );
__m128i s1hi = _mm256_extracti128_si256( s1, 1 );
casti_m128i( d0,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 0 ),
extr64_cast128_256( s0 , 0 ) );
casti_m128i( d1,n ) = _mm_set_epi64x( extr64_cast128_256( s1 , 1 ),
extr64_cast128_256( s0 , 1 ) );
casti_m128i( d2,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 0 ),
_mm_extract_epi64( s0hi, 0 ) );
casti_m128i( d3,n ) = _mm_set_epi64x( _mm_extract_epi64( s1hi, 1 ),
_mm_extract_epi64( s0hi, 1 ) );
}
/*
static inline void mm256_dintrlv_2x128x256( void *d0, void *d1,
const int n, const void *s )
{
casti_m256i( d0,n ) = mm256_get_64( s, 0, 1, 4, 5 );
casti_m256i( d1,n ) = mm256_get_64( s, 2, 3, 6, 7 );
}
*/
//
// Interleave 8 source buffers containing 32 bit data into the destination
// vector
#define mm256_interleave_8x32 mm256_intrlv_8x32
static inline void mm256_intrlv_8x32( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, const void *s4,
const void *s5, const void *s6, const void *s7, int bit_len )
{
mm256_intrlv_8x32_256( d , casti_m256i( s0,0 ), casti_m256i( s1,0 ),
casti_m256i( s2,0 ), casti_m256i( s3,0 ), casti_m256i( s4,0 ),
casti_m256i( s5,0 ), casti_m256i( s6,0 ), casti_m256i( s7,0 ) );
if ( bit_len <= 256 ) return;
mm256_intrlv_8x32_256( d+256, casti_m256i( s0,1 ), casti_m256i( s1,1 ),
casti_m256i( s2,1 ), casti_m256i( s3,1 ), casti_m256i( s4,1 ),
casti_m256i( s5,1 ), casti_m256i( s6,1 ), casti_m256i( s7,1 ) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm256_intrlv_8x32_128( d+512, casti_m128i( s0,4 ), casti_m128i( s1,4 ),
casti_m128i( s2,4 ), casti_m128i( s3,4 ), casti_m128i( s4,4 ),
casti_m128i( s5,4 ), casti_m128i( s6,4 ), casti_m128i( s7,4 ) );
return;
}
mm256_intrlv_8x32_256( d+512, casti_m256i( s0,2 ), casti_m256i( s1,2 ),
casti_m256i( s2,2 ), casti_m256i( s3,2 ), casti_m256i( s4,2 ),
casti_m256i( s5,2 ), casti_m256i( s6,2 ), casti_m256i( s7,2 ) );
mm256_intrlv_8x32_256( d+768, casti_m256i( s0,3 ), casti_m256i( s1,3 ),
casti_m256i( s2,3 ), casti_m256i( s3,3 ), casti_m256i( s4,3 ),
casti_m256i( s5,3 ), casti_m256i( s6,3 ), casti_m256i( s7,3 ) );
// bit_len == 1024
}
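// For reference, the layout produced above matches this scalar sketch
// (hypothetical helper, shown only to document the layout): word w of
// lane n lands at 32 bit index w*8 + n of the destination.
//
//    static inline void intrlv_8x32_ref( uint32_t *d, const uint32_t *s[8],
//                                        int words )
//    {
//       for ( int w = 0; w < words; w++ )
//          for ( int n = 0; n < 8; n++ )
//             d[ w*8 + n ] = s[n][w];
//    }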
// Interleave 80 bytes of 32 bit data for 8 lanes.
static inline void mm256_bswap_intrlv80_8x32( void *d, const void *s )
{
mm256_bswap_intrlv_8x32_256( d , casti_m256i( s, 0 ) );
mm256_bswap_intrlv_8x32_256( d+256, casti_m256i( s, 1 ) );
mm256_bswap_intrlv_8x32_128( d+512, casti_m128i( s, 4 ) );
}
// Deinterleave 8 buffers of 32 bit data from the source buffer.
// Sub-function can be called directly for 32 byte final hash.
#define mm256_deinterleave_8x32 mm256_dintrlv_8x32
static inline void mm256_dintrlv_8x32( void *d0, void *d1, void *d2,
void *d3, void *d4, void *d5, void *d6, void *d7,
const void *s, int bit_len )
{
mm256_dintrlv_8x32_256( casti_m256i(d0,0), casti_m256i(d1,0),
casti_m256i(d2,0), casti_m256i(d3,0), casti_m256i(d4,0),
casti_m256i(d5,0), casti_m256i(d6,0), casti_m256i(d7,0), s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_8x32_256( casti_m256i(d0,1), casti_m256i(d1,1),
casti_m256i(d2,1), casti_m256i(d3,1), casti_m256i(d4,1),
casti_m256i(d5,1), casti_m256i(d6,1), casti_m256i(d7,1), s+256 );
if ( bit_len <= 512 ) return;
// short block, final 16 bytes of input data
if ( bit_len <= 640 )
{
mm128_dintrlv_8x32_128( casti_m128i(d0,2), casti_m128i(d1,2),
casti_m128i(d2,2), casti_m128i(d3,2), casti_m128i(d4,2),
casti_m128i(d5,2), casti_m128i(d6,2), casti_m128i(d7,2), s+512 );
return;
}
// bitlen == 1024
mm256_dintrlv_8x32_256( casti_m256i(d0,2), casti_m256i(d1,2),
casti_m256i(d2,2), casti_m256i(d3,2), casti_m256i(d4,2),
casti_m256i(d5,2), casti_m256i(d6,2), casti_m256i(d7,2), s+512 );
mm256_dintrlv_8x32_256( casti_m256i(d0,3), casti_m256i(d1,3),
casti_m256i(d2,3), casti_m256i(d3,3), casti_m256i(d4,3),
casti_m256i(d5,3), casti_m256i(d6,3), casti_m256i(d7,3), s+768 );
}
static inline void mm256_extract_lane_8x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d,0 ) = mm256_get_32(s, lane , lane+ 8, lane+ 16, lane+ 24,
lane+32, lane+ 40, lane+ 48, lane+ 56 );
if ( bit_len <= 256 ) return;
casti_m256i( d,1 ) = mm256_get_32(s, lane+64, lane+ 72, lane+ 80, lane+ 88,
lane+96, lane+104, lane+112, lane+120 );
// bit_len == 512
}
// Interleave 4 source buffers containing 64 bit data into the destination
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
#define mm256_interleave_4x64 mm256_intrlv_4x64
static inline void mm256_intrlv_4x64( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, int bit_len )
{
mm256_intrlv_4x64_256( d , casti_m256i(s0,0), casti_m256i(s1,0),
casti_m256i(s2,0), casti_m256i(s3,0) );
if ( bit_len <= 256 ) return;
mm256_intrlv_4x64_256( d+128, casti_m256i(s0,1), casti_m256i(s1,1),
casti_m256i(s2,1), casti_m256i(s3,1) );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm256_intrlv_4x64_128( d+256, casti_m128i(s0,4), casti_m128i(s1,4),
casti_m128i(s2,4), casti_m128i(s3,4) );
return;
}
// bit_len == 1024
mm256_intrlv_4x64_256( d+256, casti_m256i(s0,2), casti_m256i(s1,2),
casti_m256i(s2,2), casti_m256i(s3,2) );
mm256_intrlv_4x64_256( d+384, casti_m256i(s0,3), casti_m256i(s1,3),
casti_m256i(s2,3), casti_m256i(s3,3) );
}
// Byte-swap and interleave 80 bytes of data for 4 lanes using 64 bit interleaving.
static inline void mm256_bswap_intrlv80_4x64( void *d, const void *s )
{
mm256_bswap_intrlv_4x64_256( d , casti_m256i( s, 0 ) );
mm256_bswap_intrlv_4x64_256( d+128, casti_m256i( s, 1 ) );
mm256_bswap_intrlv_4x64_128( d+256, casti_m128i( s, 4 ) );
}
// Deinterleave 4 buffers of 64 bit data from the source buffer.
// bit_len must be 256, 512, 640 or 1024 bits.
// Requires overrun padding for 640 bit len.
#define mm256_deinterleave_4x64 mm256_dintrlv_4x64
static inline void mm256_dintrlv_4x64( void *d0, void *d1, void *d2,
void *d3, const void *s, int bit_len )
{
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 0, s );
if ( bit_len <= 256 ) return;
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 1, s+128 );
if ( bit_len <= 512 ) return;
// short block, final 16 bytes of input data
if ( bit_len <= 640 )
{
mm128_dintrlv_4x64_128( d0, d1, d2, d3, 4, s+256 );
return;
}
// bit_len == 1024
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 2, s+256 );
mm256_dintrlv_4x64_256( d0, d1, d2, d3, 3, s+384 );
}
// extract and deinterleave specified lane.
#define mm256_extract_lane_4x64_256 \
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 )
static inline void mm256_extract_lane_4x64( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m256i( d, 0 ) = mm256_get_64( s, lane, lane+4, lane+8, lane+12 );
if ( bit_len <= 256 ) return;
casti_m256i( d, 1 ) = mm256_get_64( s, lane+16, lane+20, lane+24, lane+28 );
return;
}
// Convert from 4x32 SSE2 interleaving to 4x64 AVX2.
// Can't do it in place
#define mm256_reinterleave_4x32_4x64 mm256_rintrlv_4x32_4x64
static inline void mm256_rintrlv_4x32_4x64( void *dst, void *src,
int bit_len )
{
__m256i* d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[ 7],s[ 3],s[ 6],s[ 2],s[ 5],s[ 1],s[ 4],s[ 0] );
d[1] = _mm256_set_epi32( s[15],s[11],s[14],s[10],s[13],s[ 9],s[12],s[ 8] );
d[2] = _mm256_set_epi32( s[23],s[19],s[22],s[18],s[21],s[17],s[20],s[16] );
d[3] = _mm256_set_epi32( s[31],s[27],s[30],s[26],s[29],s[25],s[28],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[35],s[38],s[34],s[37],s[33],s[36],s[32] );
d[5] = _mm256_set_epi32( s[47],s[43],s[46],s[42],s[45],s[41],s[44],s[40] );
d[6] = _mm256_set_epi32( s[55],s[51],s[54],s[50],s[53],s[49],s[52],s[48] );
d[7] = _mm256_set_epi32( s[63],s[59],s[62],s[58],s[61],s[57],s[60],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[67],s[70],s[66],s[69],s[65],s[68],s[64] );
d[9] = _mm256_set_epi32( s[79],s[75],s[78],s[74],s[77],s[73],s[76],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32(s[87],s[83],s[86],s[82],s[85],s[81],s[84],s[80]);
d[11] = _mm256_set_epi32(s[95],s[91],s[94],s[90],s[93],s[89],s[92],s[88]);
d[12] = _mm256_set_epi32(s[103],s[99],s[102],s[98],s[101],s[97],s[100],s[96]);
d[13] = _mm256_set_epi32(s[111],s[107],s[110],s[106],s[109],s[105],s[108],s[104]);
d[14] = _mm256_set_epi32(s[119],s[115],s[118],s[114],s[117],s[113],s[116],s[112]);
d[15] = _mm256_set_epi32(s[127],s[123],s[126],s[122],s[125],s[121],s[124],s[120]);
// bit_len == 1024
}
// Convert 4x64 (256 bit) interleaving to 4x32 (128 bit) interleaving for AVX.
// bit_len must be a multiple of 64.
#define mm256_reinterleave_4x64_4x32 mm256_rintrlv_4x64_4x32
static inline void mm256_rintrlv_4x64_4x32( void *dst, void *src,
int bit_len )
{
__m256i *d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[ 7],s[ 5],s[ 3],s[ 1],s[ 6],s[ 4],s[ 2],s[ 0] );
d[1] = _mm256_set_epi32( s[15],s[13],s[11],s[ 9],s[14],s[12],s[10],s[ 8] );
d[2] = _mm256_set_epi32( s[23],s[21],s[19],s[17],s[22],s[20],s[18],s[16] );
d[3] = _mm256_set_epi32( s[31],s[29],s[27],s[25],s[30],s[28],s[26],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[37],s[35],s[33],s[38],s[36],s[34],s[32] );
d[5] = _mm256_set_epi32( s[47],s[45],s[43],s[41],s[46],s[44],s[42],s[40] );
d[6] = _mm256_set_epi32( s[55],s[53],s[51],s[49],s[54],s[52],s[50],s[48] );
d[7] = _mm256_set_epi32( s[63],s[61],s[59],s[57],s[62],s[60],s[58],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[69],s[67],s[65],s[70],s[68],s[66],s[64] );
d[9] = _mm256_set_epi32( s[79],s[77],s[75],s[73],s[78],s[76],s[74],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32( s[87],s[85],s[83],s[81],s[86],s[84],s[82],s[80] );
d[11] = _mm256_set_epi32( s[95],s[93],s[91],s[89],s[94],s[92],s[90],s[88] );
d[12] = _mm256_set_epi32( s[103],s[101],s[99],s[97],s[102],s[100],s[98],s[96] );
d[13] = _mm256_set_epi32( s[111],s[109],s[107],s[105],s[110],s[108],s[106],s[104] );
d[14] = _mm256_set_epi32( s[119],s[117],s[115],s[113],s[118],s[116],s[114],s[112] );
d[15] = _mm256_set_epi32( s[127],s[125],s[123],s[121],s[126],s[124],s[122],s[120] );
// bit_len == 1024
}
#define mm256_reinterleave_4x64_2x128 mm256_rintrlv_4x64_2x128
static inline void mm256_rintrlv_4x64_2x128( void *dst0, void *dst1,
const void *src, int bit_len )
{
__m256i* d0 = (__m256i*)dst0;
__m256i* d1 = (__m256i*)dst1;
uint64_t *s = (uint64_t*)src;
d0[0] = _mm256_set_epi64x( s[ 5], s[ 1], s[ 4], s[ 0] );
d1[0] = _mm256_set_epi64x( s[ 7], s[ 3], s[ 6], s[ 2] );
d0[1] = _mm256_set_epi64x( s[13], s[ 9], s[12], s[ 8] );
d1[1] = _mm256_set_epi64x( s[15], s[11], s[14], s[10] );
if ( bit_len <= 256 ) return;
d0[2] = _mm256_set_epi64x( s[21], s[17], s[20], s[16] );
d1[2] = _mm256_set_epi64x( s[23], s[19], s[22], s[18] );
d0[3] = _mm256_set_epi64x( s[29], s[25], s[28], s[24] );
d1[3] = _mm256_set_epi64x( s[31], s[27], s[30], s[26] );
if ( bit_len <= 512 ) return;
d0[4] = _mm256_set_epi64x( s[37], s[33], s[36], s[32] );
d1[4] = _mm256_set_epi64x( s[39], s[35], s[38], s[34] );
d0[5] = _mm256_set_epi64x( s[45], s[41], s[44], s[40] );
d1[5] = _mm256_set_epi64x( s[47], s[43], s[46], s[42] );
d0[6] = _mm256_set_epi64x( s[53], s[49], s[52], s[48] );
d1[6] = _mm256_set_epi64x( s[55], s[51], s[54], s[50] );
d0[7] = _mm256_set_epi64x( s[61], s[57], s[60], s[56] );
d1[7] = _mm256_set_epi64x( s[63], s[59], s[62], s[58] );
}
#define mm256_reinterleave_2x128_4x64 mm256_rintrlv_2x128_4x64
static inline void mm256_rintrlv_2x128_4x64( void *dst, const void *src0,
const void *src1, int bit_len )
{
__m256i* d = (__m256i*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
d[ 0] = _mm256_set_epi64x( s1[2], s1[0], s0[2], s0[0] );
d[ 1] = _mm256_set_epi64x( s1[3], s1[1], s0[3], s0[1] );
d[ 2] = _mm256_set_epi64x( s1[6], s1[4], s0[6], s0[4] );
d[ 3] = _mm256_set_epi64x( s1[7], s1[5], s0[7], s0[5] );
if ( bit_len <= 256 ) return;
d[ 4] = _mm256_set_epi64x( s1[10], s1[ 8], s0[10], s0[ 8] );
d[ 5] = _mm256_set_epi64x( s1[11], s1[ 9], s0[11], s0[ 9] );
d[ 6] = _mm256_set_epi64x( s1[14], s1[12], s0[14], s0[12] );
d[ 7] = _mm256_set_epi64x( s1[15], s1[13], s0[15], s0[13] );
if ( bit_len <= 512 ) return;
d[ 8] = _mm256_set_epi64x( s1[18], s1[16], s0[18], s0[16] );
d[ 9] = _mm256_set_epi64x( s1[19], s1[17], s0[19], s0[17] );
d[10] = _mm256_set_epi64x( s1[22], s1[20], s0[22], s0[20] );
d[11] = _mm256_set_epi64x( s1[23], s1[21], s0[23], s0[21] );
d[12] = _mm256_set_epi64x( s1[26], s1[24], s0[26], s0[24] );
d[13] = _mm256_set_epi64x( s1[27], s1[25], s0[27], s0[25] );
d[14] = _mm256_set_epi64x( s1[30], s1[28], s0[30], s0[28] );
d[15] = _mm256_set_epi64x( s1[31], s1[29], s0[31], s0[29] );
}
#define mm256_interleave_2x128 mm256_intrlv_2x128
static inline void mm256_intrlv_2x128( const void *d, const void *s0,
void *s1, const int bit_len )
{
__m128i s1hi = _mm256_extracti128_si256( casti_m256i( s1,0), 1 );
__m128i s0hi = _mm256_extracti128_si256( casti_m256i( s0,0), 1 );
casti_m256i( d,0 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,0 ) ),
_mm256_castsi256_si128( casti_m256i( s0,0 ) ) );
casti_m256i( d,1 ) = mm256_concat_128( s1hi, s0hi );
if ( bit_len <= 256 ) return;
s0hi = _mm256_extracti128_si256( casti_m256i( s0,1), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,1), 1 );
casti_m256i( d,2 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,1 ) ),
_mm256_castsi256_si128( casti_m256i( s0,1 ) ) );
casti_m256i( d,3 ) = mm256_concat_128( s1hi, s0hi );
if ( bit_len <= 512 ) return;
s0hi = _mm256_extracti128_si256( casti_m256i( s0,2), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,2), 1 );
casti_m256i( d,4 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,2 ) ),
_mm256_castsi256_si128( casti_m256i( s0,2 ) ) );
casti_m256i( d,5 ) = mm256_concat_128( s1hi, s0hi );
s0hi = _mm256_extracti128_si256( casti_m256i( s0,3), 1 );
s1hi = _mm256_extracti128_si256( casti_m256i( s1,3), 1 );
casti_m256i( d,6 ) = mm256_concat_128(
_mm256_castsi256_si128( casti_m256i( s1,3 ) ),
_mm256_castsi256_si128( casti_m256i( s0,3 ) ) );
casti_m256i( d,7 ) = mm256_concat_128( s1hi, s0hi );
}
#define mm256_deinterleave_2x128 mm256_dintrlv_2x128
static inline void mm256_dintrlv_2x128( void *dst0, void *dst1, const void *s,
int bit_len )
{
__m256i *d0 = (__m256i*)dst0;
__m256i *d1 = (__m256i*)dst1;
__m256i s0 = casti_m256i( s, 0 );
__m256i s1 = casti_m256i( s, 1 );
d0[0] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[0] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
if ( bit_len <= 256 ) return;
s0 = casti_m256i( s, 2 );
s1 = casti_m256i( s, 3 );
d0[1] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[1] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
if ( bit_len <= 512 ) return;
s0 = casti_m256i( s, 4 );
s1 = casti_m256i( s, 5 );
d0[2] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[2] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
s0 = casti_m256i( s, 6 );
s1 = casti_m256i( s, 7 );
d0[3] = _mm256_inserti128_si256( s0, mm128_extr_lo128_256( s1 ), 1 );
d1[3] = _mm256_inserti128_si256( s1, mm128_extr_hi128_256( s0 ), 0 );
}
#undef extr64_cast128_256
#undef extr32_cast128_256
#endif // AVX2
#endif // INTRLV_AVX2_H__

679
simd-utils/intrlv-avx512.h Normal file
View File

@@ -0,0 +1,679 @@
#if !defined(INTRLV_AVX512_H__)
#define INTRLV_AVX512_H__ 1
#if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SSE2 functions used in AVX512 interleaving
// AVX512 block is 64 * 64 bytes
// quarter avx512 block, 16 bytes * 16 lanes
static inline void mm128_dintrlv_16x32x128( void *d00, void *d01,
void *d02, void *d03, void *d04, void *d05, void *d06, void *d07,
void *d08, void *d09, void *d10, void *d11, void *d12, void *d13,
void *d14, void *d15, const int n, const void *s )
{
cast_m128i( d00 ) = mm128_get_32( s, 0, 16, 32, 48 );
cast_m128i( d01 ) = mm128_get_32( s, 1, 17, 33, 49 );
cast_m128i( d02 ) = mm128_get_32( s, 2, 18, 34, 50 );
cast_m128i( d03 ) = mm128_get_32( s, 3, 19, 35, 51 );
cast_m128i( d04 ) = mm128_get_32( s, 4, 20, 36, 52 );
cast_m128i( d05 ) = mm128_get_32( s, 5, 21, 37, 53 );
cast_m128i( d06 ) = mm128_get_32( s, 6, 22, 38, 54 );
cast_m128i( d07 ) = mm128_get_32( s, 7, 23, 39, 55 );
cast_m128i( d08 ) = mm128_get_32( s, 8, 24, 40, 56 );
cast_m128i( d09 ) = mm128_get_32( s, 9, 25, 41, 57 );
cast_m128i( d10 ) = mm128_get_32( s, 10, 26, 42, 58 );
cast_m128i( d11 ) = mm128_get_32( s, 11, 27, 43, 59 );
cast_m128i( d12 ) = mm128_get_32( s, 12, 28, 44, 60 );
cast_m128i( d13 ) = mm128_get_32( s, 13, 29, 45, 61 );
cast_m128i( d14 ) = mm128_get_32( s, 14, 30, 46, 62 );
cast_m128i( d15 ) = mm128_get_32( s, 15, 31, 47, 63 );
}
// quarter avx512 block, 32 bytes * 8 lanes
// 8 lanes of 128 bits using 64 bit interleaving
// Used for last 16 bytes of 80 byte input, only used for testing.
static inline void mm128_dintrlv_8x64x128( void *d0, void *d1, void *d2,
void *d3, void *d4, void *d5, void *d6, void *d7,
const int n, const void *s )
{
casti_m128i( d0,n ) = mm128_get_64( s, 0, 8 );
casti_m128i( d1,n ) = mm128_get_64( s, 1, 9 );
casti_m128i( d2,n ) = mm128_get_64( s, 2, 10 );
casti_m128i( d3,n ) = mm128_get_64( s, 3, 11 );
casti_m128i( d4,n ) = mm128_get_64( s, 4, 12 );
casti_m128i( d5,n ) = mm128_get_64( s, 5, 13 );
casti_m128i( d6,n ) = mm128_get_64( s, 6, 14 );
casti_m128i( d7,n ) = mm128_get_64( s, 7, 15 );
}
static inline void mm128_dintrlv_4x128x128( void *d0, void *d1, void *d2,
void *d3, const int n, const void *s )
{
casti_m128i( d0,n ) = mm128_get_64( s, 0, 1 );
casti_m128i( d1,n ) = mm128_get_64( s, 2, 3 );
casti_m128i( d2,n ) = mm128_get_64( s, 4, 5 );
casti_m128i( d3,n ) = mm128_get_64( s, 6, 7 );
}
// AVX2 functions Used in AVX512 interleaving
static inline void mm256_dintrlv_16x32x256( void *d00, void *d01,
void *d02, void *d03, void *d04, void *d05,
void *d06, void *d07, void *d08, void *d09,
void *d10, void *d11, void *d12, void *d13,
void *d14, void *d15, const int n, const void *s )
{
casti_m256i( d00,n ) = mm256_get_32( s, 0, 16, 32, 48, 64, 80, 96,112 );
casti_m256i( d01,n ) = mm256_get_32( s, 1, 17, 33, 49, 65, 81, 97,113 );
casti_m256i( d02,n ) = mm256_get_32( s, 2, 18, 34, 50, 66, 82, 98,114 );
casti_m256i( d03,n ) = mm256_get_32( s, 3, 19, 35, 51, 67, 83, 99,115 );
casti_m256i( d04,n ) = mm256_get_32( s, 4, 20, 36, 52, 68, 84,100,116 );
casti_m256i( d05,n ) = mm256_get_32( s, 5, 21, 37, 53, 69, 85,101,117 );
casti_m256i( d06,n ) = mm256_get_32( s, 6, 22, 38, 54, 70, 86,102,118 );
casti_m256i( d07,n ) = mm256_get_32( s, 7, 23, 39, 55, 71, 87,103,119 );
casti_m256i( d08,n ) = mm256_get_32( s, 8, 24, 40, 56, 72, 88,104,120 );
casti_m256i( d09,n ) = mm256_get_32( s, 9, 25, 41, 57, 73, 89,105,121 );
casti_m256i( d10,n ) = mm256_get_32( s, 10, 26, 42, 58, 74, 90,106,122 );
casti_m256i( d11,n ) = mm256_get_32( s, 11, 27, 43, 59, 75, 91,107,123 );
casti_m256i( d12,n ) = mm256_get_32( s, 12, 28, 44, 60, 76, 92,108,124 );
casti_m256i( d13,n ) = mm256_get_32( s, 13, 29, 45, 61, 77, 93,109,125 );
casti_m256i( d14,n ) = mm256_get_32( s, 14, 30, 46, 62, 78, 94,110,126 );
casti_m256i( d15,n ) = mm256_get_32( s, 15, 31, 47, 63, 79, 95,111,127 );
}
// 8 lanes of 256 bits using 64 bit interleaving (standard final hash size)
static inline void mm256_dintrlv_8x64x256( void *d0, void *d1, void *d2,
void *d3, void *d4, void *d5, void *d6, void *d7,
const int n, const void *s )
{
casti_m256i( d0,n ) = mm256_get_64( s, 0, 8, 16, 24 );
casti_m256i( d1,n ) = mm256_get_64( s, 1, 9, 17, 25 );
casti_m256i( d2,n ) = mm256_get_64( s, 2, 10, 18, 26 );
casti_m256i( d3,n ) = mm256_get_64( s, 3, 11, 19, 27 );
casti_m256i( d4,n ) = mm256_get_64( s, 4, 12, 20, 28 );
casti_m256i( d5,n ) = mm256_get_64( s, 5, 13, 21, 29 );
casti_m256i( d6,n ) = mm256_get_64( s, 6, 14, 22, 30 );
casti_m256i( d7,n ) = mm256_get_64( s, 7, 15, 23, 31 );
}
static inline void mm256_dintrlv_4x128x256( void *d0, void *d1, void *d2,
void *d3, const int n, const void *s )
{
casti_m256i( d0,n ) = mm256_get_64( s, 0, 1, 8, 9 );
casti_m256i( d1,n ) = mm256_get_64( s, 2, 3, 10, 11 );
casti_m256i( d2,n ) = mm256_get_64( s, 4, 5, 12, 13 );
casti_m256i( d3,n ) = mm256_get_64( s, 6, 7, 14, 15 );
}
// AVX 512 helper functions.
//
// Macro functions returning vector.
// Abstracted typecasting, avoid temp pointers.
// Source arguments may be any 64 or 32 byte aligned pointer as appropriate.
#define mm512_put_64( s0, s1, s2, s3, s4, s5, s6, s7 ) \
_mm512_set_epi64( *((const uint64_t*)(s7)), *((const uint64_t*)(s6)), \
*((const uint64_t*)(s5)), *((const uint64_t*)(s4)), \
*((const uint64_t*)(s3)), *((const uint64_t*)(s2)), \
*((const uint64_t*)(s1)), *((const uint64_t*)(s0)) )
#define mm512_put_32( s00, s01, s02, s03, s04, s05, s06, s07, \
s08, s09, s10, s11, s12, s13, s14, s15 ) \
_mm512_set_epi32( *((const uint32_t*)(s15)), *((const uint32_t*)(s14)), \
*((const uint32_t*)(s13)), *((const uint32_t*)(s12)), \
*((const uint32_t*)(s11)), *((const uint32_t*)(s10)), \
*((const uint32_t*)(s09)), *((const uint32_t*)(s08)), \
*((const uint32_t*)(s07)), *((const uint32_t*)(s06)), \
*((const uint32_t*)(s05)), *((const uint32_t*)(s04)), \
*((const uint32_t*)(s03)), *((const uint32_t*)(s02)), \
*((const uint32_t*)(s01)), *((const uint32_t*)(s00)) )
#define mm512_get_64( s, i0, i1, i2, i3, i4, i5, i6, i7 ) \
_mm512_set_epi64( ((const uint64_t*)(s))[i7], ((const uint64_t*)(s))[i6], \
((const uint64_t*)(s))[i5], ((const uint64_t*)(s))[i4], \
((const uint64_t*)(s))[i3], ((const uint64_t*)(s))[i2], \
((const uint64_t*)(s))[i1], ((const uint64_t*)(s))[i0] )
#define mm512_get_32( s, i00, i01, i02, i03, i04, i05, i06, i07, \
i08, i09, i10, i11, i12, i13, i14, i15 ) \
_mm512_set_epi32( ((const uint32_t*)(s))[i15], ((const uint32_t*)(s))[i14], \
((const uint32_t*)(s))[i13], ((const uint32_t*)(s))[i12], \
((const uint32_t*)(s))[i11], ((const uint32_t*)(s))[i10], \
((const uint32_t*)(s))[i09], ((const uint32_t*)(s))[i08], \
((const uint32_t*)(s))[i07], ((const uint32_t*)(s))[i06], \
((const uint32_t*)(s))[i05], ((const uint32_t*)(s))[i04], \
((const uint32_t*)(s))[i03], ((const uint32_t*)(s))[i02], \
((const uint32_t*)(s))[i01], ((const uint32_t*)(s))[i00] )
// AVX512 has no immediate blend; it can be done with permutex2var but at what
// cost? It can also be done with shifting and mask-or'ing for 3 instructions
// with 1 dependency. Finally it can be done with 1 _mm512_set but with 8
// 64 bit array index calculations and 8 pointer reads.
// Blend 2 vectors alternating hi & lo: { hi[n], lo[n-1], ... hi[1], lo[0] }
#define mm512_interleave_blend_128( hi, lo ) \
_mm512_permutex2var_epi64( hi, _mm512_set_epi64( \
0x7, 0x6, 0x5, 0x4, 0xb, 0xa, 0x9, 0x8 ), lo )
#define mm512_interleave_blend_64( hi, lo ) \
_mm512_permutex2var_epi64( hi, _mm512_set_epi64( \
0x7, 0x6, 0xd, 0xc, 0x3, 0x2, 0x9, 0x8 ), lo )
#define mm512_interleave_blend_32( hi, lo ) \
_mm512_permutex2var_epi32( hi, _mm512_set_epi32( \
0x0f, 0x1e, 0x0d, 0x1c, 0x0b, 0x1a, 0x09, 0x18, \
0x07, 0x16, 0x05, 0x14, 0x03, 0x12, 0x01, 0x10 ), lo )
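// A hedged sketch of one way to realize the mask-or alternative mentioned
// above, written for the 32 bit case (assumed semantics: odd 32 bit elements
// from hi, even elements from lo; three boolean ops with the mask held in a
// register). Illustrative only, not used elsewhere.
static inline __m512i mm512_intrlv_blend_32_maskor_sketch( __m512i hi,
                                                           __m512i lo )
{
   const __m512i mask = _mm512_set1_epi64( 0xFFFFFFFF00000000ULL );
   return _mm512_or_si512( _mm512_and_si512( hi, mask ),
                           _mm512_andnot_si512( mask, lo ) );
}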
//
static inline void mm512_intrlv_16x32x512( void *d, const void *s00,
const void *s01, const void *s02, const void *s03, const void *s04,
const void *s05, const void *s06, const void *s07, const void *s08,
const void *s09, const void *s10, const void *s11, const void *s12,
const void *s13, const void *s14, const void *s15 )
{
casti_m512i( d, 0 ) = mm512_put_32(
s00, s01, s02, s03, s04, s05, s06, s07,
s08, s09, s10, s11, s12, s13, s14, s15 );
casti_m512i( d, 1 ) = mm512_put_32(
s00+ 4, s01+ 4, s02+ 4, s03+ 4, s04+ 4, s05+ 4, s06+ 4, s07+ 4,
s08+ 4, s09+ 4, s10+ 4, s11+ 4, s12+ 4, s13+ 4, s14+ 4, s15+ 4 );
casti_m512i( d, 2 ) = mm512_put_32(
s00+ 8, s01+ 8, s02+ 8, s03+ 8, s04+ 8, s05+ 8, s06+ 8, s07+ 8,
s08+ 8, s09+ 8, s10+ 8, s11+ 8, s12+ 8, s13+ 8, s14+ 8, s15+ 8 );
casti_m512i( d, 3 ) = mm512_put_32(
s00+12, s01+12, s02+12, s03+12, s04+12, s05+12, s06+12, s07+12,
s08+12, s09+12, s10+12, s11+12, s12+12, s13+12, s14+12, s15+12 );
casti_m512i( d, 4 ) = mm512_put_32(
s00+16, s01+16, s02+16, s03+16, s04+16, s05+16, s06+16, s07+16,
s08+16, s09+16, s10+16, s11+16, s12+16, s13+16, s14+16, s15+16 );
casti_m512i( d, 5 ) = mm512_put_32(
s00+20, s01+20, s02+20, s03+20, s04+20, s05+20, s06+20, s07+20,
s08+20, s09+20, s10+20, s11+20, s12+20, s13+20, s14+20, s15+20 );
casti_m512i( d, 6 ) = mm512_put_32(
s00+24, s01+24, s02+24, s03+24, s04+24, s05+24, s06+24, s07+24,
s08+24, s09+24, s10+24, s11+24, s12+24, s13+24, s14+24, s15+24 );
casti_m512i( d, 7 ) = mm512_put_32(
s00+28, s01+28, s02+28, s03+28, s04+28, s05+28, s06+28, s07+28,
s08+28, s09+28, s10+28, s11+28, s12+28, s13+28, s14+28, s15+28 );
casti_m512i( d, 8 ) = mm512_put_32(
s00+32, s01+32, s02+32, s03+32, s04+32, s05+32, s06+32, s07+32,
s08+32, s09+32, s10+32, s11+32, s12+32, s13+32, s14+32, s15+32 );
casti_m512i( d, 9 ) = mm512_put_32(
s00+36, s01+36, s02+36, s03+36, s04+36, s05+36, s06+36, s07+36,
s08+36, s09+36, s10+36, s11+36, s12+36, s13+36, s14+36, s15+36 );
casti_m512i( d,10 ) = mm512_put_32(
s00+40, s01+40, s02+40, s03+40, s04+40, s05+40, s06+40, s07+40,
s08+40, s09+40, s10+40, s11+40, s12+40, s13+40, s14+40, s15+40 );
casti_m512i( d,11 ) = mm512_put_32(
s00+44, s01+44, s02+44, s03+44, s04+44, s05+44, s06+44, s07+44,
s08+44, s09+44, s10+44, s11+44, s12+44, s13+44, s14+44, s15+44 );
casti_m512i( d,12 ) = mm512_put_32(
s00+48, s01+48, s02+48, s03+48, s04+48, s05+48, s06+48, s07+48,
s08+48, s09+48, s10+48, s11+48, s12+48, s13+48, s14+48, s15+48 );
casti_m512i( d,13 ) = mm512_put_32(
s00+52, s01+52, s02+52, s03+52, s04+52, s05+52, s06+52, s07+52,
s08+52, s09+52, s10+52, s11+52, s12+52, s13+52, s14+52, s15+52 );
casti_m512i( d,14 ) = mm512_put_32(
s00+56, s01+56, s02+56, s03+56, s04+56, s05+56, s06+56, s07+56,
s08+56, s09+56, s10+56, s11+56, s12+56, s13+56, s14+56, s15+56 );
casti_m512i( d,15 ) = mm512_put_32(
s00+60, s01+60, s02+60, s03+60, s04+60, s05+60, s06+60, s07+60,
s08+60, s09+60, s10+60, s11+60, s12+60, s13+60, s14+60, s15+60 );
}
static inline void mm512_intrlv_16x32x256( void *d, const void *s00,
const void *s01, const void *s02, const void *s03, const void *s04,
const void *s05, const void *s06, const void *s07, const void *s08,
const void *s09, const void *s10, const void *s11, const void *s12,
const void *s13, const void *s14, const void *s15 )
{
casti_m512i( d, 0 ) = mm512_put_32(
s00, s01, s02, s03, s04, s05, s06, s07,
s08, s09, s10, s11, s12, s13, s14, s15 );
casti_m512i( d, 1 ) = mm512_put_32(
s00+ 4, s01+ 4, s02+ 4, s03+ 4, s04+ 4, s05+ 4, s06+ 4, s07+ 4,
s08+ 4, s09+ 4, s10+ 4, s11+ 4, s12+ 4, s13+ 4, s14+ 4, s15+ 4 );
casti_m512i( d, 2 ) = mm512_put_32(
s00+ 8, s01+ 8, s02+ 8, s03+ 8, s04+ 8, s05+ 8, s06+ 8, s07+ 8,
s08+ 8, s09+ 8, s10+ 8, s11+ 8, s12+ 8, s13+ 8, s14+ 8, s15+ 8 );
casti_m512i( d, 3 ) = mm512_put_32(
s00+12, s01+12, s02+12, s03+12, s04+12, s05+12, s06+12, s07+12,
s08+12, s09+12, s10+12, s11+12, s12+12, s13+12, s14+12, s15+12 );
casti_m512i( d, 4 ) = mm512_put_32(
s00+16, s01+16, s02+16, s03+16, s04+16, s05+16, s06+16, s07+16,
s08+16, s09+16, s10+16, s11+16, s12+16, s13+16, s14+16, s15+16 );
casti_m512i( d, 5 ) = mm512_put_32(
s00+20, s01+20, s02+20, s03+20, s04+20, s05+20, s06+20, s07+20,
s08+20, s09+20, s10+20, s11+20, s12+20, s13+20, s14+20, s15+20 );
casti_m512i( d, 6 ) = mm512_put_32(
s00+24, s01+24, s02+24, s03+24, s04+24, s05+24, s06+24, s07+24,
s08+24, s09+24, s10+24, s11+24, s12+24, s13+24, s14+24, s15+24 );
casti_m512i( d, 7 ) = mm512_put_32(
s00+28, s01+28, s02+28, s03+28, s04+28, s05+28, s06+28, s07+28,
s08+28, s09+28, s10+28, s11+28, s12+28, s13+28, s14+28, s15+28 );
}
// Last 16 bytes of input
static inline void mm512_intrlv_16x32x128( void *d, const void *s00,
const void *s01, const void *s02, const void *s03, const void *s04,
const void *s05, const void *s06, const void *s07, const void *s08,
const void *s09, const void *s10, const void *s11, const void *s12,
const void *s13, const void *s14, const void *s15 )
{
casti_m512i( d, 0 ) = mm512_put_32(
s00, s01, s02, s03, s04, s05, s06, s07,
s08, s09, s10, s11, s12, s13, s14, s15 );
casti_m512i( d, 1 ) = mm512_put_32(
s00+ 4, s01+ 4, s02+ 4, s03+ 4, s04+ 4, s05+ 4, s06+ 4, s07+ 4,
s08+ 4, s09+ 4, s10+ 4, s11+ 4, s12+ 4, s13+ 4, s14+ 4, s15+ 4 );
casti_m512i( d, 2 ) = mm512_put_32(
s00+ 8, s01+ 8, s02+ 8, s03+ 8, s04+ 8, s05+ 8, s06+ 8, s07+ 8,
s08+ 8, s09+ 8, s10+ 8, s11+ 8, s12+ 8, s13+ 8, s14+ 8, s15+ 8 );
casti_m512i( d, 3 ) = mm512_put_32(
s00+12, s01+12, s02+12, s03+12, s04+12, s05+12, s06+12, s07+12,
s08+12, s09+12, s10+12, s11+12, s12+12, s13+12, s14+12, s15+12 );
}
// can be called directly for 64 byte hash.
static inline void mm512_dintrlv_16x32x512( void *d00, void *d01,
void *d02, void *d03, void *d04, void *d05, void *d06,
void *d07, void *d08, void *d09, void *d10, void *d11,
void *d12, void *d13, void *d14, void *d15, const int n,
const void *s )
{
casti_m512i(d00,n) = mm512_get_32( s, 0, 16, 32, 48, 64, 80, 96,112,
128,144,160,176,192,208,224,240 );
casti_m512i(d01,n) = mm512_get_32( s, 1, 17, 33, 49, 65, 81, 97,113,
129,145,161,177,193,209,225,241 );
casti_m512i(d02,n) = mm512_get_32( s, 2, 18, 34, 50, 66, 82, 98,114,
130,146,162,178,194,210,226,242 );
casti_m512i(d03,n) = mm512_get_32( s, 3, 19, 35, 51, 67, 83, 99,115,
131,147,163,179,195,211,227,243 );
casti_m512i(d04,n) = mm512_get_32( s, 4, 20, 36, 52, 68, 84,100,116,
132,148,164,180,196,212,228,244 );
casti_m512i(d05,n) = mm512_get_32( s, 5, 21, 37, 53, 69, 85,101,117,
133,149,165,181,197,213,229,245 );
casti_m512i(d06,n) = mm512_get_32( s, 6, 22, 38, 54, 70, 86,102,118,
134,150,166,182,198,214,230,246 );
casti_m512i(d07,n) = mm512_get_32( s, 7, 23, 39, 55, 71, 87,103,119,
135,151,167,183,199,215,231,247 );
casti_m512i(d08,n) = mm512_get_32( s, 8, 24, 40, 56, 72, 88,104,120,
136,152,168,184,200,216,232,248 );
casti_m512i(d09,n) = mm512_get_32( s, 9, 25, 41, 57, 73, 89,105,121,
137,153,169,185,201,217,233,249 );
casti_m512i(d10,n) = mm512_get_32( s, 10, 26, 42, 58, 74, 90,106,122,
138,154,170,186,202,218,234,250 );
casti_m512i(d11,n) = mm512_get_32( s, 11, 27, 43, 59, 75, 91,107,123,
139,155,171,187,203,219,235,251 );
casti_m512i(d12,n) = mm512_get_32( s, 12, 28, 44, 60, 76, 92,108,124,
140,156,172,188,204,220,236,252 );
casti_m512i(d13,n) = mm512_get_32( s, 13, 29, 45, 61, 77, 93,109,125,
141,157,173,189,205,221,237,253 );
casti_m512i(d14,n) = mm512_get_32( s, 14, 30, 46, 62, 78, 94,110,126,
142,158,174,190,206,222,238,254 );
casti_m512i(d15,n) = mm512_get_32( s, 15, 31, 47, 63, 79, 95,111,127,
143,159,175,191,207,223,239,255 );
}
static inline void mm512_intrlv_8x64x512( void *d, const void *s0,
const void *s1, const void *s2, const void *s3,
const void *s4, const void *s5, const void *s6,
const void *s7 )
{
casti_m512i( d,0 ) = mm512_put_64( s0, s1, s2, s3,
s4, s5, s6, s7 );
casti_m512i( d,1 ) = mm512_put_64( s0+ 8, s1+ 8, s2+ 8, s3+ 8,
s4+ 8, s5+ 8, s6+ 8, s7+ 8 );
casti_m512i( d,2 ) = mm512_put_64( s0+16, s1+16, s2+16, s3+16,
s4+16, s5+16, s6+16, s7+16 );
casti_m512i( d,3 ) = mm512_put_64( s0+24, s1+24, s2+24, s3+24,
s4+24, s5+24, s6+24, s7+24 );
casti_m512i( d,4 ) = mm512_put_64( s0+32, s1+32, s2+32, s3+32,
s4+32, s5+32, s6+32, s7+32 );
casti_m512i( d,5 ) = mm512_put_64( s0+40, s1+40, s2+40, s3+40,
s4+40, s5+40, s6+40, s7+40 );
casti_m512i( d,6 ) = mm512_put_64( s0+48, s1+48, s2+48, s3+48,
s4+48, s5+48, s6+48, s7+48 );
casti_m512i( d,7 ) = mm512_put_64( s0+56, s1+56, s2+56, s3+56,
s4+56, s5+56, s6+56, s7+56 );
}
static inline void mm512_intrlv_8x64x256( void *d, const void *s0,
const void *s1, const void *s2, const void *s3,
const void *s4, const void *s5, const void *s6,
const void *s7 )
{
casti_m512i( d,0 ) = mm512_put_64( s0, s1, s2, s3,
s4, s5, s6, s7 );
casti_m512i( d,1 ) = mm512_put_64( s0+ 8, s1+ 8, s2+ 8, s3+ 8,
s4+ 8, s5+ 8, s6+ 8, s7+ 8 );
casti_m512i( d,2 ) = mm512_put_64( s0+16, s1+16, s2+16, s3+16,
s4+16, s5+16, s6+16, s7+16 );
casti_m512i( d,3 ) = mm512_put_64( s0+24, s1+24, s2+24, s3+24,
s4+24, s5+24, s6+24, s7+24 );
}
// 8 lanes of 512 bits using 64 bit interleaving (typical intermediate hash)
static inline void mm512_dintrlv_8x64x512( void *d0, void *d1, void *d2,
void *d3, void *d4, void *d5, void *d6, void *d7,
const int n, const void *s )
{
casti_m512i( d0,n ) = mm512_get_64( s, 0, 8, 16, 24, 32, 40, 48, 56 );
casti_m512i( d1,n ) = mm512_get_64( s, 1, 9, 17, 25, 33, 41, 49, 57 );
casti_m512i( d2,n ) = mm512_get_64( s, 2, 10, 18, 26, 34, 42, 50, 58 );
casti_m512i( d3,n ) = mm512_get_64( s, 3, 11, 19, 27, 35, 43, 51, 59 );
casti_m512i( d4,n ) = mm512_get_64( s, 4, 12, 20, 28, 36, 44, 52, 60 );
casti_m512i( d5,n ) = mm512_get_64( s, 5, 13, 21, 29, 37, 45, 53, 61 );
casti_m512i( d6,n ) = mm512_get_64( s, 6, 14, 22, 30, 38, 46, 54, 62 );
casti_m512i( d7,n ) = mm512_get_64( s, 7, 15, 23, 31, 39, 47, 55, 63 );
}
static inline void mm512_dintrlv_4x128x512( void *d0, void *d1, void *d2,
void *d3, const int n, const void *s )
{
casti_m512i( d0,n ) = mm512_get_64( s, 0, 1, 8, 9, 16, 17, 24, 25 );
casti_m512i( d1,n ) = mm512_get_64( s, 2, 3, 10, 11, 18, 19, 26, 27 );
casti_m512i( d2,n ) = mm512_get_64( s, 4, 5, 12, 13, 20, 21, 28, 29 );
casti_m512i( d3,n ) = mm512_get_64( s, 6, 7, 14, 15, 22, 23, 30, 31 );
}
// AVX-512 user facing functions.
static inline void mm512_intrlv_16x32( void *d, const void *s00,
const void *s01, const void *s02, const void *s03, const void *s04,
const void *s05, const void *s06, const void *s07, const void *s08,
const void *s09, const void *s10, const void *s11, const void *s12,
const void *s13, const void *s14, const void *s15, int bit_len )
{
if ( bit_len <= 256 )
{
mm512_intrlv_16x32x256( d, s00, s01, s02, s03, s04, s05, s06, s07,
s08, s09, s10, s11, s12, s13, s14, s15 );
return;
}
mm512_intrlv_16x32x512( d, s00, s01, s02, s03, s04, s05, s06, s07,
s08, s09, s10, s11, s12, s13, s14, s15 );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm512_intrlv_16x32x128( d+1024, s00+64, s01+64, s02+64, s03+64,
s04+64, s05+64, s06+64, s07+64, s08+64, s09+64,
s10+64, s11+64, s12+64, s13+64, s14+64, s15+64 );
return;
}
mm512_intrlv_16x32x512( d+1024, s00+64, s01+64, s02+64, s03+64,
s04+64, s05+64, s06+64, s07+64, s08+64, s09+64,
s10+64, s11+64, s12+64, s13+64, s14+64, s15+64 );
// bit_len == 1024
}
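// For reference, a minimal scalar sketch of the 16x32 layout built above
// (illustrative only, not used by any algo; assumes bit_len is a multiple
// of 32): 32 bit word i of lane k lands at d[ i*16 + k ].
static inline void intrlv_16x32_ref( uint32_t *d, const uint32_t *const s[16],
                                     int bit_len )
{
   for ( int i = 0; i < bit_len/32; i++ )
      for ( int k = 0; k < 16; k++ )
         d[ i*16 + k ] = s[k][i];
}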
// sub-functions can be called directly for 32 & 64 byte hash.
static inline void mm512_dintrlv_16x32( void *d00, void *d01, void *d02,
void *d03, void *d04, void *d05, void *d06, void *d07, void *d08,
void *d09, void *d10, void *d11, void *d12, void *d13, void *d14,
void *d15, const void *src, const int bit_len )
{
if ( bit_len <= 256 )
{
mm256_dintrlv_16x32x256( d00, d01, d02, d03, d04, d05, d06, d07,
d08, d09, d10, d11, d12, d13, d14, d15,
0,src );
return;
}
mm512_dintrlv_16x32x512( d00, d01, d02, d03, d04, d05, d06, d07,
d08, d09, d10, d11, d12, d13, d14, d15,
0, src );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
// short block, final 16 bytes of input data.
mm128_dintrlv_16x32x128( d00, d01, d02, d03, d04, d05, d06, d07,
d08, d09, d10, d11, d12, d13, d14, d15,
1, src+1024 );
return;
}
// bit_len == 1024
mm512_dintrlv_16x32x512( d00, d01, d02, d03, d04, d05, d06, d07,
d08, d09, d10, d11, d12, d13, d14, d15,
1, src+1024 );
}
static inline void mm512_extract_lane_16x32( void *dst, const void *src,
const int lane, const int bit_len )
{
if ( bit_len <= 256 )
{
cast_m256i( dst ) = mm256_get_32( src, lane, lane+16, lane+32, lane+48,
lane+64, lane+80, lane+96, lane+112 );
return;
}
cast_m512i( dst ) = mm512_get_32( src, lane, lane+ 16, lane+ 32, lane+ 48,
lane+ 64, lane+ 80, lane+ 96, lane+112, lane+128, lane+144,
lane+160, lane+176, lane+192, lane+208, lane+224, lane+240 );
}
//
static inline void mm512_intrlv_8x64( void *d, const void *s0,
const void *s1, const void *s2, const void *s3,
const void *s4, const void *s5, const void *s6,
const void *s7, int bit_len )
{
if ( bit_len <= 256 )
{
mm512_intrlv_8x64x256( d, s0, s1, s2, s3, s4, s5, s6, s7 );
return;
}
mm512_intrlv_8x64x512( d, s0, s1, s2, s3, s4, s5, s6, s7 );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
casti_m512i( d, 8 ) = mm512_put_64( s0+64, s1+64, s2+64, s3+64,
s4+64, s5+64, s6+64, s7+64 );
casti_m512i( d, 9 ) = mm512_put_64( s0+72, s1+72, s2+72, s3+72,
s4+72, s5+72, s6+72, s7+72 );
return;
}
// bitlen == 1024
mm512_intrlv_8x64x512( d+512, s0+64, s1+64, s2+64, s3+64,
s4+64, s5+64, s6+64, s7+64 );
}
static inline void mm512_dintrlv_8x64( void *d0, void *d1, void *d2,
void *d3, void *d4, void *d5, void *d6, void *d7,
const void *s, const int bit_len )
{
if ( bit_len <= 256 )
{
mm256_dintrlv_8x64x256( d0, d1, d2, d3, d4, d5, d6, d7, 0, s );
return;
}
mm512_dintrlv_8x64x512( d0, d1, d2, d3, d4, d5, d6, d7, 0, s );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
// short block, final 16 bytes of input data.
mm128_dintrlv_8x64x128( d0, d1, d2, d3, d4, d5, d6, d7, 1, s+512 );
return;
}
// bit_len == 1024
mm512_dintrlv_8x64x512( d0, d1, d2, d3, d4, d5, d6, d7, 1, s+512 );
}
// Extract one lane from 64 bit interleaved data
static inline void mm512_extract_lane_8x64( void *d, const void *s,
const int lane, const int bit_len )
{
if ( bit_len <= 256 )
{
cast_m256i( d ) = mm256_get_64( s, lane, lane+8, lane+16, lane+24 );
return;
}
// else bit_len == 512
cast_m512i( d ) = mm512_get_64( s, lane , lane+ 8, lane+16, lane+24,
lane+32, lane+40, lane+48, lane+56 );
}
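// Usage sketch (buffer names are illustrative): after an 8 way, 64 bit
// interleaved hash, copy lane 3's 256 bit result to a contiguous buffer.
//
//   uint64_t hash8[ 8*8 ] __attribute__ ((aligned (64)));   // interleaved output
//   uint64_t lane_hash[ 4 ] __attribute__ ((aligned (32)));
//   mm512_extract_lane_8x64( lane_hash, hash8, 3, 256 );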
//
static inline void mm512_intrlv_4x128( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, const int bit_len )
{
casti_m512i( d, 0 ) = mm512_put_64( s0, s0+8, s1, s1+8,
s2, s2+8, s3, s3+8 );
casti_m512i( d, 1 ) = mm512_put_64( s0+16, s0+24, s1+16, s1+24,
s2+16, s2+24, s3+16, s3+24 );
if ( bit_len <= 256 ) return;
casti_m512i( d, 2 ) = mm512_put_64( s0+32, s0+40, s1+32, s1+40,
s2+32, s2+40, s3+32, s3+40 );
casti_m512i( d, 3 ) = mm512_put_64( s0+48, s0+56, s1+48, s1+56,
s2+48, s2+56, s3+48, s3+56 );
if ( bit_len <= 512 ) return;
casti_m512i( d, 4 ) = mm512_put_64( s0+64, s0+72, s1+64, s1+72,
s2+64, s2+72, s3+64, s3+72 );
if ( bit_len <= 640 ) return;
casti_m512i( d, 5 ) = mm512_put_64( s0+ 80, s0+ 88, s1+ 80, s1+ 88,
s2+ 80, s2+ 88, s3+ 80, s3+ 88 );
casti_m512i( d, 6 ) = mm512_put_64( s0+ 96, s0+104, s1+ 96, s1+104,
s2+ 96, s2+104, s3+ 96, s3+104 );
casti_m512i( d, 7 ) = mm512_put_64( s0+112, s0+120, s1+112, s1+120,
s2+112, s2+120, s3+112, s3+120 );
// bit_len == 1024
}
static inline void mm512_dintrlv_4x128( void *d0, void *d1, void *d2,
void *d3, const void *s, const int bit_len )
{
if ( bit_len <= 256 )
{
mm256_dintrlv_4x128x256( d0, d1, d2, d3, 0, s );
return;
}
mm512_dintrlv_4x128x512( d0, d1, d2, d3, 0, s );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
mm128_dintrlv_4x128x128( d0, d1, d2, d3, 1, s+256 );
return;
}
// bit_len == 1024
mm512_dintrlv_4x128x512( d0, d1, d2, d3, 1, s+256 );
}
// input one 8x64 buffer and return 2*4*128
static inline void mm512_rintrlv_8x64_4x128( void *dst0, void *dst1,
const void *src, int bit_len )
{
__m512i* d0 = (__m512i*)dst0;
__m512i* d1 = (__m512i*)dst1;
uint64_t *s = (uint64_t*)src;
d0[0] = _mm512_set_epi64( s[ 11], s[ 3], s[ 10], s[ 2],
s[ 9], s[ 1], s[ 8], s[ 0] );
d0[1] = _mm512_set_epi64( s[ 27], s[ 19], s[ 26], s[ 18],
s[ 25], s[ 17], s[ 24], s[ 16] );
d0[2] = _mm512_set_epi64( s[ 15], s[ 7], s[ 14], s[ 6],
s[ 13], s[ 5], s[ 12], s[ 4] );
d0[3] = _mm512_set_epi64( s[ 31], s[ 23], s[ 30], s[ 22],
s[ 29], s[ 21], s[ 28], s[ 20] );
d1[0] = _mm512_set_epi64( s[ 43], s[ 35], s[ 42], s[ 34],
s[ 41], s[ 33], s[ 40], s[ 32] );
d1[1] = _mm512_set_epi64( s[ 59], s[ 51], s[ 58], s[ 50],
s[ 57], s[ 49], s[ 56], s[ 48] );
d1[2] = _mm512_set_epi64( s[ 47], s[ 39], s[ 46], s[ 38],
s[ 45], s[ 37], s[ 44], s[ 36] );
d1[3] = _mm512_set_epi64( s[ 63], s[ 55], s[ 62], s[ 54],
s[ 61], s[ 53], s[ 60], s[ 52] );
if ( bit_len <= 512 ) return;
d0[4] = _mm512_set_epi64( s[ 75], s[ 67], s[ 74], s[ 66],
s[ 73], s[ 65], s[ 72], s[ 64] );
d0[5] = _mm512_set_epi64( s[ 91], s[ 83], s[ 90], s[ 82],
s[ 89], s[ 81], s[ 88], s[ 80] );
d0[6] = _mm512_set_epi64( s[ 79], s[ 71], s[ 78], s[ 70],
s[ 77], s[ 69], s[ 76], s[ 68] );
d0[7] = _mm512_set_epi64( s[ 95], s[ 87], s[ 94], s[ 86],
s[ 93], s[ 85], s[ 92], s[ 84] );
d1[4] = _mm512_set_epi64( s[107], s[ 99], s[106], s[ 98],
s[105], s[ 97], s[104], s[ 96] );
d1[5] = _mm512_set_epi64( s[123], s[115], s[122], s[114],
s[121], s[113], s[120], s[112] );
d1[6] = _mm512_set_epi64( s[111], s[103], s[110], s[102],
s[109], s[101], s[108], s[100] );
d1[7] = _mm512_set_epi64( s[127], s[119], s[126], s[118],
s[125], s[117], s[124], s[116] );
}
// input 2 4x128 return 8x64
static inline void mm512_rintrlv_4x128_8x64( void *dst, const void *src0,
const void *src1, int bit_len )
{
__m512i* d = (__m512i*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
d[0] = _mm512_set_epi64( s1[ 6], s1[ 4], s1[ 2], s1[ 0],
s0[ 6], s0[ 4], s0[ 2], s0[ 0] );
d[1] = _mm512_set_epi64( s1[ 7], s1[ 5], s1[ 3], s1[ 1],
s0[ 7], s0[ 5], s0[ 3], s0[ 1] );
d[2] = _mm512_set_epi64( s1[14], s1[12], s1[10], s1[ 8],
s0[14], s0[12], s0[10], s0[ 8] );
d[3] = _mm512_set_epi64( s1[15], s1[13], s1[11], s1[ 9],
s0[15], s0[13], s0[11], s0[ 9] );
d[4] = _mm512_set_epi64( s1[22], s1[20], s1[18], s1[16],
s0[22], s0[20], s0[18], s0[16] );
d[5] = _mm512_set_epi64( s1[23], s1[21], s1[19], s1[17],
s0[23], s0[21], s0[19], s0[17] );
d[6] = _mm512_set_epi64( s1[30], s1[28], s1[26], s1[24],
s0[30], s0[28], s0[26], s0[24] );
d[7] = _mm512_set_epi64( s1[31], s1[29], s1[27], s1[25],
s0[31], s0[29], s0[27], s0[25] );
if ( bit_len <= 512 ) return;
d[ 8] = _mm512_set_epi64( s1[38], s1[36], s1[34], s1[32],
s0[38], s0[36], s0[34], s0[32] );
d[ 9] = _mm512_set_epi64( s1[39], s1[37], s1[35], s1[33],
s0[39], s0[37], s0[35], s0[33] );
d[10] = _mm512_set_epi64( s1[46], s1[44], s1[42], s1[40],
s0[46], s0[44], s0[42], s0[40] );
d[11] = _mm512_set_epi64( s1[47], s1[45], s1[43], s1[41],
s0[47], s0[45], s0[43], s0[41] );
d[12] = _mm512_set_epi64( s1[54], s1[52], s1[50], s1[48],
s0[54], s0[52], s0[50], s0[48] );
d[13] = _mm512_set_epi64( s1[55], s1[53], s1[51], s1[49],
s0[55], s0[53], s0[51], s0[49] );
d[14] = _mm512_set_epi64( s1[62], s1[60], s1[58], s1[56],
s0[62], s0[60], s0[58], s0[56] );
d[15] = _mm512_set_epi64( s1[63], s1[61], s1[59], s1[57],
s0[63], s0[61], s0[59], s0[57] );
}
static inline void mm512_extract_lane_4x128( void *d, const void *s,
const int lane, const int bit_len )
{
int l = lane<<1;
if ( bit_len <= 256 )
{
cast_m256i( d ) = mm256_get_64( s, l, l+1, l+8, l+9 );
return;
}
// else bit_len == 512
cast_m512i( d ) = mm512_get_64( s, l , l+ 1, l+ 8, l+ 9,
l+16, l+17, l+24, l+25 );
}
#endif // AVX512
#endif // INTRLV_AVX512_H__

126
simd-utils/intrlv-mmx.h Normal file
View File

@@ -0,0 +1,126 @@
#if !defined(INTRLV_MMX_H__)
#define INTRLV_MMX_H__ 1
#if defined(__MMX__)
//////////////////////////////////////////////////////
//
// MMX 64 bit vectors
#define mm64_put_32( s0, s1 ) \
_mm_set_pi32( *((const uint32_t*)(s1)), *((const uint32_t*)(s0)) )
#define mm64_get_32( s, i0, i1 ) \
_mm_set_pi32( ((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
// 1 MMX block, 8 bytes * 2 lanes
static inline void mm64_intrlv_2x32( void *d, const void *s0,
const void *s1, int len )
{
casti_m64( d, 0 ) = mm64_put_32( s0 , s1 );
casti_m64( d, 1 ) = mm64_put_32( s0+ 4, s1+ 4 );
casti_m64( d, 2 ) = mm64_put_32( s0+ 8, s1+ 8 );
casti_m64( d, 3 ) = mm64_put_32( s0+ 12, s1+ 12 );
casti_m64( d, 4 ) = mm64_put_32( s0+ 16, s1+ 16 );
casti_m64( d, 5 ) = mm64_put_32( s0+ 20, s1+ 20 );
casti_m64( d, 6 ) = mm64_put_32( s0+ 24, s1+ 24 );
casti_m64( d, 7 ) = mm64_put_32( s0+ 28, s1+ 28 );
if ( len <= 256 ) return;
casti_m64( d, 8 ) = mm64_put_32( s0+ 32, s1+ 32 );
casti_m64( d, 9 ) = mm64_put_32( s0+ 36, s1+ 36 );
casti_m64( d,10 ) = mm64_put_32( s0+ 40, s1+ 40 );
casti_m64( d,11 ) = mm64_put_32( s0+ 44, s1+ 44 );
casti_m64( d,12 ) = mm64_put_32( s0+ 48, s1+ 48 );
casti_m64( d,13 ) = mm64_put_32( s0+ 52, s1+ 52 );
casti_m64( d,14 ) = mm64_put_32( s0+ 56, s1+ 56 );
casti_m64( d,15 ) = mm64_put_32( s0+ 60, s1+ 60 );
if ( len <= 512 ) return;
casti_m64( d,16 ) = mm64_put_32( s0+ 64, s1+ 64 );
casti_m64( d,17 ) = mm64_put_32( s0+ 68, s1+ 68 );
casti_m64( d,18 ) = mm64_put_32( s0+ 72, s1+ 72 );
casti_m64( d,19 ) = mm64_put_32( s0+ 76, s1+ 76 );
if ( len <= 640 ) return;
casti_m64( d,20 ) = mm64_put_32( s0+ 80, s1+ 80 );
casti_m64( d,21 ) = mm64_put_32( s0+ 84, s1+ 84 );
casti_m64( d,22 ) = mm64_put_32( s0+ 88, s1+ 88 );
casti_m64( d,23 ) = mm64_put_32( s0+ 92, s1+ 92 );
casti_m64( d,24 ) = mm64_put_32( s0+ 96, s1+ 96 );
casti_m64( d,25 ) = mm64_put_32( s0+100, s1+100 );
casti_m64( d,26 ) = mm64_put_32( s0+104, s1+104 );
casti_m64( d,27 ) = mm64_put_32( s0+108, s1+108 );
casti_m64( d,28 ) = mm64_put_32( s0+112, s1+112 );
casti_m64( d,29 ) = mm64_put_32( s0+116, s1+116 );
casti_m64( d,30 ) = mm64_put_32( s0+120, s1+120 );
casti_m64( d,31 ) = mm64_put_32( s0+124, s1+124 );
}
static inline void mm64_dintrlv_2x32( void *d00, void *d01, const int n,
const void *s, int len )
{
casti_m64( d00,0 ) = mm64_get_32( s, 0, 2 );
casti_m64( d01,0 ) = mm64_get_32( s, 1, 3 );
casti_m64( d00,1 ) = mm64_get_32( s, 4, 6 );
casti_m64( d01,1 ) = mm64_get_32( s, 5, 7 );
casti_m64( d00,2 ) = mm64_get_32( s, 8, 10 );
casti_m64( d01,2 ) = mm64_get_32( s, 9, 11 );
casti_m64( d00,3 ) = mm64_get_32( s, 12, 14 );
casti_m64( d01,3 ) = mm64_get_32( s, 13, 15 );
if ( len <= 256 ) return;
casti_m64( d00,4 ) = mm64_get_32( s, 16, 18 );
casti_m64( d01,4 ) = mm64_get_32( s, 17, 19 );
casti_m64( d00,5 ) = mm64_get_32( s, 20, 22 );
casti_m64( d01,5 ) = mm64_get_32( s, 21, 23 );
casti_m64( d00,6 ) = mm64_get_32( s, 24, 26 );
casti_m64( d01,6 ) = mm64_get_32( s, 25, 27 );
casti_m64( d00,7 ) = mm64_get_32( s, 28, 30 );
casti_m64( d01,7 ) = mm64_get_32( s, 29, 31 );
if ( len <= 512 ) return;
casti_m64( d00,8 ) = mm64_get_32( s, 32, 34 );
casti_m64( d01,8 ) = mm64_get_32( s, 33, 35 );
casti_m64( d00,9 ) = mm64_get_32( s, 36, 38 );
casti_m64( d01,9 ) = mm64_get_32( s, 37, 39 );
if ( len <= 640 ) return;
casti_m64( d00,10 ) = mm64_get_32( s, 40, 42 );
casti_m64( d01,10 ) = mm64_get_32( s, 41, 43 );
casti_m64( d00,11 ) = mm64_get_32( s, 44, 46 );
casti_m64( d01,11 ) = mm64_get_32( s, 45, 47 );
casti_m64( d00,12 ) = mm64_get_32( s, 48, 50 );
casti_m64( d01,12 ) = mm64_get_32( s, 49, 51 );
casti_m64( d00,13 ) = mm64_get_32( s, 52, 54 );
casti_m64( d01,13 ) = mm64_get_32( s, 53, 55 );
casti_m64( d00,14 ) = mm64_get_32( s, 56, 58 );
casti_m64( d01,14 ) = mm64_get_32( s, 57, 59 );
casti_m64( d00,15 ) = mm64_get_32( s, 60, 62 );
casti_m64( d01,15 ) = mm64_get_32( s, 61, 63 );
}
static inline void mm64_extract_lane_2x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m64( d, 0 ) = mm64_get_32( s, lane   , lane+ 2 );
casti_m64( d, 1 ) = mm64_get_32( s, lane+ 4, lane+ 6 );
casti_m64( d, 2 ) = mm64_get_32( s, lane+ 8, lane+10 );
casti_m64( d, 3 ) = mm64_get_32( s, lane+12, lane+14 );
if ( bit_len <= 256 ) return;
casti_m64( d, 4 ) = mm64_get_32( s, lane+16, lane+18 );
casti_m64( d, 5 ) = mm64_get_32( s, lane+20, lane+22 );
casti_m64( d, 6 ) = mm64_get_32( s, lane+24, lane+26 );
casti_m64( d, 7 ) = mm64_get_32( s, lane+28, lane+30 );
// bit_len == 512
}
#endif // MMX
#endif // INTRLV_MMX_H__

195
simd-utils/intrlv-sse2.h Normal file
View File

@@ -0,0 +1,195 @@
#if !defined(INTRLV_SSE2_H__)
#define INTRLV_SSE2_H__ 1
// Don't call _mm_extract_epi32 directly, it needs SSE4.1.
// Use mm128_extr_32 wrapper instead, it has both SSE4.1 & SSE2 covered.
#if defined(__SSE2__)
///////////////////////////////////////////////////////////////
//
// SSE2 128 bit vectors
// Macros to abstract typecasting
// Interleave lanes
#define mm128_put_64( s0, s1) \
_mm_set_epi64x( *((const uint64_t*)(s1)), *((const uint64_t*)(s0)) )
#define mm128_put_32( s0, s1, s2, s3 ) \
_mm_set_epi32( *((const uint32_t*)(s3)), *((const uint32_t*)(s2)), \
*((const uint32_t*)(s1)), *((const uint32_t*)(s0)) )
// Deinterleave lanes
#define mm128_get_64( s, i0, i1 ) \
_mm_set_epi64x( ((const uint64_t*)(s))[i1], ((const uint64_t*)(s))[i0] )
#define mm128_get_32( s, i0, i1, i2, i3 ) \
_mm_set_epi32( ((const uint32_t*)(s))[i3], ((const uint32_t*)(s))[i2], \
((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
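// A hedged sketch of the mm128_extr_32 wrapper referred to at the top of this
// file. The real definition lives in the SSE2 utility header; this is shown
// for reference only and its exact form is an assumption.
//
//   #if defined(__SSE4_1__)
//     #define mm128_extr_32( a, n )  _mm_extract_epi32( a, n )
//   #else
//     #define mm128_extr_32( a, n ) \
//               _mm_cvtsi128_si32( _mm_srli_si128( a, (n)<<2 ) )
//   #endif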
// blend 2 vectors while interleaving: { hi[n], lo[n-1], ... hi[1], lo[0] }
#define mm128_intrlv_blend_64( hi, lo ) \
_mm_blend_epi16( hi, lo, 0x0f )
#define mm128_intrlv_blend_32( hi, lo ) \
_mm_blend_epi16( hi, lo, 0x33 )
// 1 sse2 block, 16 bytes * 4 lanes
#define mm128_intrlv_4x32_128( d, s0, s1, s2, s3 )\
do { \
casti_m128i( d,0 ) = _mm_set_epi32( \
mm128_extr_32( s3, 0 ), mm128_extr_32( s2, 0 ), \
mm128_extr_32( s1, 0 ), mm128_extr_32( s0, 0 ) ); \
casti_m128i( d,1 ) = _mm_set_epi32( \
mm128_extr_32( s3, 1 ), mm128_extr_32( s2, 1 ), \
mm128_extr_32( s1, 1 ), mm128_extr_32( s0, 1 ) ); \
casti_m128i( d,2 ) = _mm_set_epi32( \
mm128_extr_32( s3, 2 ), mm128_extr_32( s2, 2 ), \
mm128_extr_32( s1, 2 ), mm128_extr_32( s0, 2 ) ); \
casti_m128i( d,3 ) = _mm_set_epi32( \
mm128_extr_32( s3, 3 ), mm128_extr_32( s2, 3 ), \
mm128_extr_32( s1, 3 ), mm128_extr_32( s0, 3 ) ); \
} while(0)
static inline void mm128_dintrlv_4x32_128( void *d0, void *d1, void *d2,
void *d3, const void *src )
{
__m128i s0 = *(__m128i*) src;
__m128i s1 = *(__m128i*)(src+16);
__m128i s2 = *(__m128i*)(src+32);
__m128i s3 = *(__m128i*)(src+48);
*(__m128i*)d0 = _mm_set_epi32(
mm128_extr_32( s3,0 ), mm128_extr_32( s2,0 ),
mm128_extr_32( s1,0 ), mm128_extr_32( s0,0 ) );
*(__m128i*)d1 = _mm_set_epi32(
mm128_extr_32( s3,1 ), mm128_extr_32( s2,1 ),
mm128_extr_32( s1,1 ), mm128_extr_32( s0,1 ) );
*(__m128i*)d2 = _mm_set_epi32(
mm128_extr_32( s3,2 ), mm128_extr_32( s2,2 ),
mm128_extr_32( s1,2 ), mm128_extr_32( s0,2 ) );
*(__m128i*)d3 = _mm_set_epi32(
mm128_extr_32( s3,3 ), mm128_extr_32( s2,3 ),
mm128_extr_32( s1,3 ), mm128_extr_32( s0,3 ) );
}
static inline void mm128_intrlv_2x64x128( void *d, const void *s0,
const void *s1 )
{
casti_m128i( d,0 ) = mm128_put_64( s0, s1 );
casti_m128i( d,1 ) = mm128_put_64( s0+ 8, s1+ 8 );
casti_m128i( d,2 ) = mm128_put_64( s0+16, s1+16 );
casti_m128i( d,3 ) = mm128_put_64( s0+24, s1+24 );
}
#define mm128_bswap_intrlv_4x32_128( d, src ) \
do { \
__m128i ss = mm128_bswap_32( src );\
casti_m128i( d,0 ) = _mm_set1_epi32( mm128_extr_32( ss, 0 ) ); \
casti_m128i( d,1 ) = _mm_set1_epi32( mm128_extr_32( ss, 1 ) ); \
casti_m128i( d,2 ) = _mm_set1_epi32( mm128_extr_32( ss, 2 ) ); \
casti_m128i( d,3 ) = _mm_set1_epi32( mm128_extr_32( ss, 3 ) ); \
} while(0)
//
// User functions.
// interleave 4 arrays of 32 bit elements for 128 bit processing
// bit_len must be 256, 512 or 640 bits.
#define mm128_interleave_4x32 mm128_intrlv_4x32
static inline void mm128_intrlv_4x32( void *d, const void *s0,
const void *s1, const void *s2, const void *s3, int bit_len )
{
mm128_intrlv_4x32_128( d , casti_m128i(s0,0), casti_m128i(s1,0),
casti_m128i(s2,0), casti_m128i(s3,0) );
mm128_intrlv_4x32_128( d+ 64, casti_m128i(s0,1), casti_m128i(s1,1),
casti_m128i(s2,1), casti_m128i(s3,1) );
if ( bit_len <= 256 ) return;
mm128_intrlv_4x32_128( d+128, casti_m128i(s0,2), casti_m128i(s1,2),
casti_m128i(s2,2), casti_m128i(s3,2) );
mm128_intrlv_4x32_128( d+192, casti_m128i(s0,3), casti_m128i(s1,3),
casti_m128i(s2,3), casti_m128i(s3,3) );
if ( bit_len <= 512 ) return;
mm128_intrlv_4x32_128( d+256, casti_m128i(s0,4), casti_m128i(s1,4),
casti_m128i(s2,4), casti_m128i(s3,4) );
if ( bit_len <= 640 ) return;
mm128_intrlv_4x32_128( d+320, casti_m128i(s0,5), casti_m128i(s1,5),
casti_m128i(s2,5), casti_m128i(s3,5) );
mm128_intrlv_4x32_128( d+384, casti_m128i(s0,6), casti_m128i(s1,6),
casti_m128i(s2,6), casti_m128i(s3,6) );
mm128_intrlv_4x32_128( d+448, casti_m128i(s0,7), casti_m128i(s1,7),
casti_m128i(s2,7), casti_m128i(s3,7) );
// bit_len == 1024
}
// Still used by decred due to odd data size: 180 bytes
// bit_len must be multiple of 32
#define mm128_interleave_4x32x mm128_intrlv_4x32x
static inline void mm128_intrlv_4x32x( void *dst, void *src0, void *src1,
void *src2, void *src3, int bit_len )
{
uint32_t *d = (uint32_t*)dst;
uint32_t *s0 = (uint32_t*)src0;
uint32_t *s1 = (uint32_t*)src1;
uint32_t *s2 = (uint32_t*)src2;
uint32_t *s3 = (uint32_t*)src3;
for ( int i = 0; i < bit_len >> 5; i++, d += 4 )
{
*d = *(s0+i);
*(d+1) = *(s1+i);
*(d+2) = *(s2+i);
*(d+3) = *(s3+i);
}
}
#define mm128_deinterleave_4x32 mm128_dintrlv_4x32
static inline void mm128_dintrlv_4x32( void *d0, void *d1, void *d2,
void *d3, const void *s, int bit_len )
{
mm128_dintrlv_4x32_128( d0 , d1 , d2 , d3 , s );
mm128_dintrlv_4x32_128( d0+ 16, d1+ 16, d2+ 16, d3+ 16, s+ 64 );
if ( bit_len <= 256 ) return;
mm128_dintrlv_4x32_128( d0+ 32, d1+ 32, d2+ 32, d3+ 32, s+128 );
mm128_dintrlv_4x32_128( d0+ 48, d1+ 48, d2+ 48, d3+ 48, s+192 );
if ( bit_len <= 512 ) return;
mm128_dintrlv_4x32_128( d0+ 64, d1+ 64, d2+ 64, d3+ 64, s+256 );
if ( bit_len <= 640 ) return;
mm128_dintrlv_4x32_128( d0+ 80, d1+ 80, d2+ 80, d3+ 80, s+320 );
mm128_dintrlv_4x32_128( d0+ 96, d1+ 96, d2+ 96, d3+ 96, s+384 );
mm128_dintrlv_4x32_128( d0+112, d1+112, d2+112, d3+112, s+448 );
// bit_len == 1024
}
// extract and deinterleave specified lane.
static inline void mm128_extract_lane_4x32( void *d, const void *s,
const int lane, const int bit_len )
{
casti_m128i( d, 0 ) =
mm128_get_32( s, lane , lane+ 4, lane+ 8, lane+12 );
casti_m128i( d, 1 ) =
mm128_get_32( s, lane+16, lane+20, lane+24, lane+28 );
if ( bit_len <= 256 ) return;
casti_m128i( d, 2 ) =
mm128_get_32( s, lane+32, lane+36, lane+40, lane+44 );
casti_m128i( d, 3 ) =
mm128_get_32( s, lane+48, lane+52, lane+56, lane+60 );
// bit_len == 512
}
// Interleave 80 bytes of 32 bit data for 4 lanes.
static inline void mm128_bswap_intrlv80_4x32( void *d, const void *s )
{
mm128_bswap_intrlv_4x32_128( d , casti_m128i( s, 0 ) );
mm128_bswap_intrlv_4x32_128( d+ 64, casti_m128i( s, 1 ) );
mm128_bswap_intrlv_4x32_128( d+128, casti_m128i( s, 2 ) );
mm128_bswap_intrlv_4x32_128( d+192, casti_m128i( s, 3 ) );
mm128_bswap_intrlv_4x32_128( d+256, casti_m128i( s, 4 ) );
}
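// Usage sketch (variable names are illustrative): byte swap and interleave a
// raw 80 byte block header for 4 way, 32 bit hashing.
//
//   uint32_t edata[ 20 ] __attribute__ ((aligned (16)));     // raw header
//   uint32_t vdata[ 20*4 ] __attribute__ ((aligned (16)));   // 4 interleaved lanes
//   mm128_bswap_intrlv80_4x32( vdata, edata );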
#endif // SSE2
#endif // INTRLV_SSE2_H__

484
simd-utils/simd-avx2.h Normal file
View File

@@ -0,0 +1,484 @@
#if !defined(SIMD_AVX2_H__)
#define SIMD_AVX2_H__ 1
#if defined(__AVX2__)
/////////////////////////////////////////////////////////////////////
//
// AVX2 256 bit vectors
//
// AVX2 is required for integer support of 256 bit vectors.
// Some 256 bit vector utilities require AVX512 or have more efficient
// AVX512 implementations. They will be selected automatically but their use
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
// Vector type overlays used by compile time vector constants.
// Constants of these types reside in memory.
// Compile time vector constants and initializers.
//
// The following macro constants and functions should only be used
// for compile time initialization of constant and variable vector
// arrays. These constants use memory, use _mm256_set at run time to
// avoid using memory.
#define mm256_const_64( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }}
#define mm256_const1_64( x ) {{ x,x,x,x }}
#define mm256_const_32( x7, x6, x5, x4, x3, x2, x1, x0 ) \
{{ x7, x6, x5, x4, x3, x2, x1, x0 }}
#define mm256_const1_32( x ) {{ x,x,x,x, x,x,x,x }}
#define mm256_const_16( x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
{{ x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm256_const1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
#define mm256_const_8( x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
{{ x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm256_const1_8( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, \
x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
// Predefined compile time constant vectors.
// Use Pseudo constants at run time for all simple constant vectors.
#define c256_zero mm256_const1_64( 0ULL )
#define c256_one_256 mm256_const_64( 0ULL, 0ULL, 0ULL, 1ULL )
#define c256_one_128 mm256_const_64( 0ULL, 1ULL, 0ULL, 1ULL )
#define c256_one_64 mm256_const1_64( 1ULL )
#define c256_one_32 mm256_const1_32( 1UL )
#define c256_one_16 mm256_const1_16( 1U )
#define c256_one_8 mm256_const1_8( 1U )
#define c256_neg1 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c256_neg1_64 mm256_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c256_neg1_32 mm256_const1_32( 0xFFFFFFFFUL )
#define c256_neg1_16 mm256_const1_16( 0xFFFFU )
#define c256_neg1_8 mm256_const1_8( 0xFFU )
//
// Pseudo constants.
// These can't be used for compile time initialization but are preferable
// for simple constant vectors at run time.
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 _mm256_set_epi64x( 0ULL, 0ULL, 0ULL, 1ULL )
#define m256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL )
#define m256_one_64 _mm256_set1_epi64x( 1ULL )
#define m256_one_32 _mm256_set1_epi32( 1UL )
#define m256_one_16 _mm256_set1_epi16( 1U )
#define m256_one_8 _mm256_set1_epi8( 1U )
#define m256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
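// A minimal usage sketch of the distinction described above: pseudo constants
// are generated in registers at run time, while the mm256_const_* initializers
// are only for static data (the overlay union type they initialize is defined
// elsewhere and is assumed here).
//
//   __m256i a = m256_one_64;                      // run time, no memory access
//   __m256i b = _mm256_set1_epi64x( 0x80ULL );    // run time, arbitrary value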
//
// Basic operations without SIMD equivalent
// Bitwise not ( ~x )
#define mm256_not( x ) _mm256_xor_si256( (x), m256_neg1 )
// Unary negation of each element ( -a )
#define mm256_negate_64( a ) _mm256_sub_epi64( m256_zero, a )
#define mm256_negate_32( a ) _mm256_sub_epi32( m256_zero, a )
#define mm256_negate_16( a ) _mm256_sub_epi16( m256_zero, a )
//
// Vector size conversion.
//
// Allows operations on either or both halves of a 256 bit vector serially.
// Handy for parallel AES.
// Caveats:
// _mm256_castsi256_si128 is free and without side effects.
// _mm256_castsi128_si256 is also free but leaves the high half
// undefined. That's ok if the hi half will be subsequently assigned.
// If assigning both, do lo first; if assigning only one, use
// _mm256_inserti128_si256.
//
#define mm128_extr_lo128_256( a ) _mm256_castsi256_si128( a )
#define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 )
// input __m128i, returns __m256i
// To build a 256 bit vector from 2 128 bit vectors lo must be done first.
// lo alone leaves hi undefined, hi alone leaves lo unchanged.
// Both cost one clock while preserving the other half.
// Insert b into specified half of a leaving other half of a unchanged.
#define mm256_ins_lo128_256( a, b ) _mm256_inserti128_si256( a, b, 0 )
#define mm256_ins_hi128_256( a, b ) _mm256_inserti128_si256( a, b, 1 )
// concatenate two 128 bit vectors into one 256 bit vector
#define mm256_concat_128( hi, lo ) \
mm256_ins_hi128_256( _mm256_castsi128_si256( lo ), hi )
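// A small sketch of the "lo first" rule described above when building a 256
// bit vector from two 128 bit halves. Illustrative only, not used elsewhere.
static inline __m256i mm256_from_halves_sketch( __m128i hi, __m128i lo )
{
   // castsi128_si256 sets the low half and leaves the high half undefined,
   // so the high half must be inserted afterwards.
   return _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
}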
// Parallel AES, for when x is expected to be in a 256 bit register.
#define mm256_aesenc_2x128( x ) \
mm256_concat_128( \
_mm_aesenc_si128( mm128_extr_hi128_256( x ), m128_zero ), \
_mm_aesenc_si128( mm128_extr_lo128_256( x ), m128_zero ) )
#define mm256_aesenckey_2x128( x, k ) \
mm256_concat_128( \
_mm_aesenc_si128( mm128_extr_hi128_256( x ), \
mm128_extr_hi128_256( k ) ), \
_mm_aesenc_si128( mm128_extr_lo128_256( x ), \
mm128_extr_lo128_256( k ) ) )
#define mm256_paesenc_2x128( y, x ) do \
{ \
__m128i *X = (__m128i*)x; \
__m128i *Y = (__m128i*)y; \
Y[0] = _mm_aesenc_si128( X[0], m128_zero ); \
Y[1] = _mm_aesenc_si128( X[1], m128_zero ); \
} while(0);
// With pointers.
#define mm256_paesenckey_2x128( y, x, k ) do \
{ \
__m128i *X = (__m128i*)x; \
__m128i *Y = (__m128i*)y; \
__m128i *K = (__m128i*)k; \
Y[0] = _mm_aesenc_si128( X[0], K[0] ); \
Y[1] = _mm_aesenc_si128( X[1], K[1] ); \
} while(0);
//
// Pointer casting
// p = any aligned pointer
// returns p as pointer to vector type, not very useful
#define castp_m256i(p) ((__m256i*)(p))
// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m256i(p) (*((__m256i*)(p)))
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
// p = any aligned pointer, o = scaled offset
// returns pointer p+o
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
// Gather scatter
#define mm256_gather_64( d, s0, s1, s2, s3 ) \
((uint64_t*)(d))[0] = (uint64_t)(s0); \
((uint64_t*)(d))[1] = (uint64_t)(s1); \
((uint64_t*)(d))[2] = (uint64_t)(s2); \
((uint64_t*)(d))[3] = (uint64_t)(s3);
#define mm256_gather_32( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
((uint32_t*)(d))[0] = (uint32_t)(s0); \
((uint32_t*)(d))[1] = (uint32_t)(s1); \
((uint32_t*)(d))[2] = (uint32_t)(s2); \
((uint32_t*)(d))[3] = (uint32_t)(s3); \
((uint32_t*)(d))[4] = (uint32_t)(s4); \
((uint32_t*)(d))[5] = (uint32_t)(s5); \
((uint32_t*)(d))[6] = (uint32_t)(s6); \
((uint32_t*)(d))[7] = (uint32_t)(s7);
// Scatter data from contiguous memory.
// All arguments are pointers
#define mm256_scatter_64( d0, d1, d2, d3, s ) \
*((uint64_t*)(d0)) = ((uint64_t*)(s))[0]; \
*((uint64_t*)(d1)) = ((uint64_t*)(s))[1]; \
*((uint64_t*)(d2)) = ((uint64_t*)(s))[2]; \
*((uint64_t*)(d3)) = ((uint64_t*)(s))[3];
#define mm256_scatter_32( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
*((uint32_t*)(d0)) = ((uint32_t*)(s))[0]; \
*((uint32_t*)(d1)) = ((uint32_t*)(s))[1]; \
*((uint32_t*)(d2)) = ((uint32_t*)(s))[2]; \
*((uint32_t*)(d3)) = ((uint32_t*)(s))[3]; \
*((uint32_t*)(d4)) = ((uint32_t*)(s))[4]; \
*((uint32_t*)(d5)) = ((uint32_t*)(s))[5]; \
*((uint32_t*)(d6)) = ((uint32_t*)(s))[6]; \
*((uint32_t*)(d7)) = ((uint32_t*)(s))[7];
//
// Memory functions
// n = number of 256 bit (32 byte) vectors
static inline void memset_zero_256( __m256i *dst, int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = m256_zero; }
static inline void memset_256( __m256i *dst, const __m256i a, int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
//
// Bit rotations.
//
// The only bit shift for more than 64 bits is with __int128.
//
// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
// but is of little value
//
// Rotate each element of v by c bits
#define mm256_ror_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
#define mm256_ror_16( v, c ) \
_mm256_or_si256( _mm256_srli_epi16( v, c ), \
_mm256_slli_epi16( v, 16-(c) ) )
#define mm256_rol_16( v, c ) \
_mm256_or_si256( _mm256_slli_epi16( v, c ), \
_mm256_srli_epi16( v, 16-(c) ) )
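// A minimal sketch of the intent of these macros: mm256_ror_64 applies the
// same rotation a scalar hash round would use, but to 4 independent 64 bit
// lanes at once. Illustrative only; the rotate count 14 is arbitrary.
static inline __m256i mm256_ror64x4_sketch( __m256i v )
{
   return mm256_ror_64( v, 14 );   // rotr64( x, 14 ) in each of the 4 lanes
}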
// Rotate bits in each element of v by c bits using the variable shift
// instructions; the count is broadcast, so all elements rotate by the same
// amount.
#define mm256_rorv_64( v, c ) \
_mm256_or_si256( \
_mm256_srlv_epi64( v, _mm256_set1_epi64x( c ) ), \
_mm256_sllv_epi64( v, _mm256_set1_epi64x( 64-(c) ) ) )
#define mm256_rolv_64( v, c ) \
_mm256_or_si256( \
_mm256_sllv_epi64( v, _mm256_set1_epi64x( c ) ), \
_mm256_srlv_epi64( v, _mm256_set1_epi64x( 64-(c) ) ) )
#define mm256_rorv_32( v, c ) \
_mm256_or_si256( \
_mm256_srlv_epi32( v, _mm256_set1_epi32( c ) ), \
_mm256_sllv_epi32( v, _mm256_set1_epi32( 32-(c) ) ) )
#define mm256_rolv_32( v, c ) \
_mm256_or_si256( \
_mm256_sllv_epi32( v, _mm256_set1_epi32( c ) ), \
_mm256_srlv_epi32( v, _mm256_set1_epi32( 32-(c) ) ) )
// AVX512 can do 16 bit elements.
//
// Rotate elements across all lanes.
//
// AVX2 has no full vector permute for elements less than 32 bits.
// AVX512 has finer granularity full vector permutes.
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element.
#define mm256_ror_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 0,7,6,5, 4,3,2,1 ) )
#define mm256_rol_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 6,5,4,3, 2,1,0,7 ) )
// Rotate 256 bit vector by three 32 bit elements (96 bits).
#define mm256_ror_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 2,1,0,7, 6,5,4,3 ) )
#define mm256_rol_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 4,3,2,1, 0,7,6,5 ) )
// AVX512 can do 16 & 8 bit elements.
#if defined(__AVX512VL__)
// Rotate 256 bit vector by one 16 bit element.
#define mm256_ror_1x16( v ) \
_mm256_permutexvar_epi16( _mm256_set_epi16( \
0,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ), v )
#define mm256_rol_1x16( v ) \
_mm256_permutexvar_epi16( _mm256_set_epi16( \
14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,15 ), v )
// Rotate 256 bit vector by one byte.
#define mm256_ror_1x8( v ) \
_mm256_permutexvar_epi8( _mm256_set_epi8( \
0,31,30,29,28,27,26,25, 24,23,22,21,20,19,18,17, \
16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ), v )
#define mm256_rol_1x8( v ) \
_mm256_permutexvar_epi8( _mm256_set_epi8( \
30,29,28,27,26,25,24,23, 22,21,20,19,18,17,16,15, \
14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,31 ), v )
#endif // AVX512
// Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm256_invert_64( v ) _mm256_permute4x64_epi64( v, 0x1b )
#define mm256_invert_32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 0,1,2,3,4,5,6,7 ) )
// AVX512 can do 16 & 8 bit elements.
//
// Rotate elements within lanes of 256 bit vector.
// Swap 64 bit elements in each 128 bit lane.
#define mm256_swap64_128( v ) _mm256_shuffle_epi32( v, 0x4e )
// Rotate each 128 bit lane by one 32 bit element.
#define mm256_ror1x32_128( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_rol1x32_128( v ) _mm256_shuffle_epi32( v, 0x93 )
// Rotate each 128 bit lane by one 16 bit element.
#define mm256_rol1x16_128( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( \
13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,15,14, \
13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,15,14 ) )
#define mm256_ror1x16_128( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( \
1, 0,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, \
1, 0,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2 ) )
// Rotate each 128 bit lane by one byte
#define mm256_rol1x8_128( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( \
14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,15, \
14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,15 ) )
#define mm256_ror1x8_128( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( \
0,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, \
0,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ) )
// Rotate each 128 bit lane by c bytes.
#define mm256_bror_128( v, c ) \
_mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
_mm256_bslli_epi128( v, 16-(c) ) )
#define mm256_brol_128( v, c ) \
_mm256_or_si256( _mm256_bslli_epi128( v, c ), \
_mm256_bsrli_epi128( v, 16-(c) ) )
// Swap 32 bit elements in each 64 bit lane
#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_ror16_64( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( \
9, 8,15,14,13,12,11,10, 1, 0, 7, 6, 5, 4, 3, 2, \
9, 8,15,14,13,12,11,10, 1, 0, 7, 6, 5, 4, 3, 2 ) )
#define mm256_rol16_64( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( \
13,12,11,10, 9, 8,15,14, 5, 4, 3, 2, 1, 0, 7, 6, \
13,12,11,10, 9, 8,15,14, 5, 4, 3, 2, 1, 0, 7, 6 ) )
// Swap 16 bit elements in each 32 bit lane
#define mm256_swap16_32( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( \
13,12,15,14, 9, 8,11,10, 5, 4, 7, 6, 1, 0, 3, 2, \
13,12,15,14, 9, 8,11,10, 5, 4, 7, 6, 1, 0, 3, 2 ) )
//
// Swap bytes in vector elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( 8, 9,10,11,12,13,14,15, \
0, 1, 2, 3, 4, 5, 6, 7, \
8, 9,10,11,12,13,14,15, \
0, 1, 2, 3, 4, 5, 6, 7 ) )
#define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( 12,13,14,15, 8, 9,10,11, \
4, 5, 6, 7, 0, 1, 2, 3, \
12,13,14,15, 8, 9,10,11, \
4, 5, 6, 7, 0, 1, 2, 3 ) )
#define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( 14,15, 12,13, 10,11, 8, 9, \
6, 7, 4, 5, 2, 3, 0, 1, \
14,15, 12,13, 10,11, 8, 9, \
6, 7, 4, 5, 2, 3, 0, 1 ) )
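// Usage sketch (variable names are illustrative): convert 8 big endian 32 bit
// words, e.g. raw block header data, to little endian in one operation.
//
//   __m256i w = casti_m256i( header_be, 0 );   // header_be is assumed here
//   w = mm256_bswap_32( w );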
//
// Rotate two concatenated 256 bit vectors as one 512 bit vector by specified
// number of elements. Rotate is done in place, source arguments are
// overwritten.
// Some of these could use permute but that appears to be slower, possibly a
// Ryzen issue.
#define mm256_swap256_512( v1, v2 ) \
v1 = _mm256_xor_si256(v1, v2); \
v2 = _mm256_xor_si256(v1, v2); \
v1 = _mm256_xor_si256(v1, v2);
#define mm256_ror1x128_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 16 ); \
v1 = _mm256_alignr_epi8( v2, v1, 16 ); \
v2 = t; \
} while(0)
#define mm256_rol1x128_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 16 ); \
v2 = _mm256_alignr_epi8( v2, v1, 16 ); \
v1 = t; \
} while(0)
#define mm256_ror1x64_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 8 ); \
v1 = _mm256_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm256_rol1x64_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 24 ); \
v2 = _mm256_alignr_epi8( v2, v1, 24 ); \
v1 = t; \
} while(0)
#define mm256_ror1x32_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 4 ); \
v1 = _mm256_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm256_rol1x32_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 28 ); \
v2 = _mm256_alignr_epi8( v2, v1, 28 ); \
v1 = t; \
} while(0)
#define mm256_ror1x16_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 2 ); \
v1 = _mm256_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm256_rol1x16_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 30 ); \
v2 = _mm256_alignr_epi8( v2, v1, 30 ); \
v1 = t; \
} while(0)
#define mm256_ror1x8_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 1 ); \
v1 = _mm256_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm256_rol1x8_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 31 ); \
v2 = _mm256_alignr_epi8( v2, v1, 31 ); \
v1 = t; \
} while(0)
#endif // __AVX2__
#endif // SIMD_AVX2_H__

604
simd-utils/simd-avx512.h Normal file
View File

@@ -0,0 +1,604 @@
#if !defined(SIMD_AVX512_H__)
#define SIMD_AVX512_H__ 1
#if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
////////////////////////////////////////////////////////
//
// Some extensions in AVX512 supporting operations on
// smaller elements in 256 bit vectors.
// Rotate each 16 bit element by c bits using the AVX512VL variable shift
// instructions; the count is broadcast to all elements.
#define mm256_rorv_16( v, c ) \
_mm256_or_si256( \
_mm256_srlv_epi16( v, _mm256_set1_epi16( c ) ), \
_mm256_sllv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) )
#define mm256_rolv_16( v, c ) \
_mm256_or_si256( \
_mm256_sllv_epi16( v, _mm256_set1_epi16( c ) ), \
_mm256_srlv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) )
// Invert vector: {7,6,5,4,3,2,1,0} -> {0,1,2,3,4,5,6,7}
#define mm256_invert_16( v ) \
   _mm256_permutexvar_epi16( v, _mm256_set_epi16( 0, 1, 2, 3, 4, 5, 6, 7, \
                                                  8, 9,10,11,12,13,14,15 ) )
// mm256_invert_8 also needs AVX512VBMI in addition to the VL/BW guard above.
#define mm256_invert_8( v ) \
   _mm256_permutexvar_epi8( v, _mm256_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, \
                                                8, 9,10,11,12,13,14,15, \
                                               16,17,18,19,20,21,22,23, \
                                               24,25,26,27,28,29,30,31 ) )
//////////////////////////////////////////////////////////////
//
// AVX512 512 bit vectors
//
// Other AVX512 extensions that may be required for some functions.
// __AVX512VBMI__ __AVX512VAES__
//
// Experimental, not fully tested.
//
// Compile time vector constants and initializers.
//
// The following macro constants and functions should only be used
// for compile time initialization of constant and variable vector
// arrays. These constants are loaded from memory; at run time use the set
// intrinsics or the pseudo constants below to avoid the memory access.
// Constant initializers
#define mm512_const_64( x7, x6, x5, x4, x3, x2, x1, x0 ) \
{{ x7, x6, x5, x4, x3, x2, x1, x0 }}
#define mm512_const1_64( x ) {{ x,x,x,x, x,x,x,x }}
#define mm512_const_32( x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
   {{ x15, x14, x13, x12, x11, x10, x09, x08, \
      x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm512_const1_32( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
#define mm512_const_16( x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
{{ x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm512_const1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, \
x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
#define mm512_const_8( x63, x62, x61, x60, x59, x58, x57, x56, \
x55, x54, x53, x52, x51, x50, x49, x48, \
x47, x46, x45, x44, x43, x42, x41, x40, \
x39, x38, x37, x36, x35, x34, x33, x32, \
x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 ) \
{{ x63, x62, x61, x60, x59, x58, x57, x56, \
x55, x54, x53, x52, x51, x50, x49, x48, \
x47, x46, x45, x44, x43, x42, x41, x40, \
x39, x38, x37, x36, x35, x34, x33, x32, \
x31, x30, x29, x28, x27, x26, x25, x24, \
x23, x22, x21, x20, x19, x18, x17, x16, \
x15, x14, x13, x12, x11, x10, x09, x08, \
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm512_const1_8( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, \
x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, \
x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, \
x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
// Predefined compile time constant vectors.
#define c512_zero mm512_const1_64( 0ULL )
#define c512_neg1 mm512_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c512_one_512 mm512_const_64( 0ULL, 0ULL, 0ULL, 0ULL, \
0ULL, 0ULL, 0ULL, 1ULL )
#define c512_one_256 mm512_const_64( 0ULL, 0ULL, 0ULL, 1ULL, \
0ULL, 0ULL, 0ULL, 1ULL )
#define c512_one_128 mm512_const_64( 0ULL, 1ULL, 0ULL, 1ULL, \
0ULL, 1ULL, 0ULL, 1ULL )
#define c512_one_64 mm512_const1_64( 1ULL )
#define c512_one_32 mm512_const1_32( 1UL )
#define c512_one_16 mm512_const1_16( 1U )
#define c512_one_8 mm512_const1_8( 1U )
#define c512_neg1_64 mm512_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c512_neg1_32 mm512_const1_32( 0xFFFFFFFFUL )
#define c512_neg1_16 mm512_const1_16( 0xFFFFU )
#define c512_neg1_8 mm512_const1_8( 0xFFU )
//
// Pseudo constants.
// _mm512_setzero_si512 uses xor instruction. If needed frequently
// in a function it is better to define a register variable (const?)
// initialized to zero.
// It isn't clear to me yet how set or set1 actually work.
#define m512_zero _mm512_setzero_si512()
#define m512_one_512 _mm512_set_epi64( 0ULL, 0ULL, 0ULL, 0ULL, \
0ULL, 0ULL, 0ULL, 1ULL )
#define m512_one_256 _mm512_set4_epi64( 0ULL, 0ULL, 0ULL, 1ULL )
#define m512_one_128 _mm512_set4_epi64( 0ULL, 1ULL, 0ULL, 1ULL )
#define m512_one_64 _mm512_set1_epi64( 1ULL )
#define m512_one_32 _mm512_set1_epi32( 1UL )
#define m512_one_16 _mm512_set1_epi16( 1U )
#define m512_one_8 _mm512_set1_epi8( 1U )
#define m512_neg1 _mm512_set1_epi64( 0xFFFFFFFFFFFFFFFFULL )
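// Example (illustrative, not part of the original header): when a pseudo
// constant is used repeatedly in a hot loop, hoist it into a register
// variable once instead of re-materializing it. Names are hypothetical.
static inline void mm512_add_one_example( __m512i *d, const __m512i *s,
                                          const int n )
{
   const __m512i one = m512_one_64;      // set1 executed once, outside the loop
   for ( int i = 0; i < n; i++ )
      d[i] = _mm512_add_epi64( s[i], one );
}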
//
// Basic operations without SIMD equivalent
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
//
// Pointer casting
// p = any aligned pointer
// i = scaled array index
// o = scaled address offset
// returns p as pointer to vector
#define castp_m512i(p) ((__m512i*)(p))
// returns *p as vector value
#define cast_m512i(p) (*((__m512i*)(p)))
// returns p[i] as vector value
#define casti_m512i(p,i) (((__m512i*)(p))[(i)])
// returns p+o as pointer to vector
#define casto_m512i(p,o) (((__m512i*)(p))+(o))
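// Example (illustrative, not part of the original header): scaled indexing
// with casti_m512i to process a byte buffer as 512 bit vectors. The function
// name is hypothetical; data must be 64 byte aligned.
static inline void mm512_block_xor_example( void *dst, const void *src,
                                            const int n_vec )
{
   for ( int i = 0; i < n_vec; i++ )
      casti_m512i( dst, i ) = _mm512_xor_si512( casti_m512i( dst, i ),
                                                casti_m512i( src, i ) );
}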
// Gather scatter
#define mm512_gather_64( d, s0, s1, s2, s3, s4, s5, s6, s7 ) \
((uint64_t*)(d))[0] = (uint64_t)(s0); \
((uint64_t*)(d))[1] = (uint64_t)(s1); \
((uint64_t*)(d))[2] = (uint64_t)(s2); \
((uint64_t*)(d))[3] = (uint64_t)(s3); \
((uint64_t*)(d))[4] = (uint64_t)(s4); \
((uint64_t*)(d))[5] = (uint64_t)(s5); \
((uint64_t*)(d))[6] = (uint64_t)(s6); \
((uint64_t*)(d))[7] = (uint64_t)(s7);
#define mm512_gather_32( d, s00, s01, s02, s03, s04, s05, s06, s07, \
s08, s09, s10, s11, s12, s13, s14, s15 ) \
((uint32_t*)(d))[ 0] = (uint32_t)(s00); \
((uint32_t*)(d))[ 1] = (uint32_t)(s01); \
((uint32_t*)(d))[ 2] = (uint32_t)(s02); \
((uint32_t*)(d))[ 3] = (uint32_t)(s03); \
((uint32_t*)(d))[ 4] = (uint32_t)(s04); \
((uint32_t*)(d))[ 5] = (uint32_t)(s05); \
((uint32_t*)(d))[ 6] = (uint32_t)(s06); \
((uint32_t*)(d))[ 7] = (uint32_t)(s07); \
((uint32_t*)(d))[ 8] = (uint32_t)(s08); \
((uint32_t*)(d))[ 9] = (uint32_t)(s09); \
((uint32_t*)(d))[10] = (uint32_t)(s10); \
((uint32_t*)(d))[11] = (uint32_t)(s11); \
((uint32_t*)(d))[12] = (uint32_t)(s12); \
((uint32_t*)(d))[13] = (uint32_t)(s13); \
((uint32_t*)(d))[14] = (uint32_t)(s14); \
((uint32_t*)(d))[15] = (uint32_t)(s15);
// Scatter data from contiguous memory.
// All arguments are pointers
#define mm512_scatter_64( d0, d1, d2, d3, d4, d5, d6, d7, s ) \
*((uint64_t*)(d0)) = ((uint64_t*)(s))[0]; \
*((uint64_t*)(d1)) = ((uint64_t*)(s))[1]; \
*((uint64_t*)(d2)) = ((uint64_t*)(s))[2]; \
*((uint64_t*)(d3)) = ((uint64_t*)(s))[3]; \
*((uint64_t*)(d4)) = ((uint64_t*)(s))[4]; \
*((uint64_t*)(d5)) = ((uint64_t*)(s))[5]; \
*((uint64_t*)(d6)) = ((uint64_t*)(s))[6]; \
*((uint64_t*)(d7)) = ((uint64_t*)(s))[7];
#define mm512_scatter_32( d00, d01, d02, d03, d04, d05, d06, d07, \
d08, d09, d10, d11, d12, d13, d14, d15, s ) \
*((uint32_t*)(d00)) = ((uint32_t*)(s))[ 0]; \
*((uint32_t*)(d01)) = ((uint32_t*)(s))[ 1]; \
*((uint32_t*)(d02)) = ((uint32_t*)(s))[ 2]; \
*((uint32_t*)(d03)) = ((uint32_t*)(s))[ 3]; \
*((uint32_t*)(d04)) = ((uint32_t*)(s))[ 4]; \
*((uint32_t*)(d05)) = ((uint32_t*)(s))[ 5]; \
*((uint32_t*)(d06)) = ((uint32_t*)(s))[ 6]; \
*((uint32_t*)(d07)) = ((uint32_t*)(s))[ 7]; \
*((uint32_t*)(d08)) = ((uint32_t*)(s))[ 8]; \
*((uint32_t*)(d09)) = ((uint32_t*)(s))[ 9]; \
*((uint32_t*)(d10)) = ((uint32_t*)(s))[10]; \
*((uint32_t*)(d11)) = ((uint32_t*)(s))[11]; \
*((uint32_t*)(d12)) = ((uint32_t*)(s))[12]; \
*((uint32_t*)(d13)) = ((uint32_t*)(s))[13]; \
*((uint32_t*)(d14)) = ((uint32_t*)(s))[14]; \
*((uint32_t*)(d15)) = ((uint32_t*)(s))[15];
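// Example (illustrative, not part of the original header): collect the same
// 64 bit word from 8 independent lane buffers into contiguous memory so it
// can be loaded as one __m512i, the typical interleaving step for 8 way
// hashing. Names and the lane layout are hypothetical.
static inline __m512i mm512_gather_example( const uint64_t *l0,
               const uint64_t *l1, const uint64_t *l2, const uint64_t *l3,
               const uint64_t *l4, const uint64_t *l5, const uint64_t *l6,
               const uint64_t *l7, const int w )
{
   uint64_t buf[8] __attribute__ ((aligned (64)));
   mm512_gather_64( buf, l0[w], l1[w], l2[w], l3[w],
                         l4[w], l5[w], l6[w], l7[w] );
   return cast_m512i( buf );
}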
//
// Bit rotations.
// AVX512F has built-in bit fixed and variable rotation for 64 & 32 bit
// elements. There is no bit rotation or shift for larger elements.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
//
// Here is a bit rotate for 16 bit elements:
#define mm512_ror_16( v, c ) \
   _mm512_or_si512( _mm512_srli_epi16( v, c ), \
                    _mm512_slli_epi16( v, 16-(c) ) )
#define mm512_rol_16( v, c ) \
   _mm512_or_si512( _mm512_slli_epi16( v, c ), \
                    _mm512_srli_epi16( v, 16-(c) ) )
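// Example (illustrative, not part of the original header): fixed rotates as
// used in typical hash rounds. 64 and 32 bit elements use the AVX512F
// intrinsics directly, 16 bit elements use the emulated rotate above.
// The function name and rotate counts are hypothetical.
static inline __m512i mm512_rotate_example( __m512i x )
{
   __m512i a = _mm512_ror_epi64( x, 14 );   // built in, 64 bit lanes
   __m512i b = _mm512_ror_epi32( x,  7 );   // built in, 32 bit lanes
   __m512i c = mm512_ror_16( x, 3 );        // emulated, 16 bit lanes
   return _mm512_xor_si512( a, _mm512_xor_si512( b, c ) );
}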
//
// Rotate elements in 512 bit vector.
#define mm512_swap_256( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 3,2,1,0, 7,6,5,4 ) )
#define mm512_ror_1x128( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 1,0, 7,6, 5,4, 3,2 ) )
#define mm512_rol_1x128( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 5,4, 3,2, 1,0, 7,6 ) )
#define mm512_ror_1x64( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 0,7,6,5,4,3,2,1 ) )
#define mm512_rol_1x64( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 6,5,4,3,2,1,0,7 ) )
#define mm512_ror_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
0,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm512_rol_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15 ) )
// Although documented to exist in AVX512F the _mm512_set_epi8 &
// _mm512_set_epi16 intrinsics fail to compile. They would be useful
// for endian byte swapping. Work around this by packing the indexes
// into _mm512_set_epi32 arguments. Ugly but it works.
#define mm512_ror_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi32( \
0x0000001F, 0x001E001D, 0x001C001B, 0x001A0019, \
0X00180017, 0X00160015, 0X00140013, 0X00120011, \
0X0010000F, 0X000E000D, 0X000C000B, 0X000A0009, \
0X00080007, 0X00060005, 0X00040003, 0X00020001 ) )
#define mm512_rol_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi32( \
0x001E001D, 0x001C001B, 0x001A0019, 0x00180017, \
0X00160015, 0X00140013, 0X00120011, 0x0010000F, \
0X000E000D, 0X000C000B, 0X000A0009, 0X00080007, \
0X00060005, 0X00040003, 0X00020001, 0x0000001F ) )
#define mm512_ror_1x8( v ) \
   _mm512_permutexvar_epi8( v, _mm512_set_epi32( \
                   0x003F3E3D, 0x3C3B3A39, 0x38373635, 0x34333231, \
                   0x302F2E2D, 0x2C2B2A29, 0x28272625, 0x24232221, \
                   0x201F1E1D, 0x1C1B1A19, 0x18171615, 0x14131211, \
                   0x100F0E0D, 0x0C0B0A09, 0x08070605, 0x04030201 ) )
#define mm512_rol_1x8( v ) \
   _mm512_permutexvar_epi8( v, _mm512_set_epi32( \
                   0x3E3D3C3B, 0x3A393837, 0x36353433, 0x3231302F, \
                   0x2E2D2C2B, 0x2A292827, 0x26252423, 0x2221201F, \
                   0x1E1D1C1B, 0x1A191817, 0x16151413, 0x1211100F, \
                   0x0E0D0C0B, 0x0A090807, 0x06050403, 0x0201003F ) )
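// Illustration (assumption, not from the original header): how byte indexes
// pack into the 32 bit constants used above. Four destination byte selectors
// pack into one _mm512_set_epi32 argument, most significant byte first, so
// pack4_idx8_example( 0x00, 0x3F, 0x3E, 0x3D ) == 0x003F3E3D, the first
// constant of mm512_ror_1x8. The helper name is hypothetical.
static inline uint32_t pack4_idx8_example( uint8_t b3, uint8_t b2,
                                           uint8_t b1, uint8_t b0 )
{
   return ( (uint32_t)b3 << 24 ) | ( (uint32_t)b2 << 16 )
        | ( (uint32_t)b1 <<  8 ) |   (uint32_t)b0;
}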
// Invert vector: {3,2,1,0} -> {0,1,2,3}
// _mm512_permute4f128_epi32 exists only on KNC; use shuffle_i64x2 instead.
#define mm512_invert_128( v ) _mm512_shuffle_i64x2( v, v, 0x1b )
#define mm512_invert_64( v ) \
   _mm512_permutexvar_epi64( v, _mm512_set_epi64( 0,1,2,3,4,5,6,7 ) )
#define mm512_invert_32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 ) )
#define mm512_invert_16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi32( \
0x00000001, 0x00020003, 0x00040005, 0x00060007, \
0x00080009, 0x000A000B, 0x000C000D, 0x000E000F, \
0x00100011, 0x00120013, 0x00140015, 0x00160017, \
0x00180019, 0x001A001B, 0x001C001D, 0x001E001F ) )
#define mm512_invert_8( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F, \
0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F, \
0x20212223, 0x24252627, 0x28292A2B, 0x2C2D2E2F, \
0x30313233, 0x34353637, 0x38393A3B, 0x3C3D3E3F ) )
//
// Rotate elements within 256 bit lanes of 512 bit vector.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap128_256( v ) _mm512_permutex_epi64( v, 0x4e )
// Rotate 256 bit lanes by one 64 bit element
#define mm512_ror1x64_256( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_rol1x64_256( v ) _mm512_permutex_epi64( v, 0x93 )
// Rotate 256 bit lanes by one 32 bit element
#define mm512_ror1x32_256( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
8,15,14,13,12,11,10, 9, 0, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm512_rol1x32_256( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
14,13,12,11,10, 9, 8,15, 6, 5, 4, 3, 2, 1, 0, 7 ) )
#define mm512_ror1x16_256( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi32( \
0x0010001F, 0x001E001D, 0x001C001B, 0x001A0019, \
0x00180017, 0x00160015, 0x00140013, 0x00120011, \
0x0000000F, 0x000E000D, 0x000C000B, 0x000A0009, \
0x00080007, 0x00060005, 0x00040003, 0x00020001 ) )
#define mm512_rol1x16_256( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi32( \
0x001E001D, 0x001C001B, 0x001A0019, 0x00180017, \
                   0x00160015, 0x00140013, 0x00120011, 0x0010001F, \
                   0x000E000D, 0x000C000B, 0x000A0009, 0x00080007, \
                   0x00060005, 0x00040003, 0x00020001, 0x0000000F ) )
#define mm512_ror1x8_256( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
0x203F3E3D, 0x3C3B3A39, 0x38373635, 0x34333231, \
0x302F2E2D, 0x2C2B2A29, 0x28272625, 0x24232221, \
0x001F1E1D, 0x1C1B1A19, 0x18171615, 0x14131211, \
0x100F0E0D, 0x0C0B0A09, 0x08070605, 0x04030201 ) )
#define mm512_rol1x8_256( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
0x3E3D3C3B, 0x3A393837, 0x36353433, 0x3231302F, \
0x2E2D2C2B, 0x2A292827, 0x26252423, 0x2221203F, \
0x1E1D1C1B, 0x1A191817, 0x16151413, 0x1211100F, \
0x0E0D0C0B, 0x0A090807, 0x06050403, 0x0201001F ) )
//
// Rotate elements within 128 bit lanes of 512 bit vector.
// Swap hi & lo 64 bits in each 128 bit lane
#define mm512_swap64_128( v ) _mm512_permutex_epi64( v, 0xb1 )
// Rotate 128 bit lanes by one 32 bit element
#define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 )
#define mm512_ror1x16_128( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi32( \
0x0018001F, 0x001E001D, 0x001C001B, 0x001A0019, \
0x00100017, 0x00160015, 0x00140013, 0x00120011, \
0x0008000F, 0x000E000D, 0x000C000B, 0x000A0009, \
0x00000007, 0x00060005, 0x00040003, 0x00020001 ) )
#define mm512_rol1x16_128( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi32( \
0x001E001D, 0x001C001B, 0x001A0019, 0x0018001F, \
0x00160015, 0x00140013, 0x00120011, 0x00100017, \
0x000E000D, 0x000C000B, 0x000A0009, 0x0008000F, \
0x00060005, 0x00040003, 0x00020001, 0x00000007 ) )
#define mm512_ror1x8_128( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
0x303F3E3D, 0x3C3B3A39, 0x38373635, 0x34333231, \
0x202F2E2D, 0x2C2B2A29, 0x28272625, 0x24232221, \
0x101F1E1D, 0x1C1B1A19, 0x18171615, 0x14131211, \
0x000F0E0D, 0x0C0B0A09, 0x08070605, 0x04030201 ) )
#define mm512_rol1x8_128( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
                   0x3E3D3C3B, 0x3A393837, 0x36353433, 0x3231303F, \
0x2E2D2C2B, 0x2A292827, 0x26252423, 0x2221202F, \
0x1E1D1C1B, 0x1A191817, 0x16151413, 0x1211101F, \
0x0E0D0C0B, 0x0A090807, 0x06050403, 0x0201000F ) )
// Rotate 128 bit lanes by c bytes.
#define mm512_bror_128( v, c ) \
_mm512_or_si512( _mm512_bsrli_epi128( v, c ), \
_mm512_bslli_epi128( v, 16-(c) ) )
#define mm512_brol_128( v, c ) \
_mm512_or_si512( _mm512_bslli_epi128( v, c ), \
_mm512_bsrli_epi128( v, 16-(c) ) )
//
// Rotate elements within 64 bit lanes.
// Swap 32 bit elements in each 64 bit lane
#define mm512_swap32_64( v ) _mm512_shuffle_epi32( v, 0xb1 )
// _mm512_set_epi8 doesn't seem to work
// Rotate each 64 bit lane by one 16 bit element.
#define mm512_ror1x16_64( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
0x39383F3E, 0x3D3C3B3A, 0x31303736, 0x35343332, \
0x29282F2E, 0x2D2C2B2A, 0x21202726, 0x25242322, \
0x19181F1E, 0x1D1C1B1A, 0x11101716, 0x15141312, \
0x09080F0E, 0x0D0C0B0A, 0x01000706, 0x05040302 ) )
#define mm512_rol1x16_64( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
                   0x3D3C3B3A, 0x39383F3E, 0x35343332, 0x31303736, \
                   0x2D2C2B2A, 0x29282F2E, 0x25242322, 0x21202726, \
                   0x1D1C1B1A, 0x19181F1E, 0x15141312, 0x11101716, \
0x0D0C0B0A, 0x09080F0E, 0x05040302, 0x01000706 ) )
// Rotate each 64 bit lane by one byte.
#define mm512_ror1x8_64( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
0x383F3E3D, 0x3C3B3A39, 0x30373635, 0x34333231, \
0x282F2E2D, 0x2C2B2A29, 0x20272625, 0x24232221, \
0x181F1E1D, 0x1C1B1A19, 0x10171615, 0x14131211, \
                   0x080F0E0D, 0x0C0B0A09, 0x00070605, 0x04030201 ) )
#define mm512_rol1x8_64( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
0x3E3D3C3B, 0x3A39383F, 0x36353433, 0x32313037, \
0x2E2D2C2B, 0x2A29282F, 0x26252423, 0x22212027, \
0x1E1D1C1B, 0x1A19181F, 0x16151413, 0x12111017, \
                   0x0E0D0C0B, 0x0A09080F, 0x06050403, 0x02010007 ) )
//
// Rotate elements within 32 bit lanes.
#define mm512_swap16_32( v ) \
   _mm512_permutexvar_epi16( v, _mm512_set_epi32( \
                   0x001E001F, 0x001C001D, 0x001A001B, 0x00180019, \
                   0x00160017, 0x00140015, 0x00120013, 0x00100011, \
                   0x000E000F, 0x000C000D, 0x000A000B, 0x00080009, \
                   0x00060007, 0x00040005, 0x00020003, 0x00000001 ) )
#define mm512_ror1x8_32( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
0x3C3F3E3D, 0x383B3A39, 0x34373635, 0x30333231, \
0x2C2F2E2D, 0x282B2A29, 0x24272625, 0x20232221, \
0x1C1F1E1D, 0x181B1A19, 0x14171615, 0x10131211, \
0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201 ) )
#define mm512_rol1x8_32( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
0x3E3D3C3F, 0x3A39383B, 0x36353437, 0x32313033, \
0x2E2D2C2F, 0x2A29282B, 0x26252427, 0x22212023, \
0x1E1D1C1F, 0x1A19181B, 0x16151417, 0x12111013, \
0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003 ) )
//
// Swap bytes in vector elements, vectorized bswap.
#define mm512_bswap_64( v ) \
   _mm512_permutexvar_epi8( v, _mm512_set_epi32( \
                   0x38393A3B, 0x3C3D3E3F, 0x30313233, 0x34353637, \
                   0x28292A2B, 0x2C2D2E2F, 0x20212223, 0x24252627, \
                   0x18191A1B, 0x1C1D1E1F, 0x10111213, 0x14151617, \
                   0x08090A0B, 0x0C0D0E0F, 0x00010203, 0x04050607 ) )
#define mm512_bswap_32( v ) \
   _mm512_permutexvar_epi8( v, _mm512_set_epi32( \
                   0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \
                   0x2C2D2E2F, 0x28292A2B, 0x24252627, 0x20212223, \
                   0x1C1D1E1F, 0x18191A1B, 0x14151617, 0x10111213, \
                   0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203 ) )
#define mm512_bswap_16( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
0x3E3F3C3D, 0x3A3B3839, 0x36373435, 0x32333031, \
0x2E2F2C2D, 0x2A2B2829, 0x26272425, 0x22232021, \
0x1E1F1C1D, 0x1A1B1819, 0x16171415, 0x12131011, \
0x0E0F0C0D, 0x0A0B0809, 0x06070405, 0x02030001 ) )
//
// Rotate elements from 2 512 bit vectors in place, source arguments
// are overwritten.
// These can all be done with 2 permutex2var instructions but they are
// slower than either xor or alignr.
#define mm512_swap512_1024(v1, v2) \
v1 = _mm512_xor_si512(v1, v2); \
v2 = _mm512_xor_si512(v1, v2); \
v1 = _mm512_xor_si512(v1, v2);
#define mm512_ror1x256_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm512_rol1x256_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
v1 = t; \
} while(0)
#define mm512_ror1x128_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm512_rol1x128_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
v1 = t; \
} while(0)
#define mm512_ror1x64_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1x64_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
v1 = t; \
} while(0)
#define mm512_ror1x32_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1x32_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
v2 = _mm512_alignr_epi32( v2, v1, 15 ); \
v1 = t; \
} while(0)
#define mm512_ror1x16_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 2 ); \
v1 = _mm512_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm512_rol1x16_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 62 ); \
v2 = _mm512_alignr_epi8( v2, v1, 62 ); \
v1 = t; \
} while(0)
#define mm512_ror1x8_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 1 ); \
v1 = _mm512_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1x8_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 63 ); \
v2 = _mm512_alignr_epi8( v2, v1, 63 ); \
v1 = t; \
} while(0)
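// Usage sketch (illustrative, not part of the original header): rotate a
// 1024 bit state held in two 512 bit registers right by one 64 bit element.
// The wrapper name is hypothetical; both inputs are overwritten in place.
static inline void mm512_ror1x64_1024_example( __m512i *hi, __m512i *lo )
{
   __m512i h = *hi, l = *lo;
   mm512_ror1x64_1024( h, l );
   *hi = h;
   *lo = l;
}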
#endif // AVX512
#endif // SIMD_AVX512_H__

84
simd-utils/simd-int.h Normal file
View File

@@ -0,0 +1,84 @@
#if !defined(SIMD_SCALAR_H__)
#define SIMD_SCALAR_H__ 1
///////////////////////////////////
//
// Integers up to 64 bits.
//
// MMX has no extract instruction for 32 bit elements so do it manually:
// lo is trivial, hi is a simple shift.
// Input may be uint64_t or __m64, returns uint32_t.
#define u64_extr_lo32(a) ( (uint32_t)( (uint64_t)(a) ) )
#define u64_extr_hi32(a) ( (uint32_t)( ((uint64_t)(a)) >> 32) )
#define u64_extr_32( a, n ) ( (uint32_t)( (a) >> ( ( 2-(n)) <<5 ) ) )
#define u64_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 4-(n)) <<4 ) ) )
#define u64_extr_8( a, n ) ( (uint8_t) ( (a) >> ( ( 8-(n)) <<3 ) ) )
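// Example (illustrative, not part of the original header): split a 64 bit
// value into its two 32 bit words. Names are hypothetical.
static inline void u64_split_example( const uint64_t n, uint32_t *hi,
                                      uint32_t *lo )
{
   *hi = u64_extr_hi32( n );
   *lo = u64_extr_lo32( n );
}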
// Rotate bits in various sized integers.
#define u64_ror_64( x, c ) \
(uint64_t)( ( (uint64_t)(x) >> (c) ) | ( (uint64_t)(x) << (64-(c)) ) )
#define u64_rol_64( x, c ) \
(uint64_t)( ( (uint64_t)(x) << (c) ) | ( (uint64_t)(x) >> (64-(c)) ) )
#define u32_ror_32( x, c ) \
(uint32_t)( ( (uint32_t)(x) >> (c) ) | ( (uint32_t)(x) << (32-(c)) ) )
#define u32_rol_32( x, c ) \
(uint32_t)( ( (uint32_t)(x) << (c) ) | ( (uint32_t)(x) >> (32-(c)) ) )
#define u16_ror_16( x, c ) \
(uint16_t)( ( (uint16_t)(x) >> (c) ) | ( (uint16_t)(x) << (16-(c)) ) )
#define u16_rol_16( x, c ) \
(uint16_t)( ( (uint16_t)(x) << (c) ) | ( (uint16_t)(x) >> (16-(c)) ) )
#define u8_ror_8( x, c ) \
(uint8_t) ( ( (uint8_t) (x) >> (c) ) | ( (uint8_t) (x) << ( 8-(c)) ) )
#define u8_rol_8( x, c ) \
(uint8_t) ( ( (uint8_t) (x) << (c) ) | ( (uint8_t) (x) >> ( 8-(c)) ) )
// 64 bit mem functions use integral sizes instead of bytes, data must
// be aligned to 64 bits. Mostly for scaled indexing convenience.
static inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = src[i]; }
static inline void memset_zero_64( uint64_t *src, int n )
{ for ( int i = 0; i < n; i++ ) src[i] = 0ull; }
static inline void memset_64( uint64_t *dst, const uint64_t a, int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
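// Example (illustrative, not part of the original header): the sizes are in
// 64 bit words, not bytes. Names and array sizes are hypothetical.
static inline void init_state_example( uint64_t *state, const uint64_t *iv )
{
   memset_zero_64( state, 16 );   // clear 16 words (128 bytes)
   memcpy_64( state, iv, 8 );     // copy an 8 word IV
}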
#if defined (GCC_INT128)
///////////////////////////////////////
//
// 128 bit integers
//
// No real need or use.
#define i128_neg1 ((uint128_t)(-1LL))
// Extract specified 64 bit half of 128 bit integer.
// typecast should work for lo: (uint64_t)(x), test it!
#define u128_hi64( x ) ( (uint64_t)( (uint128_t)(x) >> 64 ) )
#define u128_lo64( x ) ( (uint64_t)( (uint128_t)(x) << 64 >> 64 ) )
// #define i128_lo64( x ) ((uint64_t)(x))
// Generic extract,
#define u128_extr_64( a, n ) ( (uint64_t)( (a) >> ( ( 2-(n)) <<6 ) ) )
#define u128_extr_32( a, n ) ( (uint32_t)( (a) >> ( ( 4-(n)) <<5 ) ) )
#define u128_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 8-(n)) <<4 ) ) )
#define u128_extr_8( a, n ) ( (uint8_t) ( (a) >> ( (16-(n)) <<3 ) ) )
// Not much need for this but it fills a gap.
#define u128_ror_128( x, c ) \
( ( (uint128_t)(x) >> (c) ) | ( (uint128_t)(x) << (128-(c)) ) )
#define u128_rol_128( x, c ) \
( ( (uint128_t)(x) << (c) ) | ( (uint128_t)(x) >> (128-(c)) ) )
#endif // GCC_INT128
#endif // SIMD_SCALAR_H__

Some files were not shown because too many files have changed in this diff.