v3.9.2.4

v3.9.2.3
v3.9.2.2
2025-09-17 23:44:27 +00:00 · 2019-06-07 23:30:38 -04:00 · 2019-06-05 12:20:04 -04:00 · 2019-06-04 17:14:03 -04:00 · 2019-06-04 16:56:44 -04:00 · 2019-06-03 21:36:33 -04:00
69 changed files with 4150 additions and 5086 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -68,7 +68,8 @@ cpuminer_SOURCES = \
  algo/blake/pentablake-4way.c \
  algo/blake/pentablake.c \
  algo/bmw/sph_bmw.c \
-  algo/bmw/bmw-hash-4way.c \
+  algo/bmw/bmw256-hash-4way.c \
+  algo/bmw/bmw512-hash-4way.c \
  algo/bmw/bmw256.c \
  algo/cryptonight/cryptolight.c \
  algo/cryptonight/cryptonight-common.c\
@@ -162,10 +163,13 @@ cpuminer_SOURCES = \
  algo/sha/sph_sha2.c \
  algo/sha/sph_sha2big.c \
  algo/sha/sha2-hash-4way.c \
+  algo/sha/sha256_hash_11way.c \
  algo/sha/sha2.c \
  algo/sha/sha256t-gate.c \
  algo/sha/sha256t-4way.c \
  algo/sha/sha256t.c \
+  algo/sha/sha256q-4way.c \
+  algo/sha/sha256q.c \
  algo/shabal/sph_shabal.c \
  algo/shabal/shabal-hash-4way.c \
  algo/shavite/sph_shavite.c \
@@ -262,7 +266,7 @@ cpuminer_SOURCES = \
  algo/yescrypt/sha256_Y.c \
  algo/yescrypt/yescrypt-best.c \
  algo/yespower/yespower.c \
-  algo/yespower/sha256.c \
+  algo/yespower/sha256_p.c \
  algo/yespower/yespower-opt.c

 disable_flags =
--- a/README.txt
+++ b/README.txt
@@ -12,7 +12,7 @@ the software, don't use it.
 Choose the exe that best matches you CPU's features or use trial and
 error to find the fastest one that doesn't crash. Pay attention to
 the features listed at cpuminer startup to ensure you are mining at
-optimum speed using all the available features.
+optimum speed using the best available features.

 Architecture names and compile options used are only provided for Intel
 Core series. Even the newest Pentium and Celeron CPUs are often missing
@@ -22,8 +22,6 @@ AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
 supported by cpuminer-opt due to an incompatible implementation of SSE2 on
 these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.
-Changes in v3.8.4 may have improved compatibility with some of these CPUs.
-

 Exe name                Compile flags            Arch name

--- a/35
+++ b/35
@@ -33,11 +33,44 @@ Requirements
 Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
 supported.

-64 bit Linux or Windows operating system. Apple is not supported.
+64 bit Linux or Windows operating system. Apple and Android are not supported.

 Change Log
 ----------

+v3.9.2.4
+
+Yet another affinity fix. Hopefully the last one.
+
+v3.9.2.3
+
+Another cpu-affinity fix.
+Disabled test code that fails to compile on some CPUs with limited
+AVX512 capabilities.
+
+v3.9.2.2
+
+Fixed some day one cpu-affinity issues.
+
+v3.9.2
+
+Added sha256q algo.
+Yespower now uses openssl SHA256, but no observable hash rate increase
+on Ryzen.
+Ongoing rearchitecting.
+Lyra2z now hashes 8-way on CPUs with AVX2.
+Lyra2 (all including phi2) now runs optimized code with SSE2.
+
+v3.9.1.1
+
+Fixed lyra2v3 AVX and below.
+
+Compiling on Windows using Cygwin now works. Simply use "./build.sh"
+just like on Linux. It isn't portable, therefore the binaries package will
+continue to use the existing procedure.
+The Cygwin procedure will be documented in more detail later and will
+include a list of packages that need to be installed.
+
 v3.9.1

 Fixed AVX2 version of anime algo.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -210,6 +210,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_SCRYPTJANE:   register_scryptjane_algo   ( gate ); break;
     case ALGO_SHA256D:      register_sha256d_algo      ( gate ); break;
     case ALGO_SHA256T:      register_sha256t_algo      ( gate ); break;
+     case ALGO_SHA256Q:      register_sha256q_algo      ( gate ); break;
     case ALGO_SHAVITE3:     register_shavite_algo      ( gate ); break;
     case ALGO_SKEIN:        register_skein_algo        ( gate ); break;
     case ALGO_SKEIN2:       register_skein2_algo       ( gate ); break;
@@ -344,9 +345,9 @@ const char* const algo_alias_map[][2] =
  { NULL,                NULL           }   
 };

-// if arg is a valid alias for a known algo it is updated with the proper name.
-// No validation of the algo or alias is done, It is the responsinility of the
-// calling function to validate the algo after return.
+// if arg is a valid alias for a known algo it is updated with the proper
+// name. No validation of the algo or alias is done. It is the responsibility
+// of the calling function to validate the algo after return.
 void get_algo_alias( char** algo_or_alias )
 {
  int i;
@@ -361,3 +362,22 @@ void get_algo_alias( char** algo_or_alias )

 #undef ALIAS
 #undef PROPER
+
+// Only for parallel (multi-lane) hashing: submits the work, logs thread & lane.
+bool submit_solution( struct work *work, void *hash,
+                      struct thr_info *thr, int lane )
+{
+     work_set_target_ratio( work, hash );
+     if ( submit_work( thr, work ) )
+     {
+         applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
+                 accepted_share_count + rejected_share_count + 1,
+                 thr->id, lane );
+         return true;
+     }
+     else
+          applog( LOG_WARNING, "Failed to submit share." );
+     return false;
+}
+
+
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -196,8 +196,9 @@ void four_way_not_tested();
 int null_scanhash();

 // The one and only, a callback for scanhash.
-
-
+bool submit_solution( struct work *work, void *hash,
+                      struct thr_info *thr, int lane );
+ 
 bool submit_work( struct thr_info *thr, const struct work *work_in );

 // displays warning
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -41,7 +41,6 @@ extern "C"{
 #endif

 #include <stddef.h>
-#ifdef __AVX2__

 #include "algo/sha/sph_types.h"
 #include "avxdefs.h"
@@ -50,6 +49,10 @@ extern "C"{

 #define SPH_SIZE_bmw512   512

+#if defined(__SSE2__)
+
+// BMW-256 4 way 32
+
 typedef struct {
   __m128i buf[64];
   __m128i H[16];
@@ -59,6 +62,60 @@ typedef struct {

 typedef bmw_4way_small_context bmw256_4way_context;

+void bmw256_4way_init(void *cc);
+
+void bmw256_4way(void *cc, const void *data, size_t len);
+
+void bmw256_4way_close(void *cc, void *dst);
+
+void bmw256_4way_addbits_and_close(
+        void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif  // __SSE2__
+
+#if defined(__AVX2__)
+
+// BMW-256 8 way 32
+
+typedef struct {
+   __m256i buf[64];
+   __m256i H[16];
+   size_t ptr;
+   uint32_t bit_count;  // assume bit_count fits in 32 bits
+} bmw_8way_small_context __attribute__ ((aligned (64)));
+
+typedef bmw_8way_small_context bmw256_8way_context;
+
+void bmw256_8way_init( bmw256_8way_context *ctx );
+void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len );
+void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
+
+#endif
+
+
+#if defined(__SSE2__)
+
+// BMW-512 2 way 64
+
+typedef struct {
+   __m128i buf[16];
+   __m128i H[16];
+   size_t ptr;
+   uint64_t bit_count; 
+} bmw_2way_big_context __attribute__ ((aligned (64)));
+
+typedef bmw_2way_big_context bmw512_2way_context;
+
+void bmw512_2way_init( bmw512_2way_context *ctx );
+void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len );
+void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );
+
+#endif // __SSE2__
+
+#if defined(__AVX2__)
+
+// BMW-512 4 way 64
+
 typedef struct {
   __m256i buf[16];
   __m256i H[16];
@@ -68,14 +125,6 @@ typedef struct {

 typedef bmw_4way_big_context bmw512_4way_context;

-void bmw256_4way_init(void *cc);
-
-void bmw256_4way(void *cc, const void *data, size_t len);
-
-void bmw256_4way_close(void *cc, void *dst);
-
-void bmw256_4way_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);

 void bmw512_4way_init(void *cc);

@@ -86,10 +135,10 @@ void bmw512_4way_close(void *cc, void *dst);
 void bmw512_4way_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);

-#endif
+#endif  // __AVX2__

 #ifdef __cplusplus
 }
 #endif

-#endif
+#endif // BMW_HASH_H__
--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
--- a/algo/fugue/sph_fugue.c
+++ b/algo/fugue/sph_fugue.c
@@ -11,6 +11,8 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif

+#define SPH_FUGUE_NOCOPY 1
+
 static const sph_u32 IV224[] = {
 	SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c),
 	SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215),
--- a/algo/hodl/sha512_avx.c
+++ b/algo/hodl/sha512_avx.c
@@ -11,6 +11,10 @@
 #include <sys/endian.h>
 #endif 

+#if defined(__CYGWIN__)
+#include <endian.h>
+#endif
+
 #include "tmmintrin.h"
 #include "smmintrin.h"

--- a/algo/hodl/sha512_avx2.c
+++ b/algo/hodl/sha512_avx2.c
@@ -8,6 +8,10 @@
 #include <sys/endian.h>
 #endif 

+#if defined(__CYGWIN__)
+#include <endian.h>
+#endif
+
 #include "tmmintrin.h"
 #include "smmintrin.h"
 #include "immintrin.h"
--- a/algo/keccak/sse2/keccak.c
+++ b/algo/keccak/sse2/keccak.c
@@ -91,7 +91,7 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif

-
+/*
 static const sph_u64 RC[] = {
 	SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
 	SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
@@ -106,7 +106,7 @@ static const sph_u64 RC[] = {
 	SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
 	SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
 };
-
+*/
 #define kekDECL_STATE \
 	sph_u64 keca00, keca01, keca02, keca03, keca04; \
 	sph_u64 keca10, keca11, keca12, keca13, keca14; \
@@ -756,6 +756,20 @@ static const sph_u64 RC[] = {
 * tested faster saving space
 */
 #define KECCAK_F_1600_   do { \
+static const sph_u64 RC[] = { \
+        SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), \
+        SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), \
+        SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), \
+        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), \
+        SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), \
+        SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), \
+        SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), \
+        SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), \
+        SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), \
+        SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), \
+        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), \
+        SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) \
+}; \
 		int j; \
 		for (j = 0; j < 24; j += 4) { \
 			KF_ELT( 0,  1, RC[j + 0]); \
@@ -791,7 +805,7 @@ static const sph_u64 RC[] = {
 /* load initial constants */
 #define KEC_I 

-static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; 
+//static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; 
 /*
 unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
 */
@@ -799,6 +813,7 @@ static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0
 /* load hash for loop */
 #define KEC_U \
 do { \
+static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
    /*memcpy(hashbuf, hash, 64); */ \
    memcpy(hash + 64, keczword, 8); \
 } while (0); 
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -90,7 +90,7 @@ void allium_4way_hash( void *state, const void *input )
 }

 int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                             uint64_t *hashes_done )
+                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -100,40 +100,41 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 76; // 19*4
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   swab32_array( edata, pdata, 20 );
+   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   blake256_4way_init( &allium_4way_ctx.blake );
   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );

   do {
-     be32enc( noncep,   n   );
-     be32enc( noncep+1, n+1 );
-     be32enc( noncep+2, n+2 );
-     be32enc( noncep+3, n+3 );
+     *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );

     allium_4way_hash( hash, vdata );
     pdata[19] = n;

-     for ( int i = 0; i < 4; i++ )
-     if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+     for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
     {
-         pdata[19] = n+i;
-         nonces[ num_found++ ] = n+i;
-         work_set_target_ratio( work, hash+(i<<3) );
+        if ( fulltest( hash+(lane<<3), ptarget ) )
+        {
+           pdata[19] = n + lane;
+           submit_solution( work, hash+(lane<<3), mythr, lane );
+         }
     }
     n += 4;
-   } while ( (num_found == 0) && (n < max_nonce-4)
-                   && !work_restart[thr_id].restart);
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/lyra2/allium.c
+++ b/algo/lyra2/allium.c
@@ -70,7 +70,7 @@ void allium_hash(void *state, const void *input)
 }

 int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done )
+                     uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t _ALIGN(128) hash[8];
    uint32_t _ALIGN(128) endiandata[20];
@@ -80,6 +80,7 @@ int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t nonce = first_nonce;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

    if ( opt_benchmark )
        ptarget[7] = 0x3ffff;
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -1,6 +1,43 @@
 #include "lyra2-gate.h"


+// huge pages
+//
+// Use MAP_PRIVATE instead
+// In register algo:
+// replace thread safe whole matrix with a char**
+// alloc huge pages matrixsize * threads
+// make pointers to each thread's matrix, creating an
+// array[thread][matrix].
+// Each thread can create its own matrix pointer:
+//  my_matrix = the matrix + ( thread_id * matrix_size  )
+//
+// Compiler version check?
+// Fallback?
+//
+// create a generic utility to map & unmap huge pages.
+// ptr = malloc_huge( size );
+// Yespower wrapper checks for 64 byte alignment, seems unnecessary as
+// it should be aligned to the page boundary. It may be desirable to
+// have the matrix size rounded up if necessary to something bigger
+// than 64 byte, say 4 kbytes a small page size.
+
+// Define some constants for individual parameters and matrix size for
+// each algo. Use the parameter constants where appropriate.
+// Convert algos that don't yet do so to use dynamic allocation.
+// Alloc huge pages globally. If ok each thread will create a pointer to
+// its chunk. If fail each thread will use _mm_alloc for itself.
+// BLOCK_LEN_BYTES is 768.
+
+#define LYRA2REV3_NROWS 4
+#define LYRA2REV3_NCOLS 4
+/*
+#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
+                                                 (LYRA2REV3_NROWS)*8)
+*/
+
+#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)<<4)
+
 __thread uint64_t* l2v3_wholeMatrix;

 bool lyra2rev3_thread_init()
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -43,25 +43,25 @@ bool register_lyra2rev2_algo( algo_gate_t* gate );

 void lyra2rev2_4way_hash( void *state, const void *input );
 int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                             uint64_t *hashes_done );
+                             uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev2_4way_ctx();

 #else

 void lyra2rev2_hash( void *state, const void *input );
 int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done );
+                        uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev2_ctx();

 #endif

 /////////////////////////

-#if defined(__SSE4_2__)
+#if defined(__SSE2__)
  #define LYRA2Z_4WAY
 #endif
 #if defined(__AVX2__)
-//  #define LYRA2Z_8WAY
+  #define LYRA2Z_8WAY
 #endif


@@ -71,21 +71,21 @@ bool init_lyra2rev2_ctx();

 void lyra2z_8way_hash( void *state, const void *input );
 int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done );
+                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_8way_thread_init();

 #elif defined(LYRA2Z_4WAY)

 void lyra2z_4way_hash( void *state, const void *input );
 int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done );
+                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_4way_thread_init();

 #else

 void lyra2z_hash( void *state, const void *input );
 int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
+                     uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_thread_init();

 #endif
@@ -102,14 +102,14 @@ bool lyra2z_thread_init();

 void lyra2h_4way_hash( void *state, const void *input );
 int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done );
+                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2h_4way_thread_init();

 #else

 void lyra2h_hash( void *state, const void *input );
 int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
+                     uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2h_thread_init();

 #endif
@@ -126,14 +126,14 @@ bool register_allium_algo( algo_gate_t* gate );

 void allium_4way_hash( void *state, const void *input );
 int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done );
+                          uint64_t *hashes_done, struct thr_info *mythr );
 bool init_allium_4way_ctx();

 #else

 void allium_hash( void *state, const void *input );
 int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
+                     uint64_t *hashes_done, struct thr_info *mythr );
 bool init_allium_ctx();

 #endif 
@@ -146,7 +146,7 @@ bool register_phi2_algo( algo_gate_t* gate );

 void phi2_hash( void *state, const void *input );
 int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
+                     uint64_t *hashes_done, struct thr_info *mythr );
 void init_phi2_ctx();

 #endif  // LYRA2_GATE_H__
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -236,7 +236,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   //Tries to allocate enough space for the whole memory matrix

   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+//   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
 /*
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -50,7 +50,7 @@ void lyra2h_4way_hash( void *state, const void *input )
 }

 int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done )
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -63,6 +63,7 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *nonces = work->nonces;
   int num_found = 0;
   uint32_t *noncep= vdata + 76; // 19*4
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;
--- a/algo/lyra2/lyra2h.c
+++ b/algo/lyra2/lyra2h.c
@@ -36,7 +36,7 @@ void lyra2h_hash( void *state, const void *input )
 }

 int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done )
+                    uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(64) hash[8];
 	uint32_t _ALIGN(64) endiandata[20];
@@ -45,6 +45,7 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

 	if (opt_benchmark)
 		ptarget[7] = 0x0000ff;
--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -81,8 +81,8 @@ void lyra2re_hash(void *state, const void *input)
 	memcpy(state, hashA, 32);
 }

-int scanhash_lyra2re(int thr_id, struct work *work,
-	uint32_t max_nonce,	uint64_t *hashes_done)
+int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce,
+	              uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -91,6 +91,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
        const uint32_t Htarg = ptarget[7];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

        swab32_array( endiandata, pdata, 20 );

--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -82,7 +82,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
 }

 int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                             uint64_t *hashes_done )
+                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -95,6 +95,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *nonces = work->nonces;
   int num_found = 0;
   uint32_t *noncep = vdata + 76; // 19*4
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -73,7 +73,7 @@ void lyra2rev2_hash( void *state, const void *input )
 }

 int scanhash_lyra2rev2(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -82,6 +82,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
        const uint32_t Htarg = ptarget[7];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -74,7 +74,6 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
-   int num_found = 0;
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   
@@ -104,13 +103,7 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
         if ( fulltest( lane_hash, ptarget ) )
         {
              pdata[19] = n + lane;    
-              work_set_target_ratio( work, lane_hash );
-              if ( submit_work( mythr, work ) )
-                applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-		             accepted_share_count + rejected_share_count + 1,
-			     thr_id, lane );
-              else
-                applog( LOG_WARNING, "Failed to submit share." );
+              submit_solution( work, lane_hash, mythr, lane );
 	 }
      }
      n += 4;
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -50,7 +50,7 @@ void lyra2z_4way_hash( void *state, const void *input )
 }

 int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done )
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -60,25 +60,23 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 76; // 19*4
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;

-   for ( int i=0; i < 20; i++ )
-      be32enc( &edata[i], pdata[i] );
-
+   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

   lyra2z_4way_midstate( vdata );

   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+1, n+1 );
-      be32enc( noncep+2, n+2 );
-      be32enc( noncep+3, n+3 );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      lyra2z_4way_hash( hash, vdata );
      pdata[19] = n;
@@ -87,15 +85,19 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
          pdata[19] = n+i;         
-          nonces[ num_found++ ] = n+i;
          work_set_target_ratio( work, hash+(i<<3) );
+          if ( submit_work( mythr, work ) )
+              applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
+                             accepted_share_count + rejected_share_count + 1,
+                             thr_id, i );
+          else
+              applog( LOG_WARNING, "Failed to submit share." );
      }
      n += 4;
-   } while ( (num_found == 0) && (n < max_nonce-4)
-                   && !work_restart[thr_id].restart);
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
@@ -150,14 +152,14 @@ void lyra2z_8way_hash( void *state, const void *input )
     memcpy( state+ 32, hash1, 32 );
     memcpy( state+ 64, hash2, 32 );
     memcpy( state+ 96, hash3, 32 );
-     memcpy( state+128, hash1, 32 );
-     memcpy( state+160, hash2, 32 );
-     memcpy( state+192, hash3, 32 );
-     memcpy( state+224, hash1, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
 }

 int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done )
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
@@ -167,15 +169,15 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 152; // 19*8
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;

-   for ( int i=0; i < 19; i++ )
-      be32enc( &edata[i], pdata[i] );
+   casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );

   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
                                 edata, edata, edata, edata, 640 );
@@ -183,15 +185,8 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
   lyra2z_8way_midstate( vdata );

   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+1, n+1 );
-      be32enc( noncep+2, n+2 );
-      be32enc( noncep+3, n+3 );
-      be32enc( noncep+4, n+4 );
-      be32enc( noncep+5, n+5 );
-      be32enc( noncep+6, n+6 );
-      be32enc( noncep+7, n+7 );
-
+      *noncev = mm256_bswap_32(
+                 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
      lyra2z_8way_hash( hash, vdata );
      pdata[19] = n;

@@ -199,15 +194,13 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
          pdata[19] = n+i;         
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;
-   } while ( (num_found == 0) && (n < max_nonce-4)
-                   && !work_restart[thr_id].restart);
+   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }


--- a/algo/lyra2/lyra2z.c
+++ b/algo/lyra2/lyra2z.c
@@ -44,7 +44,7 @@ void lyra2z_hash( void *state, const void *input )
 }

 int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done )
+                    uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(64) hash[8];
 	uint32_t _ALIGN(64) endiandata[20];
@@ -53,6 +53,7 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

 	if (opt_benchmark)
 		ptarget[7] = 0x0000ff;
--- a/algo/lyra2/lyra2z330.c
+++ b/algo/lyra2/lyra2z330.c
@@ -16,39 +16,43 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
 }

 int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done )
+                        uint64_t *hashes_done, struct thr_info *mythr )
 {
-	uint32_t hash[8] __attribute__ ((aligned (64))); 
-	uint32_t endiandata[20] __attribute__ ((aligned (64)));
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-	uint32_t nonce = first_nonce;
-	if (opt_benchmark)
-		ptarget[7] = 0x0000ff;
+   uint32_t hash[8] __attribute__ ((aligned (64))); 
+   uint32_t endiandata[20] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t nonce = first_nonce;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

-	for (int i=0; i < 19; i++) {
-		be32enc(&endiandata[i], pdata[i]);
-	}
+   if (opt_benchmark)
+	ptarget[7] = 0x0000ff;

-	do {
-		be32enc(&endiandata[19], nonce);
-		lyra2z330_hash( hash, endiandata, work->height );
-
-		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
-			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-			return 1;
-		}
-		nonce++;
-
-	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-	pdata[19] = nonce;
-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
+   for (int i=0; i < 19; i++)
+      be32enc(&endiandata[i], pdata[i]);
+        
+   do
+   {
+      be32enc(&endiandata[19], nonce);
+      lyra2z330_hash( hash, endiandata, work->height );
+      if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
+      {
+         work_set_target_ratio(work, hash);
+         pdata[19] = nonce;
+         if ( submit_work( mythr, work ) )
+             applog( LOG_NOTICE, "Share %d submitted by thread %d",
+                     accepted_share_count + rejected_share_count + 1,
+                     mythr->id );
+         else
+             applog( LOG_WARNING, "Failed to submit share." );
+      }
+      nonce++;
+   } while (nonce < max_nonce && !work_restart[thr_id].restart);
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
 }

 void lyra2z330_set_target( struct work* work, double job_diff )
--- a/algo/lyra2/phi2.c
+++ b/algo/lyra2/phi2.c
@@ -92,42 +92,50 @@ void phi2_hash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-int scanhash_phi2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
+	           uint64_t *hashes_done, struct thr_info *mythr )
 {
-	uint32_t _ALIGN(128) hash[8];
-	uint32_t _ALIGN(128) endiandata[36];
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
+   uint32_t _ALIGN(128) hash[8];
+   uint32_t _ALIGN(128) endiandata[36];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-	uint32_t n = first_nonce;
+   if(opt_benchmark){
+   	ptarget[7] = 0x00ff;
+   }

-	if(opt_benchmark){
-		ptarget[7] = 0x00ff;
-	}
+   phi2_has_roots = false;
+   for ( int i=0; i < 36; i++ )
+   {
+	be32enc(&endiandata[i], pdata[i]);
+	if (i >= 20 && pdata[i]) phi2_has_roots = true;
+   }

-	phi2_has_roots = false;
-	for (int i=0; i < 36; i++) {
-		be32enc(&endiandata[i], pdata[i]);
-		if (i >= 20 && pdata[i]) phi2_has_roots = true;
-	}
+   do {
+	be32enc( &endiandata[19], n );
+	phi2_hash( hash, endiandata );

-	do {
-		be32enc(&endiandata[19], n);
-		phi2_hash(hash, endiandata);
-
-		if (hash[7] < Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
+	if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
+       	{
+           pdata[19] = n;
+           work_set_target_ratio( work, hash );
+           if ( submit_work( mythr, work ) )
+               applog( LOG_NOTICE, "Share %d submitted by thread %d.",
+                            accepted_share_count + rejected_share_count + 1,
+                            thr_id );
+           else
+               applog( LOG_WARNING, "Failed to submit share." );
 			*hashes_done = n - first_nonce + 1;
-			pdata[19] = n;
-			return 1;
-		}
-		n++;
+	}
+	n++;

-	} while (n < max_nonce && !work_restart[thr_id].restart);
+   } while ( n < max_nonce && !work_restart[thr_id].restart );

-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-	return 0;
+   *hashes_done = n - first_nonce + 1;
+   pdata[19] = n;
+   return 0;
 }
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -108,7 +108,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_rol1x64_256( s2, s3 ); \
+   mm128_ror1x64_256( s2, s3 ); \
   mm128_swap128_256( s4, s5 ); \
   mm128_rol1x64_256( s6, s7 ); \
   G_2X64( s0, s2, s4, s6 ); \
@@ -132,7 +132,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \


-#endif // AVX2
+#endif // AVX2 else SSE2

 // Scalar
 //Blake2b's G function
--- a/algo/sha/sha2-hash-4way.c
+++ b/algo/sha/sha2-hash-4way.c
@@ -30,7 +30,7 @@
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */

-#if defined(__SSE4_2__)
+#if defined(__SSE2__)

 #include <stddef.h>
 #include <string.h>
@@ -716,4 +716,4 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
 }

 #endif  // __AVX2__
-#endif  // __SSE4_2__
+#endif  // __SSE2__
--- a/algo/sha/sha2-hash-4way.h
+++ b/algo/sha/sha2-hash-4way.h
@@ -44,7 +44,8 @@
 #include "sph_types.h"
 #include "avxdefs.h"

-#if defined(__SSE4_2__)
+#if defined(__SSE2__)
+//#if defined(__SSE4_2__)

 //#define SPH_SIZE_sha256   256

@@ -60,6 +61,26 @@ void sha256_4way_init( sha256_4way_context *sc );
 void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
 void sha256_4way_close( sha256_4way_context *sc, void *dst );

+/*
+// SHA-256 7 way hybrid
+// Combines SSE, MMX and scalar data to do 4 + 2 + 1 parallel.
+typedef struct {
+   __m128i  bufx[64>>2];
+   __m128i  valx[8];
+   __m64    bufy[64>>2];
+   __m64    valy[8];
+   uint32_t bufz[64>>2];
+   uint32_t valz[8];
+   uint32_t count_high, count_low;
+} sha256_7way_context;
+
+void sha256_7way_init( sha256_7way_context *ctx );
+void sha256_7way( sha256_7way_context *ctx, const void *datax,
+                         void *datay, void *dataz, size_t len );
+void sha256_7way_close( sha256_7way_context *ctx, void *dstx, void *dstyx,
+                         void *dstz  );
+*/
+
 #if defined (__AVX2__)

 // SHA-256 8 way
@@ -88,6 +109,24 @@ void sha512_4way_init( sha512_4way_context *sc);
 void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
 void sha512_4way_close( sha512_4way_context *sc, void *dst );

-#endif
-#endif
-#endif
+// SHA-256 11 way hybrid
+// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
+typedef struct {
+   __m256i  bufx[64>>2];
+   __m256i  valx[8];
+   __m64    bufy[64>>2];
+   __m64    valy[8];
+   uint32_t bufz[64>>2];
+   uint32_t valz[8];
+   uint32_t count_high, count_low;
+} sha256_11way_context;
+
+void sha256_11way_init( sha256_11way_context *ctx );
+void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
+	                 const void *datay, const void *dataz, size_t len );
+void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
+	                 void *dstz  );
+
+#endif  // __AVX2__
+#endif  // __SSE2__
+#endif  // SHA256_4WAY_H__
--- a/algo/sha/sha256_hash_11way.c
+++ b/algo/sha/sha256_hash_11way.c
@@ -0,0 +1,536 @@
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sha2-hash-4way.h"
+
+#if defined(__AVX2__)
+
+// naming convention for variables and macros
+// VARx: AVX2 8 way 32 bit
+// VARy: MMX 2 way 32 bit
+// VARz: scalar integer 32 bit
+
+
+// Initial hash value H(0) of SHA-256 (FIPS 180-4, section 5.3.3).
+static const uint32_t H256[8] = {
+   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+};
+
+// SHA-256 round constants K[0..63] (FIPS 180-4, section 4.2.2).
+static const uint32_t K256[64] = {
+   0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
+   0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+   0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
+   0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+   0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
+   0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+   0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
+   0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+   0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
+   0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+   0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
+   0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+   0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
+   0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+   0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
+   0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
+};
+
+// Ch(X,Y,Z) = (X & Y) ^ (~X & Z), in the equivalent form Z ^ (X & (Y ^ Z)).
+#define CHx(X, Y, Z) \
+   _mm256_xor_si256( Z, _mm256_and_si256( X, _mm256_xor_si256( Y, Z ) ) )
+
+#define CHy(X, Y, Z) \
+   _mm_xor_si64( Z, _mm_and_si64( X, _mm_xor_si64( Y, Z ) ) )
+#define CHz(X, Y, Z)  ( (Z) ^ ( (X) & ( (Y) ^ (Z) ) ) )
+
+// Maj(X,Y,Z), the bitwise majority, written as (X & Y) | ((X | Y) & Z).
+#define MAJx(X, Y, Z) \
+   _mm256_or_si256( _mm256_and_si256( Z, _mm256_or_si256( X, Y ) ), \
+                    _mm256_and_si256( X, Y ) )
+
+#define MAJy(X, Y, Z) \
+   _mm_or_si64( _mm_and_si64( Z, _mm_or_si64( X, Y ) ), \
+                _mm_and_si64( X, Y ) )
+
+#define MAJz(X, Y, Z)  ( ( ( (X) | (Y) ) & (Z) ) | ( (X) & (Y) ) )
+
+// Big sigma 0 of SHA-256: ROTR 2 ^ ROTR 13 ^ ROTR 22. All three terms are
+// rotations (FIPS 180-4, section 4.1.2); only the small sigmas use a shift.
+#define BSG2_0x(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_ror_32(x,2), mm256_ror_32(x,13) ), mm256_ror_32(x,22) )
+#define BSG2_0y(x) \
+   _mm_xor_si64( _mm_xor_si64( \
+       mm64_ror_32(x,2), mm64_ror_32(x,13) ), mm64_ror_32(x,22) )
+#define BSG2_0z(x)  ( ror_32(x,2) ^ ror_32(x,13) ^ ror_32(x,22) )
+
+// Big sigma 1 of SHA-256: ROTR 6 ^ ROTR 11 ^ ROTR 25, again rotations only.
+// A logical shift in the last term would drop the wrapped around bits.
+#define BSG2_1x(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_ror_32(x,6), mm256_ror_32(x,11) ), mm256_ror_32(x,25) )
+#define BSG2_1y(x) \
+   _mm_xor_si64( _mm_xor_si64( \
+       mm64_ror_32(x,6), mm64_ror_32(x,11) ), mm64_ror_32(x,25) )
+#define BSG2_1z(x)  ( ror_32(x,6) ^ ror_32(x,11) ^ ror_32(x,25) )
+
+// Small sigma 0 of SHA-256, used by the message schedule:
+// ROTR 7 ^ ROTR 18 ^ SHR 3.
+#define SSG2_0x(x) \
+   _mm256_xor_si256( mm256_ror_32(x,7), _mm256_xor_si256( \
+       mm256_ror_32(x,18), _mm256_srli_epi32(x,3) ) )
+#define SSG2_0y(x) \
+   _mm_xor_si64( mm64_ror_32(x,7), _mm_xor_si64( \
+       mm64_ror_32(x,18), _mm_srli_pi32(x,3) ) )
+#define SSG2_0z(x)  ( ror_32(x,7) ^ ( ror_32(x,18) ^ ((x)>>3) ) )
+
+// Small sigma 1 of SHA-256, used by the message schedule:
+// ROTR 17 ^ ROTR 19 ^ SHR 10.
+#define SSG2_1x(x) \
+   _mm256_xor_si256( mm256_ror_32(x,17), _mm256_xor_si256( \
+       mm256_ror_32(x,19), _mm256_srli_epi32(x,10) ) )
+#define SSG2_1y(x) \
+   _mm_xor_si64( mm64_ror_32(x,17), _mm_xor_si64( \
+       mm64_ror_32(x,19), _mm_srli_pi32(x,10) ) )
+#define SSG2_1z(x)  ( ror_32(x,17) ^ ( ror_32(x,19) ^ ((x)>>10) ) )
+
+// Message schedule step: sigma1( W[a] ) + W[b] + sigma0( W[c] ) + W[d].
+#define SHA2x_MEXP( a, b, c, d ) \
+   _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( Wx[a] ), Wx[b] ), \
+                     _mm256_add_epi32( SSG2_0x( Wx[c] ), Wx[d] ) )
+#define SHA2y_MEXP( a, b, c, d ) \
+   _mm_add_pi32( _mm_add_pi32( SSG2_1y( Wy[a] ), Wy[b] ), \
+                 _mm_add_pi32( SSG2_0y( Wy[c] ), Wy[d] ) )
+
+#define SHA2z_MEXP( a, b, c, d ) \
+   ( ( SSG2_1z( Wz[a] ) + Wz[b] ) + ( SSG2_0z( Wz[c] ) + Wz[d] ) )
+
+
+// One SHA-256 round for all 11 lanes (x: AVX2, y: MMX, z: scalar) using
+// message word W[i] and round constant K256[j+i]. Only D and H are written.
+#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
+                          Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
+                          Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
+do { \
+  const uint32_t Kz = K256[ (j)+(i) ]; \
+  const __m256i T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
+        _mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
+                          _mm256_set1_epi32( Kz ) ), Wx[i] ); \
+  const __m64 T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
+        _mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
+                          _mm_set1_pi32( Kz ) ), Wy[i] ); \
+  const uint32_t T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + Kz + Wz[i]; \
+  const __m256i T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
+  const __m64 T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
+  const uint32_t T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
+  Dx  = _mm256_add_epi32( Dx, T1x ); \
+  Dy  = _mm_add_pi32( Dy, T1y ); \
+  Dz  = Dz + T1z; \
+  Hx  = _mm256_add_epi32( T1x, T2x ); \
+  Hy  = _mm_add_pi32( T1y, T2y ); \
+  Hz  = T1z + T2z; \
+} while (0)
+	
+// Compress one 64 byte block per lane into the chaining state of all 11
+// lanes. inx/rx carry 8 interleaved AVX2 lanes, iny/ry 2 interleaved MMX
+// lanes and inz/rz the single scalar lane. Each in* array supplies 16
+// message words which are byte swapped on load; each r* array holds the 8
+// state words and is updated in place. The three lane groups are processed
+// side by side and never mix.
+void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
+                         uint32_t *inz, uint32_t rz[8] )
+{
+   __m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
+   __m256i Wx[16];
+   __m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy;
+   __m64 Wy[16];
+   uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz;
+   uint32_t Wz[16];
+
+   // Load the 16 message words of the block, byte swapped, for every lane.
+   Wx[ 0] = mm256_bswap_32( inx[ 0] );
+   Wy[ 0] =  mm64_bswap_32( iny[ 0] );
+   Wz[ 0] =       bswap_32( inz[ 0] );
+   Wx[ 1] = mm256_bswap_32( inx[ 1] );
+   Wy[ 1] =  mm64_bswap_32( iny[ 1] );
+   Wz[ 1] =       bswap_32( inz[ 1] );
+   Wx[ 2] = mm256_bswap_32( inx[ 2] );
+   Wy[ 2] =  mm64_bswap_32( iny[ 2] );
+   Wz[ 2] =       bswap_32( inz[ 2] );
+   Wx[ 3] = mm256_bswap_32( inx[ 3] );
+   Wy[ 3] =  mm64_bswap_32( iny[ 3] );
+   Wz[ 3] =       bswap_32( inz[ 3] );
+   Wx[ 4] = mm256_bswap_32( inx[ 4] );
+   Wy[ 4] =  mm64_bswap_32( iny[ 4] );
+   Wz[ 4] =       bswap_32( inz[ 4] );
+   Wx[ 5] = mm256_bswap_32( inx[ 5] );
+   Wy[ 5] =  mm64_bswap_32( iny[ 5] );
+   Wz[ 5] =       bswap_32( inz[ 5] );
+   Wx[ 6] = mm256_bswap_32( inx[ 6] );
+   Wy[ 6] =  mm64_bswap_32( iny[ 6] );
+   Wz[ 6] =       bswap_32( inz[ 6] );
+   Wx[ 7] = mm256_bswap_32( inx[ 7] );
+   Wy[ 7] =  mm64_bswap_32( iny[ 7] );
+   Wz[ 7] =       bswap_32( inz[ 7] );
+   Wx[ 8] = mm256_bswap_32( inx[ 8] );
+   Wy[ 8] =  mm64_bswap_32( iny[ 8] );
+   Wz[ 8] =       bswap_32( inz[ 8] );
+   Wx[ 9] = mm256_bswap_32( inx[ 9] );
+   Wy[ 9] =  mm64_bswap_32( iny[ 9] );
+   Wz[ 9] =       bswap_32( inz[ 9] );
+   Wx[10] = mm256_bswap_32( inx[10] );
+   Wy[10] =  mm64_bswap_32( iny[10] );
+   Wz[10] =       bswap_32( inz[10] );
+   Wx[11] = mm256_bswap_32( inx[11] );
+   Wy[11] =  mm64_bswap_32( iny[11] );
+   Wz[11] =       bswap_32( inz[11] );
+   Wx[12] = mm256_bswap_32( inx[12] );
+   Wy[12] =  mm64_bswap_32( iny[12] );
+   Wz[12] =       bswap_32( inz[12] );
+   Wx[13] = mm256_bswap_32( inx[13] );
+   Wy[13] =  mm64_bswap_32( iny[13] );
+   Wz[13] =       bswap_32( inz[13] );
+   Wx[14] = mm256_bswap_32( inx[14] );
+   Wy[14] =  mm64_bswap_32( iny[14] );
+   Wz[14] =       bswap_32( inz[14] );
+   Wx[15] = mm256_bswap_32( inx[15] );
+   Wy[15] =  mm64_bswap_32( iny[15] );
+   Wz[15] =       bswap_32( inz[15] );
+
+   // The working variables start from the current chaining state.
+   Ax = rx[0];     Ay = ry[0];     Az = rz[0];
+   Bx = rx[1];     By = ry[1];     Bz = rz[1];
+   Cx = rx[2];     Cy = ry[2];     Cz = rz[2];
+   Dx = rx[3];     Dy = ry[3];     Dz = rz[3];
+   Ex = rx[4];     Ey = ry[4];     Ez = rz[4];
+   Fx = rx[5];     Fy = ry[5];     Fz = rz[5];
+   Gx = rx[6];     Gy = ry[6];     Gz = rz[6];
+   Hx = rx[7];     Hy = ry[7];     Hz = rz[7];
+
+   // Rounds 0..15 consume the loaded message words directly.
+   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
+                     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
+                     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  0, 0 );
+   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
+                     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
+                     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  1, 0 );
+   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
+                     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
+                     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz,  2, 0 );
+   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
+                     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
+                     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez,  3, 0 );
+   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
+                     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
+                     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz,  4, 0 );
+   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
+                     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
+                     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz,  5, 0 );
+   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
+                     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
+                     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz,  6, 0 );
+   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
+                     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
+                     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az,  7, 0 );
+   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
+                     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
+                     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  8, 0 );
+   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
+                     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
+                     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  9, 0 );
+   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
+                     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
+                     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, 0 );
+   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
+                     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
+                     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, 0 );
+   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
+                     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
+                     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, 0 );
+   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
+                     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
+                     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, 0 );
+   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
+                     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
+                     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, 0 );
+   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
+                     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
+                     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, 0 );
+
+   // Rounds 16..63: expand the message schedule in place, 16 words at a
+   // time, then run 16 rounds with the round constants starting at K256[j].
+   for ( int j = 16; j < 64; j += 16 )
+   {
+      Wx[ 0] = SHA2x_MEXP( 14,  9,  1,  0 );
+      Wy[ 0] = SHA2y_MEXP( 14,  9,  1,  0 );
+      Wz[ 0] = SHA2z_MEXP( 14,  9,  1,  0 );
+
+      Wx[ 1] = SHA2x_MEXP( 15, 10,  2,  1 );
+      Wy[ 1] = SHA2y_MEXP( 15, 10,  2,  1 );
+      Wz[ 1] = SHA2z_MEXP( 15, 10,  2,  1 );
+
+      Wx[ 2] = SHA2x_MEXP(  0, 11,  3,  2 );
+      Wy[ 2] = SHA2y_MEXP(  0, 11,  3,  2 );
+      Wz[ 2] = SHA2z_MEXP(  0, 11,  3,  2 );
+
+      Wx[ 3] = SHA2x_MEXP(  1, 12,  4,  3 );
+      Wy[ 3] = SHA2y_MEXP(  1, 12,  4,  3 );
+      Wz[ 3] = SHA2z_MEXP(  1, 12,  4,  3 );
+
+      Wx[ 4] = SHA2x_MEXP(  2, 13,  5,  4 );
+      Wy[ 4] = SHA2y_MEXP(  2, 13,  5,  4 );
+      Wz[ 4] = SHA2z_MEXP(  2, 13,  5,  4 );
+
+      Wx[ 5] = SHA2x_MEXP(  3, 14,  6,  5 );
+      Wy[ 5] = SHA2y_MEXP(  3, 14,  6,  5 );
+      Wz[ 5] = SHA2z_MEXP(  3, 14,  6,  5 );
+
+      Wx[ 6] = SHA2x_MEXP(  4, 15,  7,  6 );
+      Wy[ 6] = SHA2y_MEXP(  4, 15,  7,  6 );
+      Wz[ 6] = SHA2z_MEXP(  4, 15,  7,  6 );
+
+      Wx[ 7] = SHA2x_MEXP(  5,  0,  8,  7 );
+      Wy[ 7] = SHA2y_MEXP(  5,  0,  8,  7 );
+      Wz[ 7] = SHA2z_MEXP(  5,  0,  8,  7 );
+
+      Wx[ 8] = SHA2x_MEXP(  6,  1,  9,  8 );
+      Wy[ 8] = SHA2y_MEXP(  6,  1,  9,  8 );
+      Wz[ 8] = SHA2z_MEXP(  6,  1,  9,  8 );
+
+      Wx[ 9] = SHA2x_MEXP(  7,  2, 10,  9 );
+      Wy[ 9] = SHA2y_MEXP(  7,  2, 10,  9 );
+      Wz[ 9] = SHA2z_MEXP(  7,  2, 10,  9 );
+
+      Wx[10] = SHA2x_MEXP(  8,  3, 11, 10 );
+      Wy[10] = SHA2y_MEXP(  8,  3, 11, 10 );
+      Wz[10] = SHA2z_MEXP(  8,  3, 11, 10 );
+
+      Wx[11] = SHA2x_MEXP(  9,  4, 12, 11 );
+      Wy[11] = SHA2y_MEXP(  9,  4, 12, 11 );
+      Wz[11] = SHA2z_MEXP(  9,  4, 12, 11 );
+
+      Wx[12] = SHA2x_MEXP( 10,  5, 13, 12 );
+      Wy[12] = SHA2y_MEXP( 10,  5, 13, 12 );
+      Wz[12] = SHA2z_MEXP( 10,  5, 13, 12 );
+
+      Wx[13] = SHA2x_MEXP( 11,  6, 14, 13 );
+      Wy[13] = SHA2y_MEXP( 11,  6, 14, 13 );
+      Wz[13] = SHA2z_MEXP( 11,  6, 14, 13 );
+
+      Wx[14] = SHA2x_MEXP( 12,  7, 15, 14 );
+      Wy[14] = SHA2y_MEXP( 12,  7, 15, 14 );
+      Wz[14] = SHA2z_MEXP( 12,  7, 15, 14 );
+
+      Wx[15] = SHA2x_MEXP( 13,  8,  0, 15 );
+      Wy[15] = SHA2y_MEXP( 13,  8,  0, 15 );
+      Wz[15] = SHA2z_MEXP( 13,  8,  0, 15 );
+
+
+      SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
+                        Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
+                        Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  0, j );
+      SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
+                        Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
+                        Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  1, j );
+      SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
+                        Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
+                        Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz,  2, j );
+      SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
+                        Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
+                        Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez,  3, j );
+      SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
+                        Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
+                        Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz,  4, j );
+      SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
+                        Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
+                        Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz,  5, j );
+      SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
+                        Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
+                        Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz,  6, j );
+      SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
+                        By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
+                        Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az,  7, j );
+      SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
+                        Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
+                        Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  8, j );
+      SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
+                        Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
+                        Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  9, j );
+      SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
+                        Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
+                        Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j );
+      SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
+                        Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
+                        Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j );
+      SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
+                        Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
+                        Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j );
+      SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
+                        Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
+                        Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j );
+      SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
+                        Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
+                        Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j );
+      SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
+                        By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
+                        Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j );
+   }
+
+   // Feed forward: add the working variables back into the chaining state.
+   // All eight words of all three lane groups must be updated.
+   rx[0] = _mm256_add_epi32( rx[0], Ax );
+   ry[0] =     _mm_add_pi32( ry[0], Ay );
+   rz[0] =                   rz[0]+ Az;
+   rx[1] = _mm256_add_epi32( rx[1], Bx );
+   ry[1] =     _mm_add_pi32( ry[1], By );
+   rz[1] =                   rz[1]+ Bz;
+   rx[2] = _mm256_add_epi32( rx[2], Cx );
+   ry[2] =     _mm_add_pi32( ry[2], Cy );
+   rz[2] =                   rz[2]+ Cz;
+   rx[3] = _mm256_add_epi32( rx[3], Dx );
+   ry[3] =     _mm_add_pi32( ry[3], Dy );
+   rz[3] =                   rz[3]+ Dz;
+   rx[4] = _mm256_add_epi32( rx[4], Ex );
+   ry[4] =     _mm_add_pi32( ry[4], Ey );
+   rz[4] =                   rz[4]+ Ez;
+   rx[5] = _mm256_add_epi32( rx[5], Fx );
+   ry[5] =     _mm_add_pi32( ry[5], Fy );
+   rz[5] =                   rz[5]+ Fz;
+   rx[6] = _mm256_add_epi32( rx[6], Gx );
+   ry[6] =     _mm_add_pi32( ry[6], Gy );
+   rz[6] =                   rz[6]+ Gz;
+   rx[7] = _mm256_add_epi32( rx[7], Hx );
+   ry[7] =     _mm_add_pi32( ry[7], Hy );
+   rz[7] =                   rz[7]+ Hz;
+}
+
+// Reset an 11 way context: clear the byte counter and seed every lane of
+// the AVX2 (valx), MMX (valy) and scalar (valz) chaining state with the
+// SHA-256 initial hash value.
+void sha256_11way_init( sha256_11way_context *ctx )
+{
+   ctx->count_high = 0;
+   ctx->count_low  = 0;
+
+   // State word i is seeded with H256[i]. Broadcasting H256[0] into all
+   // eight words would start the vector lanes from a wrong state and make
+   // them disagree with the scalar lane, which is copied from the table.
+   for ( int i = 0; i < 8; i++ )
+   {
+      ctx->valx[i] = _mm256_set1_epi32( H256[i] );
+      ctx->valy[i] =     _mm_set1_pi32( H256[i] );
+   }
+
+   // The scalar lane takes a plain copy of the eight table words.
+   memcpy( ctx->valz, H256, 32 );
+}
+
+
+
+// Absorb len bytes per lane. datax holds 8 interleaved AVX2 lanes, datay 2
+// interleaved MMX lanes and dataz the scalar lane; len is the per lane size.
+void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
+	                  const void *datay, const void *dataz, size_t len )
+{
+   __m256i  *vdatax = (__m256i*) datax;
+    __m64   *vdatay = (__m64*)   datay;
+   uint32_t *idataz = (uint32_t*)dataz;
+   const int buf_size = 64;
+   size_t ptr = (unsigned)ctx->count_low & (buf_size - 1U);
+
+   while ( len > 0 )
+   {
+      size_t clen = buf_size - ptr;
+      if ( clen > len )
+         clen = len;
+      // ptr and clen are bytes; buffers and inputs are indexed in words.
+      memcpy_256( ctx->bufx + (ptr>>2), vdatax, clen>>2 );
+      memcpy_64 ( ctx->bufy + (ptr>>2), vdatay, clen>>2 );
+      memcpy    ( ctx->bufz + (ptr>>2), idataz, clen    );
+      vdatax += clen>>2;
+      vdatay += clen>>2;
+      idataz += clen>>2;
+      ptr += clen;
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+         sha256_11way_round( ctx->bufx, ctx->valx,
+                             ctx->bufy, ctx->valy,
+                             ctx->bufz, ctx->valz );
+         ptr = 0;
+      }
+      ctx->count_low += clen;
+      if ( ctx->count_low < clen )
+         ctx->count_high++;
+   }
+}
+
+
+// Pad and finish all 11 lanes, then store the byte swapped digests:
+// 8 interleaved lanes to dstx, 2 interleaved lanes to dsty, 1 lane to dstz.
+void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
+	                                            void *dstz)
+{
+    unsigned ptr, u;
+    uint32_t low, high;
+    const int buf_size = 64;
+    const int pad = buf_size - 8;
+
+    ptr = (unsigned)ctx->count_low & (buf_size - 1U);
+    ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
+    ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 );
+    ctx->bufz[ ptr>>2 ] = 0x80;
+    ptr += 4;
+
+    // The vector helpers are passed 32 bit word counts, memset() takes a
+    // byte count, so the scalar lane is cleared with the unshifted length.
+    if ( ptr > pad )
+    {
+         memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
+         memset_zero_64(  ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
+         memset(          ctx->bufz + (ptr>>2), 0, buf_size - ptr );
+         sha256_11way_round( ctx->bufx, ctx->valx,
+                             ctx->bufy, ctx->valy,
+                             ctx->bufz, ctx->valz );
+         memset_zero_256( ctx->bufx, pad >> 2 );
+         memset_zero_64(  ctx->bufy, pad >> 2 );
+         memset(          ctx->bufz, 0, pad );
+    }
+    else
+    {
+        memset_zero_256( ctx->bufx + (ptr>>2),    (pad - ptr) >> 2 );
+        memset_zero_64(  ctx->bufy + (ptr>>2),    (pad - ptr) >> 2 );
+        memset(          ctx->bufz + (ptr>>2), 0,  pad - ptr );
+    }
+
+    low = ctx->count_low;
+    high = (ctx->count_high << 3) | (low >> 29);
+    low = low << 3;
+
+    ctx->bufx[ pad >> 2 ] =
+                 mm256_bswap_32( _mm256_set1_epi32( high ) );
+    ctx->bufy[ pad >> 2 ] =
+                 mm64_bswap_32( _mm_set1_pi32( high ) );
+    ctx->bufz[ pad >> 2 ] = bswap_32( high );
+    ctx->bufx[ ( pad+4 ) >> 2 ] =
+                 mm256_bswap_32( _mm256_set1_epi32( low ) );
+    ctx->bufy[ ( pad+4 ) >> 2 ] =
+                 mm64_bswap_32( _mm_set1_pi32( low ) );
+    ctx->bufz[ ( pad+4 ) >> 2 ] = bswap_32( low );
+
+    sha256_11way_round( ctx->bufx, ctx->valx,
+                        ctx->bufy, ctx->valy,
+                        ctx->bufz, ctx->valz );
+
+    for ( u = 0; u < 8; u ++ )
+    {
+       casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] );
+       casti_m64  ( dsty, u ) =  mm64_bswap_32( ctx->valy[u] );
+       ((uint32_t*)dstz)[u] = bswap_32( ctx->valz[u] );
+    }
+}
+
+#endif
--- a/algo/sha/sha256q-4way.c
+++ b/algo/sha/sha256q-4way.c
@@ -0,0 +1,219 @@
+#include "sha256t-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "sha2-hash-4way.h"
+
+#if defined(SHA256T_8WAY)
+
+static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
+
+// Quadruple SHA-256 for 8 interleaved lanes. Resumes from the sha256_ctx8
+// midstate, absorbs the 16 byte tail of the input, then rehashes 3 times.
+void sha256q_8way_hash( void* output, const void* input )
+{
+   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
+   sha256_8way_context ctx;
+
+   // pass 1: midstate plus the tail that follows the first 64 bytes per lane
+   memcpy( &ctx, &sha256_ctx8, sizeof ctx );
+   sha256_8way( &ctx, input + (64<<3), 16 );
+   sha256_8way_close( &ctx, vhash );
+
+   // passes 2 to 4 hash the previous 32 byte digest; the last pass writes
+   // straight into the caller's buffer
+   for ( int pass = 2; pass <= 4; pass++ )
+   {
+      sha256_8way_init( &ctx );
+      sha256_8way( &ctx, vhash, 32 );
+      sha256_8way_close( &ctx, pass == 4 ? output : (void*)vhash );
+   }
+}
+
+// Scan nonces 8 at a time for a quadruple SHA-256 share. The first 64 bytes
+// of the byte swapped header are hashed once into sha256_ctx8; lanes whose
+// top hash word clears the mask are deinterleaved, checked with fulltest()
+// and submitted. Sets *hashes_done and always returns 0.
+int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
+	                   uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t edata[20] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   __m256i  *noncev = (__m256i*)vdata + 19;   // word 19 of all 8 lanes
+   uint32_t *hash7 = hash + (7<<3);           // hash word 7 of all 8 lanes
+   const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
+   const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                              0xFFFFF000, 0xFFFF0000, 0 };
+   int m = 0;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+
+   // Need big endian data
+   casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   // Replicate the header into all 8 lanes and prehash the first 64 bytes.
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );
+   sha256_8way_init( &sha256_ctx8 );
+   sha256_8way( &sha256_ctx8, vdata, 64 );
+
+   // Pick the first table entry whose limit covers the target; targets
+   // above the last limit have no entry and skip the scan entirely.
+   while ( m < 6 && Htarg > htmax[m] )
+      m++;
+
+   if ( m < 6 )
+   {
+      const uint32_t mask = masks[m];
+
+      do
+      {
+         *noncev = mm256_bswap_32(
+                 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
+         pdata[19] = n;
+
+         sha256q_8way_hash( hash, vdata );
+
+         for ( int lane = 0; lane < 8; lane++ )
+         {
+            uint32_t lane_hash[8];
+
+            if ( hash7[ lane ] & mask )
+               continue;
+
+            // deinterleave hash for lane
+            mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
+            if ( !fulltest( lane_hash, ptarget ) )
+               continue;
+
+            pdata[19] = n + lane;
+            work_set_target_ratio( work, lane_hash );
+            if ( submit_work( mythr, work ) )
+               applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
+                       accepted_share_count + rejected_share_count + 1,
+                       thr_id, lane );
+            else
+               applog( LOG_WARNING, "Failed to submit share." );
+         }
+         n += 8;
+      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
+   }
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#endif
+
+#if defined(SHA256T_4WAY)
+
+static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
+
+// Quadruple SHA-256 for 4 interleaved lanes. Resumes from the sha256_ctx4
+// midstate, absorbs the 16 byte tail of the input, then rehashes 3 times.
+void sha256q_4way_hash( void* output, const void* input )
+{
+   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+   sha256_4way_context ctx;
+
+   // pass 1: midstate plus the tail that follows the first 64 bytes per lane
+   memcpy( &ctx, &sha256_ctx4, sizeof ctx );
+   sha256_4way( &ctx, input + (64<<2), 16 );
+   sha256_4way_close( &ctx, vhash );
+
+   // passes 2 to 4 hash the previous 32 byte digest; the last pass writes
+   // straight into the caller's buffer
+   for ( int pass = 2; pass <= 4; pass++ )
+   {
+      sha256_4way_init( &ctx );
+      sha256_4way( &ctx, vhash, 32 );
+      sha256_4way_close( &ctx, pass == 4 ? output : (void*)vhash );
+   }
+}
+
+// Scan nonces 4 at a time for a quadruple SHA-256 share. The first 64 bytes
+// of the byte swapped header are hashed once into sha256_ctx4; lanes passing
+// the mask test and fulltest() are submitted. Always returns 0.
+int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
+	                   uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = hash + (7<<2);           // hash word 7 of all 4 lanes
+   uint32_t lane_hash[8];
+   uint32_t edata[20] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   __m128i  *noncev = (__m128i*)vdata + 19;   // word 19 of all 4 lanes
+   const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
+   const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                              0xFFFFF000, 0xFFFF0000, 0 };
+   int m = 0;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+
+   // Byte swap the 20 header words, four at a time.
+   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   // Replicate the header into all 4 lanes and prehash the first 64 bytes.
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   sha256_4way_init( &sha256_ctx4 );
+   sha256_4way( &sha256_ctx4, vdata, 64 );
+
+   // Pick the first table entry whose limit covers the target; targets
+   // above the last limit have no entry and skip the scan entirely.
+   while ( m < 6 && Htarg > htmax[m] )
+      m++;
+
+   if ( m < 6 )
+   {
+      const uint32_t mask = masks[m];
+      do
+      {
+         *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
+         pdata[19] = n;
+
+         sha256q_4way_hash( hash, vdata );
+
+         for ( int lane = 0; lane < 4; lane++ )
+         {
+            if ( hash7[ lane ] & mask )
+               continue;
+
+            mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+            if ( !fulltest( lane_hash, ptarget ) )
+               continue;
+
+            pdata[19] = n + lane;
+            work_set_target_ratio( work, lane_hash );
+            if ( submit_work( mythr, work ) )
+               applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
+                       accepted_share_count + rejected_share_count + 1,
+                       thr_id, lane );
+            else
+               applog( LOG_WARNING, "Failed to submit share." );
+         }
+         n += 4;
+      } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
+   }
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#endif
+
--- a/algo/sha/sha256q.c
+++ b/algo/sha/sha256q.c
@@ -0,0 +1,113 @@
+#include "sha256t-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <openssl/sha.h>
+
+static __thread SHA256_CTX sha256q_ctx __attribute__ ((aligned (64)));
+
+// Prime the per-thread SHA-256 context with the first 64 bytes of input so
+// that sha256q_hash() only has to absorb the remaining tail for each nonce.
+void sha256q_midstate( const void* input )
+{
+    SHA256_CTX *mid = &sha256q_ctx;
+
+    SHA256_Init( mid );
+    SHA256_Update( mid, input, 64 );
+}
+
+// Quadruple SHA-256 of an 80 byte input.
+// Resumes from the context saved by sha256q_midstate() (first 64 bytes
+// already absorbed), finishes the first digest with the remaining 16 bytes,
+// then hashes the 32 byte digest three more times.
+//   output: receives the final 32 byte digest.
+//   input:  80 byte block; only bytes 64..79 are read here.
+void sha256q_hash( void* output, const void* input )
+{
+   uint32_t _ALIGN(64) hash[16];
+   const int midlen = 64;            // bytes covered by the midstate
+   const int tail   = 80 - midlen;   // 16
+   // Arithmetic on void* is a GNU extension, step through a byte pointer.
+   const unsigned char *in = (const unsigned char*)input;
+
+   SHA256_CTX ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx );
+
+   // Round 1: complete the hash of the full 80 byte input.
+   SHA256_Update( &ctx, in + midlen, tail );
+   SHA256_Final( (unsigned char*)hash, &ctx );
+
+   // Rounds 2-4: hash the previous 32 byte digest in place.
+   for ( int round = 0; round < 3; round++ )
+   {
+      SHA256_Init( &ctx );
+      SHA256_Update( &ctx, hash, 32 );
+      SHA256_Final( (unsigned char*)hash, &ctx );
+   }
+
+   memcpy( output, hash, 32 );
+}
+
+// Scalar nonce scan for sha256q.
+// Byte swaps the 80 byte header once, saves the SHA-256 midstate of its first
+// 64 bytes, then tries successive nonces starting at pdata[19] until
+// max_nonce is reached or a work restart is flagged. Candidate hashes that
+// pass the quick mask test and fulltest() are submitted immediately and
+// scanning continues.
+//   thr_id:      deprecated, overwritten with mythr->id.
+//   hashes_done: receives the number of nonces tried.
+// Always returns 0; shares are reported through submit_work().
+int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19] - 1;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+#ifdef _MSC_VER
+   uint32_t __declspec(align(32)) hash64[8];
+#else
+   uint32_t hash64[8] __attribute__((aligned(32)));
+#endif
+   // Written through casti_m128i below, so give it explicit vector alignment
+   // like the staging buffers in the multi-way scanhash functions.
+   // NOTE(review): casti_m128i presumably does aligned 128 bit accesses,
+   // confirm against its definition.
+   uint32_t _ALIGN(64) endiandata[32];
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+
+   // htmax[m] pairs with masks[m]: the first entry with Htarg <= htmax[m]
+   // selects the mask used for the quick reject test on hash64[7].
+   const uint64_t htmax[] = {           0,
+                                      0xF,
+                                     0xFF,
+                                    0xFFF,
+                                   0xFFFF,
+                               0x10000000 };
+   const uint32_t masks[] = {  0xFFFFFFFF,
+                               0xFFFFFFF0,
+                               0xFFFFFF00,
+                               0xFFFFF000,
+                               0xFFFF0000,
+                                        0 };
+
+   // we need bigendian data...
+   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   sha256q_midstate( endiandata );
+
+   for ( int m = 0; m < 6; m++ )
+   {
+      if ( Htarg <= htmax[m] )
+      {
+         const uint32_t mask = masks[m];
+         do {
+            pdata[19] = ++n;
+            be32enc( &endiandata[19], n );
+            sha256q_hash( hash64, endiandata );
+            if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) )
+            {
+               work_set_target_ratio( work, hash64 );
+               if ( submit_work( mythr, work ) )
+                  applog( LOG_NOTICE, "Share %d submitted by thread %d.",
+                             accepted_share_count + rejected_share_count + 1,
+                             thr_id );
+               else
+                  applog( LOG_WARNING, "Failed to submit share." );
+            }
+         } while ( n < max_nonce && !work_restart[thr_id].restart );
+         break;
+      }
+   }
+
+   *hashes_done = n - first_nonce + 1;
+   pdata[19] = n;
+   return 0;
+}
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -5,6 +5,137 @@
 #include <stdio.h>
 #include "sha2-hash-4way.h"

+#if defined(SHA256T_11WAY)
+
+static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64)));
+
+// Triple SHA-256 over 11 lanes split across three buffers: x (8 lanes),
+// y (2 lanes) and z (1 lane).
+// Resumes from the per-thread context sha256_ctx11, which the scanhash
+// function has already fed the first 64 bytes of every lane, absorbs the
+// remaining 16 bytes per lane, then hashes the 32 byte digests twice more.
+//   outx/outy/outz: receive the final digests for the x, y and z lanes.
+//   inpx/inpy/inpz: 80 byte per lane inputs; only the last 16 bytes per lane
+//                   are read here.
+void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
+                         const void *inpy, const void *inpz )
+{
+   uint32_t hashx[8*8] __attribute__ ((aligned (64)));
+   uint32_t hashy[8*2] __attribute__ ((aligned (64)));
+   uint32_t hashz[8]   __attribute__ ((aligned (64)));
+   sha256_11way_context ctx;
+   // Skip the 64 bytes per lane covered by the midstate. Arithmetic on void*
+   // is a GNU extension, so offset through a byte pointer.
+   const void *inpx64 = (const unsigned char*)inpx + (64<<3);
+   const void *inpy64 = (const unsigned char*)inpy + (64<<1);
+   const void *inpz64 = (const unsigned char*)inpz +  64;
+
+   // Round 1: finish the hash of the 80 byte inputs.
+   memcpy( &ctx, &sha256_ctx11, sizeof ctx );
+   sha256_11way_update( &ctx, inpx64, inpy64, inpz64,  16 );
+   sha256_11way_close( &ctx, hashx, hashy, hashz );
+
+   // Round 2
+   sha256_11way_init( &ctx );
+   sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
+   sha256_11way_close( &ctx, hashx, hashy, hashz );
+
+   // Round 3, written straight to the caller's buffers.
+   sha256_11way_init( &ctx );
+   sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
+   sha256_11way_close( &ctx, outx, outy, outz );
+}
+
+// 11-way nonce scan for sha256t.
+// Each iteration hashes 11 consecutive nonces: n..n+7 in the 8x32 interleaved
+// x buffer, n+8..n+9 in the 2x32 interleaved y buffer and n+10 in the scalar
+// z buffer. The first 64 bytes of the header are absorbed once into the
+// per-thread midstate before the loop. Lanes whose hash word 7 passes the
+// quick mask test are extracted and checked with fulltest(); winners are
+// submitted immediately and scanning continues.
+//   thr_id:      deprecated, overwritten with mythr->id.
+//   hashes_done: receives the number of nonces tried.
+// Always returns 0; shares are reported through submit_solution().
+int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
+                            uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t datax[20*8]  __attribute__ ((aligned (64)));
+   uint32_t datay[20*2]  __attribute__ ((aligned (32)));
+   uint32_t dataz[20]    __attribute__ ((aligned (32)));
+   uint32_t hashx[8*8]   __attribute__ ((aligned (32)));
+   uint32_t hashy[8*2]   __attribute__ ((aligned (32)));
+   uint32_t hashz[8]     __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7;
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   // Location of header word 19 (the nonce) in each buffer.
+   __m256i  *noncex = (__m256i*) datax + 19;
+   __m64    *noncey = (__m64*)   datay + 19;
+   uint32_t *noncez = (uint32_t*)dataz + 19;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int i;
+   // htmax[m] pairs with masks[m]: the first entry with Htarg <= htmax[m]
+   // selects the mask used for the quick reject test on hash word 7.
+   const uint64_t htmax[] = {           0,
+                                      0xF,
+                                     0xFF,
+                                    0xFFF,
+                                   0xFFFF,
+                               0x10000000 };
+   const uint32_t masks[] = {  0xFFFFFFFF,
+                               0xFFFFFFF0,
+                               0xFFFFFF00,
+                               0xFFFFF000,
+                               0xFFFF0000,
+                                        0 };
+
+   // Use dataz (scalar) to stage bswapped data for the vectors.
+   casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+   casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   mm256_interleave_8x32( datax, dataz, dataz, dataz, dataz,
+                                 dataz, dataz, dataz, dataz, 640 );
+   mm64_interleave_2x32( datay, dataz, dataz, 640 );
+
+   // Midstate: first 64 bytes of every lane, reused for every nonce.
+   sha256_11way_init( &sha256_ctx11 );
+   sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 );
+
+   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+   {
+      uint32_t mask = masks[m];
+      do
+      {
+        *noncex = mm256_bswap_32(
+         _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
+        *noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
+        *noncez = bswap_32( n+10 );
+
+        pdata[19] = n;
+
+        sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );
+
+        if ( opt_benchmark ) { n += 11; continue; }
+
+        // x lanes: nonces n .. n+7
+        hash7 = &(hashx[7<<3]);
+        for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
+        {
+            // deinterleave hash for lane
+            mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
+            if ( fulltest( lane_hash, ptarget ) )
+            {
+               pdata[19] = n + i;
+               submit_solution( work, lane_hash, mythr, i );
+            }
+        }
+
+        // y lanes: nonces n+8 .. n+9. Test each lane's own word 7; the
+        // original tested hash7[0] for both lanes.
+        hash7 = &(hashy[7<<1]);
+        for ( i = 0; i < 2; i++ ) if ( !( hash7[ i ] & mask ) )
+        {
+            mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
+            if ( fulltest( lane_hash, ptarget ) )
+            {
+               pdata[19] = n + 8 + i;
+               submit_solution( work, lane_hash, mythr, i+8 );
+            }
+        }
+
+        // z lane: nonce n+10, already in linear layout.
+        if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
+        {
+            pdata[19] = n+10;
+            submit_solution( work, hashz, mythr, 10 );
+        }
+        n += 11;
+
+      } while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
+      break;
+   }
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#endif
+
 #if defined(SHA256T_8WAY)

 static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
@@ -29,7 +160,7 @@ void sha256t_8way_hash( void* output, const void* input )
 }

 int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
-	                   uint64_t *hashes_done, struct thr_info *mythr )
+                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
@@ -71,44 +202,38 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
      do
      {
        *noncev = mm256_bswap_32(
-		 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
-
-	 pdata[19] = n;
+                 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
+         pdata[19] = n;

         sha256t_8way_hash( hash, vdata );

-         uint32_t *hash7 = &(hash[7<<3]); 
-	 
+         uint32_t *hash7 = &(hash[7<<3]);
+
         for ( int lane = 0; lane < 8; lane++ )
         if ( !( hash7[ lane ] & mask ) )
-         { 
+         {
            // deinterleave hash for lane
-	    uint32_t lane_hash[8];
-	    mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
+            uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+            mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );

-	    if ( fulltest( lane_hash, ptarget ) )
+            if ( fulltest( lane_hash, ptarget ) )
            {
-	      pdata[19] = n + lane;
-              work_set_target_ratio( work, lane_hash );
-              if ( submit_work( mythr, work ) )
-                applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                             accepted_share_count + rejected_share_count + 1,
-                             thr_id, lane );
-              else
-                applog( LOG_WARNING, "Failed to submit share." );
+              pdata[19] = n + lane;
+              submit_solution( work, lane_hash, mythr, lane );
 	    }
-	 }
+         }
         n += 8;

      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
      break;
   }
-    
   *hashes_done = n - first_nonce + 1;
   return 0;
 }

-#elif defined(SHA256T_4WAY)
+#endif
+
+#if defined(SHA256T_4WAY)

 static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));

@@ -136,9 +261,9 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
-   uint32_t *hash7 = &(hash[7<<2]);
-   uint32_t lane_hash[8];
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t edata[20] __attribute__ ((aligned (32)));;
+   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
@@ -187,22 +312,14 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
            if ( fulltest( lane_hash, ptarget ) )
            {
              pdata[19] = n + lane;
-              work_set_target_ratio( work, lane_hash );
-              if ( submit_work( mythr, work ) )
-                applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                             accepted_share_count + rejected_share_count + 1,
-                             thr_id, lane );
-              else
-                applog( LOG_WARNING, "Failed to submit share." );
-            }
+              submit_solution( work, lane_hash, mythr, lane );
+	    }
         }
-
 	 n += 4;

      } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
      break;
   }
-
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
--- a/algo/sha/sha256t-gate.c
+++ b/algo/sha/sha256t-gate.c
@@ -2,16 +2,20 @@

 bool register_sha256t_algo( algo_gate_t* gate )
 {
-#if defined(SHA256T_8WAY)
-    gate->optimizations = SSE42_OPT | AVX2_OPT;
+#if defined(SHA256T_11WAY)
+    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
+    gate->scanhash   = (void*)&scanhash_sha256t_11way;
+    gate->hash       = (void*)&sha256t_11way_hash;
+#elif defined(SHA256T_8WAY)
+    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t_8way;
    gate->hash       = (void*)&sha256t_8way_hash;
 #elif defined(SHA256T_4WAY)
-    gate->optimizations = SSE42_OPT | AVX2_OPT;
+    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t_4way;
    gate->hash       = (void*)&sha256t_4way_hash;
 #else
-    gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT;
+gate->optimizations = SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t;
    gate->hash       = (void*)&sha256t_hash;
 #endif
@@ -19,3 +23,23 @@ bool register_sha256t_algo( algo_gate_t* gate )
    return true;
 }

+// Fill in the algo gate for sha256q, choosing the implementation according to
+// which SHA256T_* way macro is defined at compile time: 8-way, then 4-way,
+// otherwise the scalar version. Always returns true.
+bool register_sha256q_algo( algo_gate_t* gate )
+{
+    // Same for every variant.
+    gate->get_max64  = (void*)&get_max64_0x3ffff;
+
+#if defined(SHA256T_8WAY)
+    gate->scanhash   = (void*)&scanhash_sha256q_8way;
+    gate->hash       = (void*)&sha256q_8way_hash;
+    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
+#elif defined(SHA256T_4WAY)
+    gate->scanhash   = (void*)&scanhash_sha256q_4way;
+    gate->hash       = (void*)&sha256q_4way_hash;
+    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
+#else
+    gate->scanhash   = (void*)&scanhash_sha256q;
+    gate->hash       = (void*)&sha256q_hash;
+    gate->optimizations = SHA_OPT;
+#endif
+
+    return true;
+}
+
--- a/algo/sha/sha256t-gate.h
+++ b/algo/sha/sha256t-gate.h
@@ -6,34 +6,55 @@

 // Override multi way on ryzen, SHA is better.
 #if !defined(RYZEN_)
-#if defined(__SSE4_2__)
+#if defined(__SSE2__)
  #define SHA256T_4WAY
 #endif
 #if defined(__AVX2__)
  #define SHA256T_8WAY
+//  #define SHA256T_11WAY
 #endif
 #endif

-bool register_blake2s_algo( algo_gate_t* gate );
+bool register_sha256t_algo( algo_gate_t* gate );
+bool register_sha256q_algo( algo_gate_t* gate );
+
+#if defined(SHA256T_11WAY)
+
+void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
+	                 const void *inpy, const void *inpz );
+int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
+                            uint64_t *hashes_done, struct thr_info *mythr );
+//void sha256q_8way_hash( void *output, const void *input );
+//int scanhash_sha256q_11way( int thr_id, struct work *work, uint32_t max_nonce,
+//                            uint64_t *hashes_done, struct thr_info *mythr );
+#endif

 #if defined(SHA256T_8WAY)

 void sha256t_8way_hash( void *output, const void *input );
 int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
+void sha256q_8way_hash( void *output, const void *input );
+int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );
+#endif

-#elif defined (SHA256T_4WAY)
+#if defined(SHA256T_4WAY)

 void sha256t_4way_hash( void *output, const void *input );
 int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
-#else
+void sha256q_4way_hash( void *output, const void *input );
+int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );
+#endif

 void sha256t_hash( void *output, const void *input );
 int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
+void sha256q_hash( void *output, const void *input );
+int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );

 #endif

--- a/algo/sha/sha256t.c
+++ b/algo/sha/sha256t.c
@@ -5,8 +5,6 @@
 #include <stdio.h>
 #include <openssl/sha.h>

-#if !defined(SHA256T_4WAY)
-
 static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64)));

 void sha256t_midstate( const void* input )
@@ -72,8 +70,11 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
 	};

   // we need bigendian data...
-   for ( int k = 0; k < 19; k++ )
-      be32enc( &endiandata[k], pdata[k] );
+   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );

   sha256t_midstate( endiandata );

@@ -89,7 +90,13 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
            if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) )
            {
               *hashes_done = n - first_nonce + 1;
-               return true;
+               work_set_target_ratio( work, hash64 );
+               if ( submit_work( mythr, work ) )
+                  applog( LOG_NOTICE, "Share %d submitted by thread %d.",
+                             accepted_share_count + rejected_share_count + 1,
+                             thr_id );
+               else
+                  applog( LOG_WARNING, "Failed to submit share." );
            }
         } while ( n < max_nonce && !work_restart[thr_id].restart );
         break;
@@ -100,4 +107,3 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
   pdata[19] = n;
   return 0;
 }
-#endif
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -346,7 +346,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
      memcpy( buf + ptr, data, clen );
      data = (const unsigned char *)data + clen;
      ptr += clen;
-      len -= clen >> 1;
+      len -= (clen >> 1);
      if ( ptr == sizeof ctx->buf )
      {
         if ( ( ctx->count0 = ctx->count0 + 1024 )  == 0 )
@@ -365,16 +365,8 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
   }

   uint32_t vp = ptr>>5;
-
-   // Terminating byte then zero pad
-   casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
-
-   // Zero pad full vectors up to count
-   for ( ; vp < 6; vp++ )
-       casti_m256i( buf, vp ) = m256_zero;
-
   // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
-   // Count is misaligned to 16 bits and straddles a vector.
+   // Count is misaligned to 16 bits and straddles 2 vectors.
   // Use u32 overlay to stage then u16 to load buf.
   union
   {
@@ -387,6 +379,18 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
   count.u32[2] = ctx->count2;
   count.u32[3] = ctx->count3;

+   if ( vp == 0 )    // empty buf, xevan.
+   { 
+      casti_m256i( buf, 0 ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
+      memset_zero_256( (__m256i*)buf + 1, 5 );
+      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
+   }
+   else     // half full buf, everyone else.
+   {
+      casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
+      memset_zero_256( (__m256i*)buf + vp, 6 - vp );
+   }
+
   casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
                                             count.u16[0], 0,0,0,0,0,0,0 );
   casti_m256i( buf, 7 ) = _mm256_set_epi16(
--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
@@ -25,7 +25,8 @@
 #include "algo/haval/haval-hash-4way.h"
 #include "algo/sha/sha2-hash-4way.h"

-typedef struct {
+union _sonoa_4way_context_overlay
+{
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_groestl       groestl;
@@ -43,8 +44,10 @@ typedef struct {
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
    haval256_5_4way_context haval;
-} sonoa_4way_ctx_holder;
+};

+typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
+/*
 sonoa_4way_ctx_holder sonoa_4way_ctx __attribute__ ((aligned (64)));

 void init_sonoa_4way_ctx()
@@ -67,6 +70,7 @@ void init_sonoa_4way_ctx()
     sha512_4way_init( &sonoa_4way_ctx.sha512 );
     haval256_5_4way_init( &sonoa_4way_ctx.haval );
 };
+*/

 void sonoa_4way_hash( void *state, const void *input )
 {
@@ -77,19 +81,23 @@ void sonoa_4way_hash( void *state, const void *input )
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
-     sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64)));
-        memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) );
+     sonoa_4way_context_overlay ctx;
+//     sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64)));
+//        memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) );

 // 1

+     blake512_4way_init( &ctx.blake );
     blake512_4way( &ctx.blake, input, 80 );
     blake512_4way_close( &ctx.blake, vhash );

+     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );

     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

+     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
@@ -100,29 +108,36 @@ void sonoa_4way_hash( void *state, const void *input )

     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

+     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, 64 );
     skein512_4way_close( &ctx.skein, vhash );

+     jh512_4way_init( &ctx.jh );
     jh512_4way( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );

+     keccak512_4way_init( &ctx.keccak );
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );

     mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );

+     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
     luffa_2way_init( &ctx.luffa, 512 );
     luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );

+     cube_2way_init( &ctx.cube, 512, 16, 32 );
     cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
     cube_2way_init( &ctx.cube, 512, 16, 32 );
     cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );

+     shavite512_2way_init( &ctx.shavite );
     shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
     shavite512_2way_init( &ctx.shavite );
     shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );

+     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
@@ -130,6 +145,7 @@ void sonoa_4way_hash( void *state, const void *input )
     mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
     mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );

+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
     init_echo( &ctx.echo, 512 );
@@ -215,10 +231,12 @@ void sonoa_4way_hash( void *state, const void *input )

     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

+     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );

 // 3
+
     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
@@ -294,6 +312,7 @@ void sonoa_4way_hash( void *state, const void *input )

     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     sph_fugue512_init( &ctx.fugue );
@@ -399,10 +418,11 @@ void sonoa_4way_hash( void *state, const void *input )

     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

+     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );

-     mm256_reinterleave_4x64( vhashB, vhash, 512 ); 
+     mm256_reinterleave_4x32_4x64( vhashB, vhash, 512 ); 

     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhashB, 64 );
@@ -438,7 +458,7 @@ void sonoa_4way_hash( void *state, const void *input )
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );

-     mm256_reinterleave_4x32( vhashB, vhash,  512 );
+     mm256_reinterleave_4x64_4x32( vhashB, vhash,  512 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhashB, 64 );
@@ -536,6 +556,7 @@ void sonoa_4way_hash( void *state, const void *input )

     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

+     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash0 );
     sph_whirlpool_init( &ctx.whirlpool );
@@ -663,6 +684,7 @@ void sonoa_4way_hash( void *state, const void *input )

     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

+     sha512_4way_init( &ctx.sha512 );
     sha512_4way( &ctx.sha512, vhash, 64 );
     sha512_4way_close( &ctx.sha512, vhash );

@@ -800,11 +822,11 @@ void sonoa_4way_hash( void *state, const void *input )
     sha512_4way( &ctx.sha512, vhash, 64 );
     sha512_4way_close( &ctx.sha512, vhash );

-     mm256_reinterleave_4x32( vhashB, vhash,  512 );
+     mm256_reinterleave_4x64_4x32( vhashB, vhash,  512 );

+     haval256_5_4way_init( &ctx.haval );
     haval256_5_4way( &ctx.haval, vhashB, 64 );
     haval256_5_4way_close( &ctx.haval, state );
-
 }

 int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -819,10 +841,7 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     uint32_t *nonces = work->nonces;
-     int num_found = 0;
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-//     uint32_t *noncep = vdata + 73;   // 9*8 + 1
     const uint32_t Htarg = ptarget[7];
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     uint64_t htmax[] = {          0,        0xF,       0xFF,
@@ -855,18 +874,23 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
              if ( fulltest( lane_hash, ptarget ) )
              {
                 pdata[19] = n + lane;
-                 nonces[ num_found++ ] = n + lane;
                 work_set_target_ratio( work, lane_hash );
+                 if ( submit_work( mythr, work ) )
+                    applog( LOG_NOTICE,
+                             "Share %d submitted by thread %d, lane %d.",
+                             accepted_share_count + rejected_share_count + 1,
+                             thr_id, lane );
+                 else
+                    applog( LOG_WARNING, "Failed to submit share." );
              }
           }
           n += 4;
-        } while ( ( num_found == 0 ) && ( n < max_nonce )
-                  && !work_restart[thr_id].restart );
+        } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
        break;
     }

     *hashes_done = n - first_nonce + 1;
-     return num_found;
+     return 0;
 }

 #endif
--- a/algo/x17/sonoa-gate.c
+++ b/algo/x17/sonoa-gate.c
@@ -3,7 +3,7 @@
 bool register_sonoa_algo( algo_gate_t* gate )
 {
 #if defined (SONOA_4WAY)
-  init_sonoa_4way_ctx();
+//  init_sonoa_4way_ctx();
  gate->scanhash  = (void*)&scanhash_sonoa_4way;
  gate->hash      = (void*)&sonoa_4way_hash;
 #else
--- a/algo/x17/sonoa-gate.h
+++ b/algo/x17/sonoa-gate.h
@@ -17,7 +17,7 @@ void sonoa_4way_hash( void *state, const void *input );
 int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

-void init_sonoa_4way_ctx();
+//void init_sonoa_4way_ctx();

 #endif

--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -14,7 +14,6 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
-#include "algo/shavite/sph_shavite.h"
 #include "algo/shavite/shavite-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -222,7 +221,7 @@ void x17_4way_hash( void *state, const void *input )
     sha512_4way_close( &ctx.sha512, vhash );     

     // 17 Haval parallel 32 bit
-     mm256_reinterleave_4x32( vhashB, vhash,  512 );
+     mm256_reinterleave_4x64_4x32( vhashB, vhash,  512 );

     haval256_5_4way_init( &ctx.haval );
     haval256_5_4way( &ctx.haval, vhashB, 64 );
@@ -242,8 +241,6 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     uint32_t *nonces = work->nonces;
-     int num_found = 0;
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
     const uint32_t Htarg = ptarget[7];
@@ -260,35 +257,40 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
     uint64_t *edata = (uint64_t*)endiandata;
     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

-     for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+     for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
     {
-        uint32_t mask = masks[m];
+        uint32_t mask = masks[ m ];
        do
        {
  	   *noncev = mm256_interleave_blend_32( mm256_bswap_32(
-	                     _mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
+	                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ),
 	  		                        *noncev );
           x17_4way_hash( hash, vdata );

 	   for ( int lane = 0; lane < 4; lane++ )
-           if ( ( ( hash7[ lane ] & mask ) == 0 ) )
+           if ( ( hash7[ lane ] & mask ) == 0 )
           {
              mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
              if ( fulltest( lane_hash, ptarget ) )
              {
                 pdata[19] = n + lane;
-                 nonces[ num_found++ ] = n + lane;
                 work_set_target_ratio( work, lane_hash );
+                 if ( submit_work( mythr, work ) )
+                    applog( LOG_NOTICE,
+			     "Share %d submitted by thread %d, lane %d.",
+                             accepted_share_count + rejected_share_count + 1,
+                             thr_id, lane );
+                 else
+                    applog( LOG_WARNING, "Failed to submit share." );
              }
           }
           n += 4;
-        } while ( ( num_found == 0 ) && ( n < max_nonce )
-                   && !work_restart[thr_id].restart );
+        } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
        break;
     }

     *hashes_done = n - first_nonce + 1;
-     return num_found;
+     return 0;
 }

 #endif
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -12,8 +12,9 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
-#include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
+#include "algo/shavite/shavite-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -24,16 +25,17 @@
 #include "algo/sha/sha2-hash-4way.h"
 #include "algo/haval/haval-hash-4way.h"

-typedef struct {
-        blake512_4way_context   blake;
+union _xevan_4way_context_overlay
+{
+	blake512_4way_context   blake;
        bmw512_4way_context     bmw;
        hashState_groestl       groestl;
        skein512_4way_context   skein;
        jh512_4way_context      jh;
        keccak512_4way_context  keccak;
        luffa_2way_context      luffa;
-        cubehashParam           cube;
-        sph_shavite512_context  shavite;
+        cube_2way_context       cube;
+        shavite512_2way_context shavite;
        simd_2way_context       simd;
        hashState_echo          echo;
        hamsi512_4way_context   hamsi;
@@ -42,39 +44,8 @@ typedef struct {
        sph_whirlpool_context   whirlpool;
        sha512_4way_context     sha512;
        haval256_5_4way_context haval;
-} xevan_4way_ctx_holder;
-
-xevan_4way_ctx_holder xevan_4way_ctx __attribute__ ((aligned (64)));
-static __thread blake512_4way_context xevan_blake_4way_mid
-                                        __attribute__ ((aligned (64)));
-
-void init_xevan_4way_ctx()
-{
-        blake512_4way_init(&xevan_4way_ctx.blake);
-        bmw512_4way_init( &xevan_4way_ctx.bmw );
-        init_groestl( &xevan_4way_ctx.groestl, 64 );
-        skein512_4way_init(&xevan_4way_ctx.skein);
-        jh512_4way_init(&xevan_4way_ctx.jh);
-        keccak512_4way_init(&xevan_4way_ctx.keccak);
-        luffa_2way_init( &xevan_4way_ctx.luffa, 512 );
-        cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 );
-        sph_shavite512_init( &xevan_4way_ctx.shavite );
-        simd_2way_init( &xevan_4way_ctx.simd, 512 );
-        init_echo( &xevan_4way_ctx.echo, 512 );
-        hamsi512_4way_init( &xevan_4way_ctx.hamsi );
-        sph_fugue512_init( &xevan_4way_ctx.fugue );
-        shabal512_4way_init( &xevan_4way_ctx.shabal );
-        sph_whirlpool_init( &xevan_4way_ctx.whirlpool );
-        sha512_4way_init( &xevan_4way_ctx.sha512 );
-        haval256_5_4way_init( &xevan_4way_ctx.haval );
 };
-
-void xevan_4way_blake512_midstate( const void* input )
-{
-    memcpy( &xevan_blake_4way_mid, &xevan_4way_ctx.blake,
-            sizeof(xevan_blake_4way_mid) );
-    blake512_4way( &xevan_blake_4way_mid, input, 64 );
-}
+typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;

 void xevan_4way_hash( void *output, const void *input )
 {
@@ -83,293 +54,283 @@ void xevan_4way_hash( void *output, const void *input )
     uint64_t hash2[16] __attribute__ ((aligned (64)));
     uint64_t hash3[16] __attribute__ ((aligned (64)));
     uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
-     uint64_t vhash32[16<<2] __attribute__ ((aligned (64)));
+     uint64_t vhashA[16<<2] __attribute__ ((aligned (64)));
+     uint64_t vhashB[16<<2] __attribute__ ((aligned (64)));
     const int dataLen = 128;
-     const int midlen = 64;            // bytes
-     const int tail   = 80 - midlen;   // 16
-     xevan_4way_ctx_holder ctx __attribute__ ((aligned (64)));
-     memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
+     xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));

-     // parallel way
-     memcpy( &ctx.blake, &xevan_blake_4way_mid,
-             sizeof(xevan_blake_4way_mid) );
-     blake512_4way( &ctx.blake, input + (midlen<<2), tail );
+     // parallel 4 way
+
+     blake512_4way_init( &ctx.blake );
+     blake512_4way( &ctx.blake, input, 80 );
     blake512_4way_close(&ctx.blake, vhash);
     memset( &vhash[8<<2], 0, 64<<2 );

+     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, dataLen );
     bmw512_4way_close( &ctx.bmw, vhash );

     // Serial
     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

+     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
                               dataLen<<3 );
-     memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
+     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
                               dataLen<<3 );
-     memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
+     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
                               dataLen<<3 );
-     memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
+     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
                               dataLen<<3 );

     // Parallel 4way
     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

+     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, dataLen );
     skein512_4way_close( &ctx.skein, vhash );

+     jh512_4way_init( &ctx.jh );
     jh512_4way( &ctx.jh, vhash, dataLen );
     jh512_4way_close( &ctx.jh, vhash );

+     keccak512_4way_init( &ctx.keccak );
     keccak512_4way( &ctx.keccak, vhash, dataLen );
     keccak512_4way_close( &ctx.keccak, vhash );

-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
-     mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
-     luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
-     mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
+     mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
+
     luffa_2way_init( &ctx.luffa, 512 );
-     luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
+     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
+     luffa_2way_init( &ctx.luffa, 512 );
+     luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );

-     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
-                           dataLen );
-     memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
-                           dataLen );
-     memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
-                           dataLen );
-     memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
-                           dataLen );
+     cube_2way_init( &ctx.cube, 512, 16, 32 );
+     cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
+     cube_2way_init( &ctx.cube, 512, 16, 32 );
+     cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );

-     sph_shavite512( &ctx.shavite, hash0, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash0 );
-     memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash1, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash1 );
-     memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash2, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash2 );
-     memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash3, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash3 );
+     shavite512_2way_init( &ctx.shavite );
+     shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
+     shavite512_2way_init( &ctx.shavite );
+     shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );

-     mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
-     simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
-     mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
     simd_2way_init( &ctx.simd, 512 );
-     simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
+     simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
+     simd_2way_init( &ctx.simd, 512 );
+     simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );

+     mm256_deinterleave_1x128( hash0, hash1, vhashA, dataLen<<3 );
+     mm256_deinterleave_1x128( hash2, hash3, vhashB, dataLen<<3 );
+
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, dataLen<<3 );
-     memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, dataLen<<3 );
-     memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, dataLen<<3 );
-     memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, dataLen<<3 );
     // Parallel
     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
+
+     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, dataLen );
     hamsi512_4way_close( &ctx.hamsi, vhash );
+
     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, dataLen );
     sph_fugue512_close( &ctx.fugue, hash0 );
-     memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash1, dataLen );
     sph_fugue512_close( &ctx.fugue, hash1 );
-     memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash2, dataLen );
     sph_fugue512_close( &ctx.fugue, hash2 );
-     memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash3, dataLen );
     sph_fugue512_close( &ctx.fugue, hash3 );

     // Parallel 4way 32 bit
     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
+
+     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, dataLen );
     shabal512_4way_close( &ctx.shabal, vhash );
+
     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

     // Serial
+     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
     sph_whirlpool_close( &ctx.whirlpool, hash0 );
-     memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
-             sizeof(sph_whirlpool_context) );
+     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
     sph_whirlpool_close( &ctx.whirlpool, hash1 );
-     memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
-             sizeof(sph_whirlpool_context) );
+     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
     sph_whirlpool_close( &ctx.whirlpool, hash2 );
-     memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
-             sizeof(sph_whirlpool_context) );
+     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );

     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
+
+     sha512_4way_init( &ctx.sha512 );
     sha512_4way( &ctx.sha512, vhash, dataLen );
     sha512_4way_close( &ctx.sha512, vhash );

-     mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 );
-     haval256_5_4way( &ctx.haval, vhash32, dataLen );
-     haval256_5_4way_close( &ctx.haval, vhash );
-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
+     mm256_reinterleave_4x64_4x32( vhashA, vhash, dataLen<<3 );
+
+     haval256_5_4way_init( &ctx.haval );
+     haval256_5_4way( &ctx.haval, vhashA, dataLen );
+     haval256_5_4way_close( &ctx.haval, vhashA );
+
+     mm256_reinterleave_4x32_4x64( vhash, vhashA, dataLen<<3 );

-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
     memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
-     memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );

+     blake512_4way_init( &ctx.blake );
     blake512_4way( &ctx.blake, vhash, dataLen );
     blake512_4way_close(&ctx.blake, vhash);

+     bmw512_4way_init( &ctx.bmw );
     bmw512_4way( &ctx.bmw, vhash, dataLen );
     bmw512_4way_close( &ctx.bmw, vhash );

     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

+     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
                               dataLen<<3 );
-     memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
+     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
                               dataLen<<3 );
-     memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
+     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
                               dataLen<<3 );
-     memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
+     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
                               dataLen<<3 );

     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

+     skein512_4way_init( &ctx.skein );
     skein512_4way( &ctx.skein, vhash, dataLen );
     skein512_4way_close( &ctx.skein, vhash );

+     jh512_4way_init( &ctx.jh );
     jh512_4way( &ctx.jh, vhash, dataLen );
     jh512_4way_close( &ctx.jh, vhash );

+     keccak512_4way_init( &ctx.keccak );
     keccak512_4way( &ctx.keccak, vhash, dataLen );
     keccak512_4way_close( &ctx.keccak, vhash );

-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
-     mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
-     luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
-     mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
+     mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
+
     luffa_2way_init( &ctx.luffa, 512 );
-     luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
+     luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
+     luffa_2way_init( &ctx.luffa, 512 );
+     luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );

-     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
-                           dataLen );
-     memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
-                           dataLen );
-     memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
-                           dataLen );
-     memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
-                           dataLen );
+     cube_2way_init( &ctx.cube, 512, 16, 32 );
+     cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
+     cube_2way_init( &ctx.cube, 512, 16, 32 );
+     cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );

-     sph_shavite512( &ctx.shavite, hash0, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash0 );
-     memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash1, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash1 );
-     memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash2, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash2 );
-     memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
-             sizeof(sph_shavite512_context) );
-     sph_shavite512( &ctx.shavite, hash3, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash3 );
+     shavite512_2way_init( &ctx.shavite );
+     shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
+     shavite512_2way_init( &ctx.shavite );
+     shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );

-     mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
-     simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
-     mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
-     mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
     simd_2way_init( &ctx.simd, 512 );
-     simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
-     mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
+     simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
+     simd_2way_init( &ctx.simd, 512 );
+     simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );

+     mm256_deinterleave_1x128( hash0, hash1, vhashA, dataLen<<3 );
+     mm256_deinterleave_1x128( hash2, hash3, vhashB, dataLen<<3 );
+
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, dataLen<<3 );
-     memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, dataLen<<3 );
-     memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, dataLen<<3 );
-     memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
+     init_echo( &ctx.echo, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, dataLen<<3 );

     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
+
+     hamsi512_4way_init( &ctx.hamsi );
     hamsi512_4way( &ctx.hamsi, vhash, dataLen );
     hamsi512_4way_close( &ctx.hamsi, vhash );
+
     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash0, dataLen );
     sph_fugue512_close( &ctx.fugue, hash0 );
-     memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash1, dataLen );
     sph_fugue512_close( &ctx.fugue, hash1 );
-     memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash2, dataLen );
     sph_fugue512_close( &ctx.fugue, hash2 );
-     memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     sph_fugue512_init( &ctx.fugue );
     sph_fugue512( &ctx.fugue, hash3, dataLen );
     sph_fugue512_close( &ctx.fugue, hash3 );

     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
+
+     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, dataLen );
     shabal512_4way_close( &ctx.shabal, vhash );
+
     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

+     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
     sph_whirlpool_close( &ctx.whirlpool, hash0 );
-     memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
-             sizeof(sph_whirlpool_context) );
+     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
     sph_whirlpool_close( &ctx.whirlpool, hash1 );
-     memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
-             sizeof(sph_whirlpool_context) );
+     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
     sph_whirlpool_close( &ctx.whirlpool, hash2 );
-     memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
-             sizeof(sph_whirlpool_context) );
+     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );

     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
+
+     sha512_4way_init( &ctx.sha512 );
     sha512_4way( &ctx.sha512, vhash, dataLen );
     sha512_4way_close( &ctx.sha512, vhash );

-     mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 );
-     haval256_5_4way( &ctx.haval, vhash32, dataLen );
+     mm256_reinterleave_4x64_4x32( vhashA, vhash, dataLen<<3 );
+
+     haval256_5_4way_init( &ctx.haval );
+     haval256_5_4way( &ctx.haval, vhashA, dataLen );
     haval256_5_4way_close( &ctx.haval, output );
 }

 int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done )
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<2]);
@@ -378,30 +339,26 @@ int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t _ALIGN(64) endiandata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned

   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 73;   // 9*8 + 1

   if ( opt_benchmark )
      ptarget[7] = 0x0cff;

-   for ( int k=0; k < 19; k++ )
-      be32enc( &endiandata[k], pdata[k] );
-
   uint64_t *edata = (uint64_t*)endiandata;
+
+   casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

-   xevan_4way_blake512_midstate( vdata );
-
   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+2, n+1 );
-      be32enc( noncep+4, n+2 );
-      be32enc( noncep+6, n+3 );
+      *noncev = mm256_interleave_blend_32( mm256_bswap_32(
+               _mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), *noncev );

      xevan_4way_hash( hash, vdata );
      for ( int lane = 0; lane < 4; lane++ )
@@ -411,15 +368,20 @@ int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
 	 if ( fulltest( lane_hash, ptarget ) )
         {
             pdata[19] = n + lane;
-             nonces[ num_found++ ] = n + lane;
             work_set_target_ratio( work, lane_hash );
+             if ( submit_work( mythr, work ) )
+                applog( LOG_NOTICE,
+                        "Share %d submitted by thread %d, lane %d.",
+                         accepted_share_count + rejected_share_count + 1,
+                         thr_id, lane );
+             else
+                applog( LOG_WARNING, "Failed to submit share." );
         }
      }
      n += 4;
-   } while ( ( num_found == 0 ) && ( n < max_nonce )
-             && !work_restart[thr_id].restart );
+   } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/x17/xevan-gate.c
+++ b/algo/x17/xevan-gate.c
@@ -8,7 +8,7 @@ void xevan_set_target( struct work* work, double job_diff )
 bool register_xevan_algo( algo_gate_t* gate )
 {
 #if defined (XEVAN_4WAY)
-  init_xevan_4way_ctx();
+//  init_xevan_4way_ctx();
  gate->scanhash  = (void*)&scanhash_xevan_4way;
  gate->hash      = (void*)&xevan_4way_hash;
 #else
--- a/algo/x17/xevan-gate.h
+++ b/algo/x17/xevan-gate.h
@@ -15,16 +15,16 @@ bool register_xevan_algo( algo_gate_t* gate );
 void xevan_4way_hash( void *state, const void *input );

 int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+                       uint64_t *hashes_done, struct thr_info *mythr );

-void init_xevan_4way_ctx();
+//void init_xevan_4way_ctx();

 #endif

 void xevan_hash( void *state, const void *input );

 int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done );
+                  uint64_t *hashes_done, struct thr_info *mythr );

 void init_xevan_ctx();

--- a/algo/x17/xevan.c
+++ b/algo/x17/xevan.c
@@ -230,12 +230,14 @@ void xevan_hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
+	            uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(64) hash[8];
 	uint32_t _ALIGN(64) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
+        /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/yescrypt/sha256_Y.c
+++ b/algo/yescrypt/sha256_Y.c
@@ -290,7 +290,7 @@ SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx)

 /* Initialize an HMAC-SHA256 operation with the given key. */
 void
-HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
+HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen)
 {
 	unsigned char pad[64];
 	unsigned char khash[32];
@@ -326,7 +326,7 @@ HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)

 /* Add bytes to the HMAC-SHA256 operation. */
 void
-HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
+HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len)
 {

 	/* Feed data to the inner SHA256 operation. */
@@ -335,7 +335,7 @@ HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)

 /* Finish an HMAC-SHA256 operation. */
 void
-HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx)
+HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx)
 {
 	unsigned char ihash[32];

@@ -361,7 +361,7 @@ void
 PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
 {
-	HMAC_SHA256_CTX PShctx, hctx;
+	HMAC_SHA256_CTX_Y PShctx, hctx;
 	uint8_t _ALIGN(128) T[32];
 	uint8_t _ALIGN(128) U[32];
 	uint8_t ivec[4];
@@ -370,8 +370,8 @@ PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
 	int k;

 	/* Compute HMAC state after processing P and S. */
-	HMAC_SHA256_Init(&PShctx, passwd, passwdlen);
-	HMAC_SHA256_Update(&PShctx, salt, saltlen);
+	HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen);
+	HMAC_SHA256_Update_Y(&PShctx, salt, saltlen);

 	/* Iterate through the blocks. */
 	for (i = 0; i * 32 < dkLen; i++) {
@@ -379,18 +379,18 @@ PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
 		be32enc(ivec, (uint32_t)(i + 1));

 		/* Compute U_1 = PRF(P, S || INT(i)). */
-		memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
-		HMAC_SHA256_Update(&hctx, ivec, 4);
-		HMAC_SHA256_Final(U, &hctx);
+		memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y));
+		HMAC_SHA256_Update_Y(&hctx, ivec, 4);
+		HMAC_SHA256_Final_Y(U, &hctx);

 		/* T_i = U_1 ... */
 		memcpy(T, U, 32);

 		for (j = 2; j <= c; j++) {
 			/* Compute U_j. */
-			HMAC_SHA256_Init(&hctx, passwd, passwdlen);
-			HMAC_SHA256_Update(&hctx, U, 32);
-			HMAC_SHA256_Final(U, &hctx);
+			HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen);
+			HMAC_SHA256_Update_Y(&hctx, U, 32);
+			HMAC_SHA256_Final_Y(U, &hctx);

 			/* ... xor U_j ... */
 			for (k = 0; k < 32; k++)
--- a/algo/yescrypt/sha256_Y.h
+++ b/algo/yescrypt/sha256_Y.h
@@ -49,14 +49,14 @@ typedef struct HMAC_SHA256Context {
 typedef struct HMAC_SHA256Context {
        SHA256_CTX ictx;
        SHA256_CTX octx;
-} HMAC_SHA256_CTX;
+} HMAC_SHA256_CTX_Y;

 void	SHA256_Init_Y(SHA256_CTX_Y *);
 void	SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t);
 void	SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *);
-void	HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
-void	HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
-void	HMAC_SHA256_Final(unsigned char [32], HMAC_SHA256_CTX *);
+void	HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
+void	HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
+void	HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *);

 /**
 * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
--- a/algo/yescrypt/yescrypt-simd.c
+++ b/algo/yescrypt/yescrypt-simd.c
@@ -1354,14 +1354,14 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
 	if ((t || flags) && buflen == sizeof(sha256)) {
 	   /* Compute ClientKey */
 	   {
-		HMAC_SHA256_CTX ctx;
-		HMAC_SHA256_Init(&ctx, buf, buflen);
+		HMAC_SHA256_CTX_Y ctx;
+		HMAC_SHA256_Init_Y(&ctx, buf, buflen);
                if ( yescrypt_client_key )
-                    HMAC_SHA256_Update( &ctx, (uint8_t*)yescrypt_client_key,
+                    HMAC_SHA256_Update_Y( &ctx, (uint8_t*)yescrypt_client_key,
                                        yescrypt_client_key_len );
                else
-                    HMAC_SHA256_Update( &ctx, salt, saltlen );
-		HMAC_SHA256_Final(sha256, &ctx);
+                    HMAC_SHA256_Update_Y( &ctx, salt, saltlen );
+		HMAC_SHA256_Final_Y(sha256, &ctx);
 	   }
 	   /* Compute StoredKey */
 	   {
--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -383,7 +383,7 @@ void yescrypthash(void *output, const void *input)
 }

 int scanhash_yescrypt( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done )
+                       uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t _ALIGN(64) vhash[8];
        uint32_t _ALIGN(64) endiandata[20];
@@ -393,6 +393,7 @@ int scanhash_yescrypt( int thr_id, struct work *work, uint32_t max_nonce,
        const uint32_t Htarg = ptarget[7];
        const uint32_t first_nonce = pdata[19];
        uint32_t n = first_nonce;
+        /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

        for (int k = 0; k < 19; k++)
                be32enc(&endiandata[k], pdata[k]);
--- a/algo/yespower/sha256-avx2.c
+++ b/algo/yespower/sha256-avx2.c
@@ -1,646 +0,0 @@
-/*-
- * Copyright 2005-2016 Colin Percival
- * Copyright 2016-2018 Alexander Peslyak
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <assert.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "insecure_memzero.h"
-#include "sysendian.h"
-
-#include "sha256.h"
-
-#ifdef __ICC
-/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
-#define restrict
-#elif __STDC_VERSION__ >= 199901L
-/* Have restrict */
-#elif defined(__GNUC__)
-#define restrict __restrict
-#else
-#define restrict
-#endif
-
-/*
- * Encode a length len*2 vector of (uint32_t) into a length len*8 vector of
- * (uint8_t) in big-endian form.
- */
-static void
-be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
-{
-
-	/* Encode vector, two words at a time. */
-	do {
-		be32enc(&dst[0], src[0]);
-		be32enc(&dst[4], src[1]);
-		src += 2;
-		dst += 8;
-	} while (--len);
-}
-
-/*
- * Decode a big-endian length len*8 vector of (uint8_t) into a length
- * len*2 vector of (uint32_t).
- */
-static void
-be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
-{
-
-	/* Decode vector, two words at a time. */
-	do {
-		dst[0] = be32dec(&src[0]);
-		dst[1] = be32dec(&src[4]);
-		src += 8;
-		dst += 2;
-	} while (--len);
-}
-
-/* SHA256 round constants. */
-static const uint32_t Krnd[64] = {
-	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-/* Elementary functions used by SHA256 */
-#define Ch(x, y, z)	((x & (y ^ z)) ^ z)
-#define Maj(x, y, z)	((x & (y | z)) | (y & z))
-#define SHR(x, n)	(x >> n)
-#define ROTR(x, n)	((x >> n) | (x << (32 - n)))
-#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
-#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
-#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
-#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
-
-/* SHA256 round function */
-#define RND(a, b, c, d, e, f, g, h, k)			\
-	h += S1(e) + Ch(e, f, g) + k;			\
-	d += h;						\
-	h += S0(a) + Maj(a, b, c);
-
-/* Adjusted round function for rotating state */
-#define RNDr(S, W, i, ii)			\
-	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
-	    S[(66 - i) % 8], S[(67 - i) % 8],	\
-	    S[(68 - i) % 8], S[(69 - i) % 8],	\
-	    S[(70 - i) % 8], S[(71 - i) % 8],	\
-	    W[i + ii] + Krnd[i + ii])
-
-/* Message schedule computation */
-#define MSCH(W, ii, i)				\
-	W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii]
-
-/*
- * SHA256 block compression function.  The 256-bit state is transformed via
- * the 512-bit input block to produce a new state.
- */
-static void
-SHA256_Transform(uint32_t state[static restrict 8],
-    const uint8_t block[static restrict 64],
-    uint32_t W[static restrict 64], uint32_t S[static restrict 8])
-{
-	int i;
-
-	/* 1. Prepare the first part of the message schedule W. */
-	be32dec_vect(W, block, 8);
-
-	/* 2. Initialize working variables. */
-	memcpy(S, state, 32);
-
-	/* 3. Mix. */
-	for (i = 0; i < 64; i += 16) {
-		RNDr(S, W, 0, i);
-		RNDr(S, W, 1, i);
-		RNDr(S, W, 2, i);
-		RNDr(S, W, 3, i);
-		RNDr(S, W, 4, i);
-		RNDr(S, W, 5, i);
-		RNDr(S, W, 6, i);
-		RNDr(S, W, 7, i);
-		RNDr(S, W, 8, i);
-		RNDr(S, W, 9, i);
-		RNDr(S, W, 10, i);
-		RNDr(S, W, 11, i);
-		RNDr(S, W, 12, i);
-		RNDr(S, W, 13, i);
-		RNDr(S, W, 14, i);
-		RNDr(S, W, 15, i);
-
-		if (i == 48)
-			break;
-		MSCH(W, 0, i);
-		MSCH(W, 1, i);
-		MSCH(W, 2, i);
-		MSCH(W, 3, i);
-		MSCH(W, 4, i);
-		MSCH(W, 5, i);
-		MSCH(W, 6, i);
-		MSCH(W, 7, i);
-		MSCH(W, 8, i);
-		MSCH(W, 9, i);
-		MSCH(W, 10, i);
-		MSCH(W, 11, i);
-		MSCH(W, 12, i);
-		MSCH(W, 13, i);
-		MSCH(W, 14, i);
-		MSCH(W, 15, i);
-	}
-
-	/* 4. Mix local working variables into global state. */
-	state[0] += S[0];
-	state[1] += S[1];
-	state[2] += S[2];
-	state[3] += S[3];
-	state[4] += S[4];
-	state[5] += S[5];
-	state[6] += S[6];
-	state[7] += S[7];
-}
-
-static const uint8_t PAD[64] = {
-	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-/* Add padding and terminating bit-count. */
-static void
-SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
-{
-	size_t r;
-
-	/* Figure out how many bytes we have buffered. */
-	r = (ctx->count >> 3) & 0x3f;
-
-	/* Pad to 56 mod 64, transforming if we finish a block en route. */
-	if (r < 56) {
-		/* Pad to 56 mod 64. */
-		memcpy(&ctx->buf[r], PAD, 56 - r);
-	} else {
-		/* Finish the current block and mix. */
-		memcpy(&ctx->buf[r], PAD, 64 - r);
-		SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
-
-		/* The start of the final block is all zeroes. */
-		memset(&ctx->buf[0], 0, 56);
-	}
-
-	/* Add the terminating bit-count. */
-	be64enc(&ctx->buf[56], ctx->count);
-
-	/* Mix in the final block. */
-	SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
-}
-
-/* Magic initialization constants. */
-static const uint32_t initial_state[8] = {
-	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-};
-
-/**
- * SHA256_Init(ctx):
- * Initialize the SHA256 context ${ctx}.
- */
-void
-SHA256_Init(SHA256_CTX * ctx)
-{
-
-	/* Zero bits processed so far. */
-	ctx->count = 0;
-
-	/* Initialize state. */
-	memcpy(ctx->state, initial_state, sizeof(initial_state));
-}
-
-/**
- * SHA256_Update(ctx, in, len):
- * Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
- */
-static void
-_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
-    uint32_t tmp32[static restrict 72])
-{
-	uint32_t r;
-	const uint8_t * src = in;
-
-	/* Return immediately if we have nothing to do. */
-	if (len == 0)
-		return;
-
-	/* Number of bytes left in the buffer from previous updates. */
-	r = (ctx->count >> 3) & 0x3f;
-
-	/* Update number of bits. */
-	ctx->count += (uint64_t)(len) << 3;
-
-	/* Handle the case where we don't need to perform any transforms. */
-	if (len < 64 - r) {
-		memcpy(&ctx->buf[r], src, len);
-		return;
-	}
-
-	/* Finish the current block. */
-	memcpy(&ctx->buf[r], src, 64 - r);
-	SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
-	src += 64 - r;
-	len -= 64 - r;
-
-	/* Perform complete blocks. */
-	while (len >= 64) {
-		SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]);
-		src += 64;
-		len -= 64;
-	}
-
-	/* Copy left over data into buffer. */
-	memcpy(ctx->buf, src, len);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
-{
-	uint32_t tmp32[72];
-
-	/* Call the real function. */
-	_SHA256_Update(ctx, in, len, tmp32);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-}
-
-/**
- * SHA256_Final(digest, ctx):
- * Output the SHA256 hash of the data input to the context ${ctx} into the
- * buffer ${digest}.
- */
-static void
-_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
-    uint32_t tmp32[static restrict 72])
-{
-
-	/* Add padding. */
-	SHA256_Pad(ctx, tmp32);
-
-	/* Write the hash. */
-	be32enc_vect(digest, ctx->state, 4);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
-{
-	uint32_t tmp32[72];
-
-	/* Call the real function. */
-	_SHA256_Final(digest, ctx, tmp32);
-
-	/* Clear the context state. */
-	insecure_memzero(ctx, sizeof(SHA256_CTX));
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-}
-
-/**
- * SHA256_Buf(in, len, digest):
- * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
- */
-void
-SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
-{
-	SHA256_CTX ctx;
-	uint32_t tmp32[72];
-
-	SHA256_Init(&ctx);
-	_SHA256_Update(&ctx, in, len, tmp32);
-	_SHA256_Final(digest, &ctx, tmp32);
-
-	/* Clean the stack. */
-	insecure_memzero(&ctx, sizeof(SHA256_CTX));
-	insecure_memzero(tmp32, 288);
-}
-
-/**
- * HMAC_SHA256_Init(ctx, K, Klen):
- * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
- * ${K}.
- */
-static void
-_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
-    uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
-    uint8_t khash[static restrict 32])
-{
-	const uint8_t * K = _K;
-	size_t i;
-
-	/* If Klen > 64, the key is really SHA256(K). */
-	if (Klen > 64) {
-		SHA256_Init(&ctx->ictx);
-		_SHA256_Update(&ctx->ictx, K, Klen, tmp32);
-		_SHA256_Final(khash, &ctx->ictx, tmp32);
-		K = khash;
-		Klen = 32;
-	}
-
-	/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-	SHA256_Init(&ctx->ictx);
-	memset(pad, 0x36, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-	_SHA256_Update(&ctx->ictx, pad, 64, tmp32);
-
-	/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-	SHA256_Init(&ctx->octx);
-	memset(pad, 0x5c, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-	_SHA256_Update(&ctx->octx, pad, 64, tmp32);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
-{
-	uint32_t tmp32[72];
-	uint8_t pad[64];
-	uint8_t khash[32];
-
-	/* Call the real function. */
-	_HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(khash, 32);
-	insecure_memzero(pad, 64);
-}
-
-/**
- * HMAC_SHA256_Update(ctx, in, len):
- * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
- */
-static void
-_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
-    uint32_t tmp32[static restrict 72])
-{
-
-	/* Feed data to the inner SHA256 operation. */
-	_SHA256_Update(&ctx->ictx, in, len, tmp32);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
-{
-	uint32_t tmp32[72];
-
-	/* Call the real function. */
-	_HMAC_SHA256_Update(ctx, in, len, tmp32);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-}
-
-/**
- * HMAC_SHA256_Final(digest, ctx):
- * Output the HMAC-SHA256 of the data input to the context ${ctx} into the
- * buffer ${digest}.
- */
-static void
-_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
-    uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
-{
-
-	/* Finish the inner SHA256 operation. */
-	_SHA256_Final(ihash, &ctx->ictx, tmp32);
-
-	/* Feed the inner hash to the outer SHA256 operation. */
-	_SHA256_Update(&ctx->octx, ihash, 32, tmp32);
-
-	/* Finish the outer SHA256 operation. */
-	_SHA256_Final(digest, &ctx->octx, tmp32);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
-{
-	uint32_t tmp32[72];
-	uint8_t ihash[32];
-
-	/* Call the real function. */
-	_HMAC_SHA256_Final(digest, ctx, tmp32, ihash);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(ihash, 32);
-}
-
-/**
- * HMAC_SHA256_Buf(K, Klen, in, len, digest):
- * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
- * length ${Klen}, and write the result to ${digest}.
- */
-void
-HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
-    uint8_t digest[32])
-{
-	HMAC_SHA256_CTX ctx;
-	uint32_t tmp32[72];
-	uint8_t tmp8[96];
-
-	_HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]);
-	_HMAC_SHA256_Update(&ctx, in, len, tmp32);
-	_HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]);
-
-	/* Clean the stack. */
-	insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(tmp8, 96);
-}
-
-/* Add padding and terminating bit-count, but don't invoke Transform yet. */
-static int
-SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
-    uint32_t tmp32[static restrict 72])
-{
-	uint32_t r;
-
-	r = (ctx->count >> 3) & 0x3f;
-	if (r >= 56)
-		return -1;
-
-	/*
-	 * Convert length to a vector of bytes -- we do this now rather
-	 * than later because the length will change after we pad.
-	 */
-	be64enc(len, ctx->count);
-
-	/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
-	_SHA256_Update(ctx, PAD, 56 - r, tmp32);
-
-	/* Add the terminating bit-count. */
-	ctx->buf[63] = len[7];
-	_SHA256_Update(ctx, len, 7, tmp32);
-
-	return 0;
-}
-
-/**
- * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
- * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
- * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
- */
-void
-PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
-    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
-{
-	HMAC_SHA256_CTX Phctx, PShctx, hctx;
-	uint32_t tmp32[72];
-	union {
-		uint8_t tmp8[96];
-		uint32_t state[8];
-	} u;
-	size_t i;
-	uint8_t ivec[4];
-	uint8_t U[32];
-	uint8_t T[32];
-	uint64_t j;
-	int k;
-	size_t clen;
-
-	/* Sanity-check. */
-	assert(dkLen <= 32 * (size_t)(UINT32_MAX));
-
-	if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
-		uint32_t oldcount;
-		uint8_t * ivecp;
-
-		/* Compute HMAC state after processing P and S. */
-		_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
-		    tmp32, &u.tmp8[0], &u.tmp8[64]);
-		_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
-
-		/* Prepare ictx padding. */
-		oldcount = hctx.ictx.count & (0x3f << 3);
-		_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
-		if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
-		    SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
-			goto generic; /* Can't happen due to saltlen check */
-		ivecp = hctx.ictx.buf + (oldcount >> 3);
-
-		/* Prepare octx padding. */
-		hctx.octx.count += 32 << 3;
-		SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
-
-		/* Iterate through the blocks. */
-		for (i = 0; i * 32 < dkLen; i++) {
-			/* Generate INT(i + 1). */
-			be32enc(ivecp, (uint32_t)(i + 1));
-
-			/* Compute U_1 = PRF(P, S || INT(i)). */
-			memcpy(u.state, hctx.ictx.state, sizeof(u.state));
-			SHA256_Transform(u.state, hctx.ictx.buf,
-			    &tmp32[0], &tmp32[64]);
-			be32enc_vect(hctx.octx.buf, u.state, 4);
-			memcpy(u.state, hctx.octx.state, sizeof(u.state));
-			SHA256_Transform(u.state, hctx.octx.buf,
-			    &tmp32[0], &tmp32[64]);
-			be32enc_vect(&buf[i * 32], u.state, 4);
-		}
-
-		goto cleanup;
-	}
-
-generic:
-	/* Compute HMAC state after processing P. */
-	_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
-	    tmp32, &u.tmp8[0], &u.tmp8[64]);
-
-	/* Compute HMAC state after processing P and S. */
-	memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
-	_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
-
-	/* Iterate through the blocks. */
-	for (i = 0; i * 32 < dkLen; i++) {
-		/* Generate INT(i + 1). */
-		be32enc(ivec, (uint32_t)(i + 1));
-
-		/* Compute U_1 = PRF(P, S || INT(i)). */
-		memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
-		_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
-		_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
-
-		if (c > 1) {
-			/* T_i = U_1 ... */
-			memcpy(U, T, 32);
-
-			for (j = 2; j <= c; j++) {
-				/* Compute U_j. */
-				memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
-				_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
-				_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
-
-				/* ... xor U_j ... */
-				for (k = 0; k < 32; k++)
-					T[k] ^= U[k];
-			}
-		}
-
-		/* Copy as many bytes as necessary into buf. */
-		clen = dkLen - i * 32;
-		if (clen > 32)
-			clen = 32;
-		memcpy(&buf[i * 32], T, clen);
-	}
-
-	/* Clean the stack. */
-	insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(U, 32);
-	insecure_memzero(T, 32);
-
-cleanup:
-	insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(&u, sizeof(u));
-}
--- a/algo/yespower/sha256.c
+++ b/algo/yespower/sha256.c
@@ -1,680 +0,0 @@
-/*-
- * Copyright 2005-2016 Colin Percival
- * Copyright 2016-2018 Alexander Peslyak
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <assert.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "insecure_memzero.h"
-#include "sysendian.h"
-
-#include "sha256.h"
-#include "avxdefs.h"
-
-#ifdef __ICC
-/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
-#define restrict
-#elif __STDC_VERSION__ >= 199901L
-/* Have restrict */
-#elif defined(__GNUC__)
-#define restrict __restrict
-#else
-#define restrict
-#endif
-
-/*
- * Encode a length len*2 vector of (uint32_t) into a length len*8 vector of
- * (uint8_t) in big-endian form.
- */
-static void
-be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
-{
-
-	/* Encode vector, two words at a time. */
-	do {
-		be32enc(&dst[0], src[0]);
-		be32enc(&dst[4], src[1]);
-		src += 2;
-		dst += 8;
-	} while (--len);
-}
-
-/*
- * Decode a big-endian length len*8 vector of (uint8_t) into a length
- * len*2 vector of (uint32_t).
- */
-static void
-be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
-{
-
-	/* Decode vector, two words at a time. */
-	do {
-		dst[0] = be32dec(&src[0]);
-		dst[1] = be32dec(&src[4]);
-		src += 8;
-		dst += 2;
-	} while (--len);
-}
-
-/* SHA256 round constants. */
-static const uint32_t Krnd[64] = {
-	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-/* Elementary functions used by SHA256 */
-#define Ch(x, y, z)	((x & (y ^ z)) ^ z)
-#define Maj(x, y, z)	((x & (y | z)) | (y & z))
-#define SHR(x, n)	(x >> n)
-#define ROTR(x, n)	((x >> n) | (x << (32 - n)))
-#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
-#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
-#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
-#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
-
-#if 0    //defined(__SHA__)
-
-// ABEF = _mm_sha256rnds2_epu32( CDGH, ABEF, k )
-//_mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k)
-// b = { ABEF }   a = { CDGH }
-//
-//a = _mm_set_epi32( S[(66 - i) % 8], S[(67 - i) % 8],
-//                 S[(70 - i) % 8], S[(71 - i) % 8] );
-//b = _mm_set_epi32( S[(64 - i) % 8], S[(65 - i) % 8],
-//                 S[(68 - i) % 8], S[(69 - i) % 8] );
-//k = _mm_set1_epi32( W[i + ii] + Krnd[i + ii] )
-// _mm_sha256rnds2_epu32(a,b,k)
-
-#define RNDr( S, W, i, ii ) do \
-{ \
-uint32_t abef[4]; \
-  __m128i ABEF =  _mm_set_epi32( S[(66 - i) % 8], S[(67 - i) % 8], \
-                                 S[(70 - i) % 8], S[(71 - i) % 8] ); \
-  __m128i CDGH =  _mm_set_epi32( S[(64 - i) % 8], S[(65 - i) % 8], \
-                                 S[(68 - i) % 8], S[(69 - i) % 8] ); \
-  __m128i    K =  _mm_set1_epi32( W[i + ii] + Krnd[i + ii] ); \
-  casti_m128i( abef, 0 )  = _mm_sha256rnds2_epu32( CDGH, ABEF, K ); \
-  S[(66 - i) % 8] = abef[3]; \
-  S[(67 - i) % 8] = abef[2]; \
-  S[(64 - i) % 8] = abef[1]; \
-  S[(65 - i) % 8] = abef[0]; \
-} while(0)
-
-#else
-
-/* SHA256 round function */
-
-#define RND(a, b, c, d, e, f, g, h, k)			\
-	h += S1(e) + Ch(e, f, g) + k;			\
-	d += h;						\
-	h += S0(a) + Maj(a, b, c);
-
-/* Adjusted round function for rotating state */
-#define RNDr(S, W, i, ii)			\
-	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
-	    S[(66 - i) % 8], S[(67 - i) % 8],	\
-	    S[(68 - i) % 8], S[(69 - i) % 8],	\
-	    S[(70 - i) % 8], S[(71 - i) % 8],	\
-	    W[i + ii] + Krnd[i + ii])
-
-#endif
-
-/* Message schedule computation */
-#define MSCH(W, ii, i)				\
-	W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii]
-
-/*
- * SHA256 block compression function.  The 256-bit state is transformed via
- * the 512-bit input block to produce a new state.
- */
-static void
-SHA256_Transform(uint32_t state[static restrict 8],
-    const uint8_t block[static restrict 64],
-    uint32_t W[static restrict 64], uint32_t S[static restrict 8])
-{
-	int i;
-
-	/* 1. Prepare the first part of the message schedule W. */
-	be32dec_vect(W, block, 8);
-
-	/* 2. Initialize working variables. */
-	memcpy(S, state, 32);
-
-	/* 3. Mix. */
-	for (i = 0; i < 64; i += 16) {
-		RNDr(S, W, 0, i);
-		RNDr(S, W, 1, i);
-		RNDr(S, W, 2, i);
-		RNDr(S, W, 3, i);
-		RNDr(S, W, 4, i);
-		RNDr(S, W, 5, i);
-		RNDr(S, W, 6, i);
-		RNDr(S, W, 7, i);
-		RNDr(S, W, 8, i);
-		RNDr(S, W, 9, i);
-		RNDr(S, W, 10, i);
-		RNDr(S, W, 11, i);
-		RNDr(S, W, 12, i);
-		RNDr(S, W, 13, i);
-		RNDr(S, W, 14, i);
-		RNDr(S, W, 15, i);
-
-		if (i == 48)
-			break;
-		MSCH(W, 0, i);
-		MSCH(W, 1, i);
-		MSCH(W, 2, i);
-		MSCH(W, 3, i);
-		MSCH(W, 4, i);
-		MSCH(W, 5, i);
-		MSCH(W, 6, i);
-		MSCH(W, 7, i);
-		MSCH(W, 8, i);
-		MSCH(W, 9, i);
-		MSCH(W, 10, i);
-		MSCH(W, 11, i);
-		MSCH(W, 12, i);
-		MSCH(W, 13, i);
-		MSCH(W, 14, i);
-		MSCH(W, 15, i);
-	}
-
-	/* 4. Mix local working variables into global state. */
-	state[0] += S[0];
-	state[1] += S[1];
-	state[2] += S[2];
-	state[3] += S[3];
-	state[4] += S[4];
-	state[5] += S[5];
-	state[6] += S[6];
-	state[7] += S[7];
-}
-
-static const uint8_t PAD[64] = {
-	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-/* Add padding and terminating bit-count. */
-static void
-SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
-{
-	size_t r;
-
-	/* Figure out how many bytes we have buffered. */
-	r = (ctx->count >> 3) & 0x3f;
-
-	/* Pad to 56 mod 64, transforming if we finish a block en route. */
-	if (r < 56) {
-		/* Pad to 56 mod 64. */
-		memcpy(&ctx->buf[r], PAD, 56 - r);
-	} else {
-		/* Finish the current block and mix. */
-		memcpy(&ctx->buf[r], PAD, 64 - r);
-		SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
-
-		/* The start of the final block is all zeroes. */
-		memset(&ctx->buf[0], 0, 56);
-	}
-
-	/* Add the terminating bit-count. */
-	be64enc(&ctx->buf[56], ctx->count);
-
-	/* Mix in the final block. */
-	SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
-}
-
-/* Magic initialization constants. */
-static const uint32_t initial_state[8] = {
-	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-};
-
-/**
- * SHA256_Init(ctx):
- * Initialize the SHA256 context ${ctx}.
- */
-void
-SHA256_Init(SHA256_CTX * ctx)
-{
-
-	/* Zero bits processed so far. */
-	ctx->count = 0;
-
-	/* Initialize state. */
-	memcpy(ctx->state, initial_state, sizeof(initial_state));
-}
-
-/**
- * SHA256_Update(ctx, in, len):
- * Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
- */
-static void
-_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
-    uint32_t tmp32[static restrict 72])
-{
-	uint32_t r;
-	const uint8_t * src = in;
-
-	/* Return immediately if we have nothing to do. */
-	if (len == 0)
-		return;
-
-	/* Number of bytes left in the buffer from previous updates. */
-	r = (ctx->count >> 3) & 0x3f;
-
-	/* Update number of bits. */
-	ctx->count += (uint64_t)(len) << 3;
-
-	/* Handle the case where we don't need to perform any transforms. */
-	if (len < 64 - r) {
-		memcpy(&ctx->buf[r], src, len);
-		return;
-	}
-
-	/* Finish the current block. */
-	memcpy(&ctx->buf[r], src, 64 - r);
-	SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
-	src += 64 - r;
-	len -= 64 - r;
-
-	/* Perform complete blocks. */
-	while (len >= 64) {
-		SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]);
-		src += 64;
-		len -= 64;
-	}
-
-	/* Copy left over data into buffer. */
-	memcpy(ctx->buf, src, len);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
-{
-	uint32_t tmp32[72];
-
-	/* Call the real function. */
-	_SHA256_Update(ctx, in, len, tmp32);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-}
-
-/**
- * SHA256_Final(digest, ctx):
- * Output the SHA256 hash of the data input to the context ${ctx} into the
- * buffer ${digest}.
- */
-static void
-_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
-    uint32_t tmp32[static restrict 72])
-{
-
-	/* Add padding. */
-	SHA256_Pad(ctx, tmp32);
-
-	/* Write the hash. */
-	be32enc_vect(digest, ctx->state, 4);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
-{
-	uint32_t tmp32[72];
-
-	/* Call the real function. */
-	_SHA256_Final(digest, ctx, tmp32);
-
-	/* Clear the context state. */
-	insecure_memzero(ctx, sizeof(SHA256_CTX));
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-}
-
-/**
- * SHA256_Buf(in, len, digest):
- * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
- */
-void
-SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
-{
-	SHA256_CTX ctx;
-	uint32_t tmp32[72];
-
-	SHA256_Init(&ctx);
-	_SHA256_Update(&ctx, in, len, tmp32);
-	_SHA256_Final(digest, &ctx, tmp32);
-
-	/* Clean the stack. */
-	insecure_memzero(&ctx, sizeof(SHA256_CTX));
-	insecure_memzero(tmp32, 288);
-}
-
-/**
- * HMAC_SHA256_Init(ctx, K, Klen):
- * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
- * ${K}.
- */
-static void
-_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
-    uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
-    uint8_t khash[static restrict 32])
-{
-	const uint8_t * K = _K;
-	size_t i;
-
-	/* If Klen > 64, the key is really SHA256(K). */
-	if (Klen > 64) {
-		SHA256_Init(&ctx->ictx);
-		_SHA256_Update(&ctx->ictx, K, Klen, tmp32);
-		_SHA256_Final(khash, &ctx->ictx, tmp32);
-		K = khash;
-		Klen = 32;
-	}
-
-	/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-	SHA256_Init(&ctx->ictx);
-	memset(pad, 0x36, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-	_SHA256_Update(&ctx->ictx, pad, 64, tmp32);
-
-	/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-	SHA256_Init(&ctx->octx);
-	memset(pad, 0x5c, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-	_SHA256_Update(&ctx->octx, pad, 64, tmp32);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
-{
-	uint32_t tmp32[72];
-	uint8_t pad[64];
-	uint8_t khash[32];
-
-	/* Call the real function. */
-	_HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(khash, 32);
-	insecure_memzero(pad, 64);
-}
-
-/**
- * HMAC_SHA256_Update(ctx, in, len):
- * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
- */
-static void
-_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
-    uint32_t tmp32[static restrict 72])
-{
-
-	/* Feed data to the inner SHA256 operation. */
-	_SHA256_Update(&ctx->ictx, in, len, tmp32);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
-{
-	uint32_t tmp32[72];
-
-	/* Call the real function. */
-	_HMAC_SHA256_Update(ctx, in, len, tmp32);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-}
-
-/**
- * HMAC_SHA256_Final(digest, ctx):
- * Output the HMAC-SHA256 of the data input to the context ${ctx} into the
- * buffer ${digest}.
- */
-static void
-_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
-    uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
-{
-
-	/* Finish the inner SHA256 operation. */
-	_SHA256_Final(ihash, &ctx->ictx, tmp32);
-
-	/* Feed the inner hash to the outer SHA256 operation. */
-	_SHA256_Update(&ctx->octx, ihash, 32, tmp32);
-
-	/* Finish the outer SHA256 operation. */
-	_SHA256_Final(digest, &ctx->octx, tmp32);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
-{
-	uint32_t tmp32[72];
-	uint8_t ihash[32];
-
-	/* Call the real function. */
-	_HMAC_SHA256_Final(digest, ctx, tmp32, ihash);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(ihash, 32);
-}
-
-/**
- * HMAC_SHA256_Buf(K, Klen, in, len, digest):
- * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
- * length ${Klen}, and write the result to ${digest}.
- */
-void
-HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
-    uint8_t digest[32])
-{
-	HMAC_SHA256_CTX ctx;
-	uint32_t tmp32[72];
-	uint8_t tmp8[96];
-
-	_HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]);
-	_HMAC_SHA256_Update(&ctx, in, len, tmp32);
-	_HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]);
-
-	/* Clean the stack. */
-	insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(tmp8, 96);
-}
-
-/* Add padding and terminating bit-count, but don't invoke Transform yet. */
-static int
-SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
-    uint32_t tmp32[static restrict 72])
-{
-	uint32_t r;
-
-	r = (ctx->count >> 3) & 0x3f;
-	if (r >= 56)
-		return -1;
-
-	/*
-	 * Convert length to a vector of bytes -- we do this now rather
-	 * than later because the length will change after we pad.
-	 */
-	be64enc(len, ctx->count);
-
-	/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
-	_SHA256_Update(ctx, PAD, 56 - r, tmp32);
-
-	/* Add the terminating bit-count. */
-	ctx->buf[63] = len[7];
-	_SHA256_Update(ctx, len, 7, tmp32);
-
-	return 0;
-}
-
-/**
- * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
- * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
- * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
- */
-void
-PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
-    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
-{
-	HMAC_SHA256_CTX Phctx, PShctx, hctx;
-	uint32_t tmp32[72];
-	union {
-		uint8_t tmp8[96];
-		uint32_t state[8];
-	} u;
-	size_t i;
-	uint8_t ivec[4];
-	uint8_t U[32];
-	uint8_t T[32];
-	uint64_t j;
-	int k;
-	size_t clen;
-
-	/* Sanity-check. */
-	assert(dkLen <= 32 * (size_t)(UINT32_MAX));
-
-	if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
-		uint32_t oldcount;
-		uint8_t * ivecp;
-
-		/* Compute HMAC state after processing P and S. */
-		_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
-		    tmp32, &u.tmp8[0], &u.tmp8[64]);
-		_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
-
-		/* Prepare ictx padding. */
-		oldcount = hctx.ictx.count & (0x3f << 3);
-		_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
-		if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
-		    SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
-			goto generic; /* Can't happen due to saltlen check */
-		ivecp = hctx.ictx.buf + (oldcount >> 3);
-
-		/* Prepare octx padding. */
-		hctx.octx.count += 32 << 3;
-		SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
-
-		/* Iterate through the blocks. */
-		for (i = 0; i * 32 < dkLen; i++) {
-			/* Generate INT(i + 1). */
-			be32enc(ivecp, (uint32_t)(i + 1));
-
-			/* Compute U_1 = PRF(P, S || INT(i)). */
-			memcpy(u.state, hctx.ictx.state, sizeof(u.state));
-			SHA256_Transform(u.state, hctx.ictx.buf,
-			    &tmp32[0], &tmp32[64]);
-			be32enc_vect(hctx.octx.buf, u.state, 4);
-			memcpy(u.state, hctx.octx.state, sizeof(u.state));
-			SHA256_Transform(u.state, hctx.octx.buf,
-			    &tmp32[0], &tmp32[64]);
-			be32enc_vect(&buf[i * 32], u.state, 4);
-		}
-
-		goto cleanup;
-	}
-
-generic:
-	/* Compute HMAC state after processing P. */
-	_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
-	    tmp32, &u.tmp8[0], &u.tmp8[64]);
-
-	/* Compute HMAC state after processing P and S. */
-	memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
-	_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
-
-	/* Iterate through the blocks. */
-	for (i = 0; i * 32 < dkLen; i++) {
-		/* Generate INT(i + 1). */
-		be32enc(ivec, (uint32_t)(i + 1));
-
-		/* Compute U_1 = PRF(P, S || INT(i)). */
-		memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
-		_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
-		_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
-
-		if (c > 1) {
-			/* T_i = U_1 ... */
-			memcpy(U, T, 32);
-
-			for (j = 2; j <= c; j++) {
-				/* Compute U_j. */
-				memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
-				_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
-				_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
-
-				/* ... xor U_j ... */
-				for (k = 0; k < 32; k++)
-					T[k] ^= U[k];
-			}
-		}
-
-		/* Copy as many bytes as necessary into buf. */
-		clen = dkLen - i * 32;
-		if (clen > 32)
-			clen = 32;
-		memcpy(&buf[i * 32], T, clen);
-	}
-
-	/* Clean the stack. */
-	insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(U, 32);
-	insecure_memzero(T, 32);
-
-cleanup:
-	insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(&u, sizeof(u));
-}
--- a/algo/yespower/sha256.c.new
+++ b/algo/yespower/sha256.c.new
@@ -1,672 +0,0 @@
-/*-
- * Copyright 2005-2016 Colin Percival
- * Copyright 2016-2018 Alexander Peslyak
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <assert.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "insecure_memzero.h"
-#include "sysendian.h"
-
-#include "sha256.h"
-
-#ifdef __ICC
-/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
-#define restrict
-#elif __STDC_VERSION__ >= 199901L
-/* Have restrict */
-#elif defined(__GNUC__)
-#define restrict __restrict
-#else
-#define restrict
-#endif
-
-/*
- * Encode a length len*2 vector of (uint32_t) into a length len*8 vector of
- * (uint8_t) in big-endian form.
- */
-static void
-be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
-{
-
-	/* Encode vector, two words at a time. */
-	do {
-		be32enc(&dst[0], src[0]);
-		be32enc(&dst[4], src[1]);
-		src += 2;
-		dst += 8;
-	} while (--len);
-}
-
-/*
- * Decode a big-endian length len*8 vector of (uint8_t) into a length
- * len*2 vector of (uint32_t).
- */
-static void
-be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
-{
-
-	/* Decode vector, two words at a time. */
-	do {
-		dst[0] = be32dec(&src[0]);
-		dst[1] = be32dec(&src[4]);
-		src += 8;
-		dst += 2;
-	} while (--len);
-}
-
-#if 0
-/* SHA256 round constants. */
-static const uint32_t Krnd[64] = {
-	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-/* Elementary functions used by SHA256 */
-#define Ch(x, y, z)	((x & (y ^ z)) ^ z)
-#define Maj(x, y, z)	((x & (y | z)) | (y & z))
-#define SHR(x, n)	(x >> n)
-#define ROTR(x, n)	((x >> n) | (x << (32 - n)))
-#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
-#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
-#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
-#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
-
-/* SHA256 round function */
-#define RND(a, b, c, d, e, f, g, h, k)			\
-	h += S1(e) + Ch(e, f, g) + k;			\
-	d += h;						\
-	h += S0(a) + Maj(a, b, c);
-
-/* Adjusted round function for rotating state */
-#define RNDr(S, W, i, ii)			\
-	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
-	    S[(66 - i) % 8], S[(67 - i) % 8],	\
-	    S[(68 - i) % 8], S[(69 - i) % 8],	\
-	    S[(70 - i) % 8], S[(71 - i) % 8],	\
-	    W[i + ii] + Krnd[i + ii])
-
-/* Message schedule computation */
-#define MSCH(W, ii, i)				\
-	W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii]
-
-/*
- * SHA256 block compression function.  The 256-bit state is transformed via
- * the 512-bit input block to produce a new state.
- */
-static void
-SHA256_Transform(uint32_t state[static restrict 8],
-    const uint8_t block[static restrict 64],
-    uint32_t W[static restrict 64], uint32_t S[static restrict 8])
-{
-	int i;
-
-	/* 1. Prepare the first part of the message schedule W. */
-	be32dec_vect(W, block, 8);
-
-	/* 2. Initialize working variables. */
-	memcpy(S, state, 32);
-
-	/* 3. Mix. */
-	for (i = 0; i < 64; i += 16) {
-		RNDr(S, W, 0, i);
-		RNDr(S, W, 1, i);
-		RNDr(S, W, 2, i);
-		RNDr(S, W, 3, i);
-		RNDr(S, W, 4, i);
-		RNDr(S, W, 5, i);
-		RNDr(S, W, 6, i);
-		RNDr(S, W, 7, i);
-		RNDr(S, W, 8, i);
-		RNDr(S, W, 9, i);
-		RNDr(S, W, 10, i);
-		RNDr(S, W, 11, i);
-		RNDr(S, W, 12, i);
-		RNDr(S, W, 13, i);
-		RNDr(S, W, 14, i);
-		RNDr(S, W, 15, i);
-
-		if (i == 48)
-			break;
-		MSCH(W, 0, i);
-		MSCH(W, 1, i);
-		MSCH(W, 2, i);
-		MSCH(W, 3, i);
-		MSCH(W, 4, i);
-		MSCH(W, 5, i);
-		MSCH(W, 6, i);
-		MSCH(W, 7, i);
-		MSCH(W, 8, i);
-		MSCH(W, 9, i);
-		MSCH(W, 10, i);
-		MSCH(W, 11, i);
-		MSCH(W, 12, i);
-		MSCH(W, 13, i);
-		MSCH(W, 14, i);
-		MSCH(W, 15, i);
-	}
-
-	/* 4. Mix local working variables into global state. */
-	state[0] += S[0];
-	state[1] += S[1];
-	state[2] += S[2];
-	state[3] += S[3];
-	state[4] += S[4];
-	state[5] += S[5];
-	state[6] += S[6];
-	state[7] += S[7];
-}
-#endif
-static const uint8_t PAD[64] = {
-	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-/* Add padding and terminating bit-count. */
-static void
-SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
-{
-	size_t r;
-
-	/* Figure out how many bytes we have buffered. */
-	r = (ctx->count >> 3) & 0x3f;
-
-	/* Pad to 56 mod 64, transforming if we finish a block en route. */
-	if (r < 56) {
-		/* Pad to 56 mod 64. */
-		memcpy(&ctx->buf[r], PAD, 56 - r);
-	} else {
-		/* Finish the current block and mix. */
-		memcpy(&ctx->buf[r], PAD, 64 - r);
-		SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
-
-		/* The start of the final block is all zeroes. */
-		memset(&ctx->buf[0], 0, 56);
-	}
-
-	/* Add the terminating bit-count. */
-	be64enc(&ctx->buf[56], ctx->count);
-
-	/* Mix in the final block. */
-	SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
-}
-#if 0
-/* Magic initialization constants. */
-static const uint32_t initial_state[8] = {
-	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-};
-
-/**
- * SHA256_Init(ctx):
- * Initialize the SHA256 context ${ctx}.
- */
-void
-SHA256_Init(SHA256_CTX * ctx)
-{
-
-	/* Zero bits processed so far. */
-	ctx->count = 0;
-
-	/* Initialize state. */
-	memcpy(ctx->state, initial_state, sizeof(initial_state));
-}
-
-/**
- * SHA256_Update(ctx, in, len):
- * Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
- */
-static void
-_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
-    uint32_t tmp32[static restrict 72])
-{
-	uint32_t r;
-	const uint8_t * src = in;
-
-	/* Return immediately if we have nothing to do. */
-	if (len == 0)
-		return;
-
-	/* Number of bytes left in the buffer from previous updates. */
-	r = (ctx->count >> 3) & 0x3f;
-
-	/* Update number of bits. */
-	ctx->count += (uint64_t)(len) << 3;
-
-	/* Handle the case where we don't need to perform any transforms. */
-	if (len < 64 - r) {
-		memcpy(&ctx->buf[r], src, len);
-		return;
-	}
-
-	/* Finish the current block. */
-	memcpy(&ctx->buf[r], src, 64 - r);
-	SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
-	src += 64 - r;
-	len -= 64 - r;
-
-	/* Perform complete blocks. */
-	while (len >= 64) {
-		SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]);
-		src += 64;
-		len -= 64;
-	}
-
-	/* Copy left over data into buffer. */
-	memcpy(ctx->buf, src, len);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
-{
-	uint32_t tmp32[72];
-
-	/* Call the real function. */
-	_SHA256_Update(ctx, in, len, tmp32);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-}
-
-/**
- * SHA256_Final(digest, ctx):
- * Output the SHA256 hash of the data input to the context ${ctx} into the
- * buffer ${digest}.
- */
-static void
-_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
-    uint32_t tmp32[static restrict 72])
-{
-
-	/* Add padding. */
-	SHA256_Pad(ctx, tmp32);
-
-	/* Write the hash. */
-	be32enc_vect(digest, ctx->state, 4);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
-{
-	uint32_t tmp32[72];
-
-	/* Call the real function. */
-	_SHA256_Final(digest, ctx, tmp32);
-
-	/* Clear the context state. */
-	insecure_memzero(ctx, sizeof(SHA256_CTX));
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-}
-#endif
-/**
- * SHA256_Buf(in, len, digest):
- * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
- */
-void
-SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
-{
-	SHA256_CTX ctx;
-	uint32_t tmp32[72];
-
-	SHA256_Init(&ctx);
-        SHA256_Update(&ctx, in, len);
-        SHA256_Final(digest, &ctx);
-//	_SHA256_Update(&ctx, in, len, tmp32);
-//	_SHA256_Final(digest, &ctx, tmp32);
-
-	/* Clean the stack. */
-	insecure_memzero(&ctx, sizeof(SHA256_CTX));
-	insecure_memzero(tmp32, 288);
-}
-
-/**
- * HMAC_SHA256_Init(ctx, K, Klen):
- * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
- * ${K}.
- */
-static void
-_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
-    uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
-    uint8_t khash[static restrict 32])
-{
-	const uint8_t * K = _K;
-	size_t i;
-
-	/* If Klen > 64, the key is really SHA256(K). */
-	if (Klen > 64) {
-		SHA256_Init(&ctx->ictx);
-                SHA256_Update(&ctx->ictx, K, Klen);
-                SHA256_Final(khash, &ctx->ictx);
-//		_SHA256_Update(&ctx->ictx, K, Klen, tmp32);
-//		_SHA256_Final(khash, &ctx->ictx, tmp32);
-		K = khash;
-		Klen = 32;
-	}
-
-	/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-	SHA256_Init(&ctx->ictx);
-	memset(pad, 0x36, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-        SHA256_Update(&ctx->ictx, pad, 64);
-//	_SHA256_Update(&ctx->ictx, pad, 64, tmp32);
-
-	/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-	SHA256_Init(&ctx->octx);
-	memset(pad, 0x5c, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-        SHA256_Update(&ctx->octx, pad, 64);
-//	_SHA256_Update(&ctx->octx, pad, 64, tmp32);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
-{
-	uint32_t tmp32[72];
-	uint8_t pad[64];
-	uint8_t khash[32];
-
-	/* Call the real function. */
-	_HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(khash, 32);
-	insecure_memzero(pad, 64);
-}
-
-/**
- * HMAC_SHA256_Update(ctx, in, len):
- * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
- */
-static void
-_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
-    uint32_t tmp32[static restrict 72])
-{
-
-	/* Feed data to the inner SHA256 operation. */
-        SHA256_Update(&ctx->ictx, in, len);
-//	_SHA256_Update(&ctx->ictx, in, len, tmp32);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
-{
-	uint32_t tmp32[72];
-
-	/* Call the real function. */
-	_HMAC_SHA256_Update(ctx, in, len, tmp32);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-}
-
-/**
- * HMAC_SHA256_Final(digest, ctx):
- * Output the HMAC-SHA256 of the data input to the context ${ctx} into the
- * buffer ${digest}.
- */
-static void
-_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
-    uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
-{
-        /* Finish the inner SHA256 operation. */
-        _SHA256_Final(ihash, &ctx->ictx, tmp32);
-
-        /* Feed the inner hash to the outer SHA256 operation. */
-        _SHA256_Update(&ctx->octx, ihash, 32, tmp32);
-
-        /* Finish the outer SHA256 operation. */
-        _SHA256_Final(digest, &ctx->octx, tmp32);
-
-
-//	_SHA256_Final(ihash, &ctx->ictx, tmp32);
-//	_SHA256_Update(&ctx->octx, ihash, 32, tmp32);
-//	_SHA256_Final(digest, &ctx->octx, tmp32);
-}
-
-/* Wrapper function for intermediate-values sanitization. */
-void
-HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
-{
-	uint32_t tmp32[72];
-	uint8_t ihash[32];
-
-	/* Call the real function. */
-	_HMAC_SHA256_Final(digest, ctx, tmp32, ihash);
-
-	/* Clean the stack. */
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(ihash, 32);
-}
-
-/**
- * HMAC_SHA256_Buf(K, Klen, in, len, digest):
- * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
- * length ${Klen}, and write the result to ${digest}.
- */
-void
-HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
-    uint8_t digest[32])
-{
-	HMAC_SHA256_CTX ctx;
-	uint32_t tmp32[72];
-	uint8_t tmp8[96];
-
-	_HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]);
-	_HMAC_SHA256_Update(&ctx, in, len, tmp32);
-	_HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]);
-
-	/* Clean the stack. */
-	insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(tmp8, 96);
-}
-
-/* Add padding and terminating bit-count, but don't invoke Transform yet. */
-static int
-SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
-    uint32_t tmp32[static restrict 72])
-{
-	uint32_t r;
-
-	r = (ctx->count >> 3) & 0x3f;
-	if (r >= 56)
-		return -1;
-
-	/*
-	 * Convert length to a vector of bytes -- we do this now rather
-	 * than later because the length will change after we pad.
-	 */
-	be64enc(len, ctx->count);
-
-        /* Add 1--56 bytes so that the resulting length is 56 mod 64. */
-        SHA256_Update(ctx, PAD, 56 - r, tmp);
-
-        /* Add the terminating bit-count. */
-        ctx->buf[63] = len[7];
-        SHA256_Update(ctx, len, 7, tmp);
-	
-	/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
-//	_SHA256_Update(ctx, PAD, 56 - r, tmp32);
-
-	/* Add the terminating bit-count. */
-//	ctx->buf[63] = len[7];
-//	_SHA256_Update(ctx, len, 7, tmp32);
-
-	return 0;
-}
-
-/**
- * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
- * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
- * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
- */
-void
-PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
-    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
-{
-	HMAC_SHA256_CTX Phctx, PShctx, hctx;
-	uint32_t tmp32[72];
-	union {
-		uint8_t tmp8[96];
-		uint32_t state[8];
-	} u;
-	size_t i;
-	uint8_t ivec[4];
-	uint8_t U[32];
-	uint8_t T[32];
-	uint64_t j;
-	int k;
-	size_t clen;
-
-	/* Sanity-check. */
-	assert(dkLen <= 32 * (size_t)(UINT32_MAX));
-
-	if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
-		uint32_t oldcount;
-		uint8_t * ivecp;
-
-		/* Compute HMAC state after processing P and S. */
-		_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
-		    tmp32, &u.tmp8[0], &u.tmp8[64]);
-		_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
-
-		/* Prepare ictx padding. */
-		oldcount = hctx.ictx.count & (0x3f << 3);
-		_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
-		if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
-		    SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
-			goto generic; /* Can't happen due to saltlen check */
-		ivecp = hctx.ictx.buf + (oldcount >> 3);
-
-		/* Prepare octx padding. */
-		hctx.octx.count += 32 << 3;
-		SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
-
-		/* Iterate through the blocks. */
-		for (i = 0; i * 32 < dkLen; i++) {
-			/* Generate INT(i + 1). */
-			be32enc(ivecp, (uint32_t)(i + 1));
-
-			/* Compute U_1 = PRF(P, S || INT(i)). */
-			memcpy(u.state, hctx.ictx.state, sizeof(u.state));
-
-                        SHA256_Transform(u.state, hctx.ictx.buf );
-                        be32enc_vect(hctx.octx.buf, u.state, 4);
-                        memcpy(u.state, hctx.octx.state, sizeof(u.state));
-                        SHA256_Transform(u.state, hctx.octx.buf );
-
-//			SHA256_Transform(u.state, hctx.ictx.buf,
-//			    &tmp32[0], &tmp32[64]);
-//			be32enc_vect(hctx.octx.buf, u.state, 4);
-//			memcpy(u.state, hctx.octx.state, sizeof(u.state));
-//			SHA256_Transform(u.state, hctx.octx.buf,
-//			    &tmp32[0], &tmp32[64]);
-
-			be32enc_vect(&buf[i * 32], u.state, 4);
-		}
-
-		goto cleanup;
-	}
-
-generic:
-	/* Compute HMAC state after processing P. */
-	_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
-	    tmp32, &u.tmp8[0], &u.tmp8[64]);
-
-	/* Compute HMAC state after processing P and S. */
-	memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
-	_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
-
-	/* Iterate through the blocks. */
-	for (i = 0; i * 32 < dkLen; i++) {
-		/* Generate INT(i + 1). */
-		be32enc(ivec, (uint32_t)(i + 1));
-
-		/* Compute U_1 = PRF(P, S || INT(i)). */
-		memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
-		_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
-		_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
-
-		if (c > 1) {
-			/* T_i = U_1 ... */
-			memcpy(U, T, 32);
-
-			for (j = 2; j <= c; j++) {
-				/* Compute U_j. */
-				memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
-				_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
-				_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
-
-				/* ... xor U_j ... */
-				for (k = 0; k < 32; k++)
-					T[k] ^= U[k];
-			}
-		}
-
-		/* Copy as many bytes as necessary into buf. */
-		clen = dkLen - i * 32;
-		if (clen > 32)
-			clen = 32;
-		memcpy(&buf[i * 32], T, clen);
-	}
-
-	/* Clean the stack. */
-	insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(U, 32);
-	insecure_memzero(T, 32);
-
-cleanup:
-	insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
-	insecure_memzero(tmp32, 288);
-	insecure_memzero(&u, sizeof(u));
-}
--- a/algo/yespower/sha256.h
+++ b/algo/yespower/sha256.h
@@ -1,129 +0,0 @@
-/*-
- * Copyright 2005-2016 Colin Percival
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _SHA256_H_
-#define _SHA256_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Use #defines in order to avoid namespace collisions with anyone else's
- * SHA256 code (e.g., the code in OpenSSL).
- */
-#define SHA256_Init libcperciva_SHA256_Init
-#define SHA256_Update libcperciva_SHA256_Update
-#define SHA256_Final libcperciva_SHA256_Final
-#define SHA256_Buf libcperciva_SHA256_Buf
-#define SHA256_CTX libcperciva_SHA256_CTX
-#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init
-#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update
-#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final
-#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf
-#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX
-
-/* Context structure for SHA256 operations. */
-typedef struct {
-	uint32_t state[8];
-	uint64_t count;
-	uint8_t buf[64];
-} SHA256_CTX;
-
-/**
- * SHA256_Init(ctx):
- * Initialize the SHA256 context ${ctx}.
- */
-void SHA256_Init(SHA256_CTX *);
-
-/**
- * SHA256_Update(ctx, in, len):
- * Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
- */
-void SHA256_Update(SHA256_CTX *, const void *, size_t);
-
-/**
- * SHA256_Final(digest, ctx):
- * Output the SHA256 hash of the data input to the context ${ctx} into the
- * buffer ${digest}.
- */
-void SHA256_Final(uint8_t[32], SHA256_CTX *);
-
-/**
- * SHA256_Buf(in, len, digest):
- * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
- */
-void SHA256_Buf(const void *, size_t, uint8_t[32]);
-
-/* Context structure for HMAC-SHA256 operations. */
-typedef struct {
-	SHA256_CTX ictx;
-	SHA256_CTX octx;
-} HMAC_SHA256_CTX;
-
-/**
- * HMAC_SHA256_Init(ctx, K, Klen):
- * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
- * ${K}.
- */
-void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
-
-/**
- * HMAC_SHA256_Update(ctx, in, len):
- * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
- */
-void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
-
-/**
- * HMAC_SHA256_Final(digest, ctx):
- * Output the HMAC-SHA256 of the data input to the context ${ctx} into the
- * buffer ${digest}.
- */
-void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *);
-
-/**
- * HMAC_SHA256_Buf(K, Klen, in, len, digest):
- * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
- * length ${Klen}, and write the result to ${digest}.
- */
-void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]);
-
-/**
- * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
- * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
- * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
- */
-void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
-    uint64_t, uint8_t *, size_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* !_SHA256_H_ */
--- a/algo/yespower/sha256.h.new
+++ b/algo/yespower/sha256.h.new
@@ -1,134 +0,0 @@
-/*-
- * Copyright 2005-2016 Colin Percival
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _SHA256_H_
-#define _SHA256_H_
-
-#include <stddef.h>
-#include <stdint.h>
-#include <openssl.sha>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Use #defines in order to avoid namespace collisions with anyone else's
- * SHA256 code (e.g., the code in OpenSSL).
- */
-/*
-#define SHA256_Init libcperciva_SHA256_Init
-#define SHA256_Update libcperciva_SHA256_Update
-#define SHA256_Final libcperciva_SHA256_Final
-#define SHA256_CTX libcperciva_SHA256_CTX
-*/
-#define SHA256_Buf libcperciva_SHA256_Buf
-#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init
-#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update
-#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final
-#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf
-#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX
-
-#if 0
-/* Context structure for SHA256 operations. */
-typedef struct {
-	uint32_t state[8];
-	uint64_t count;
-	uint8_t buf[64];
-} SHA256_CTX;
-
-/**
- * SHA256_Init(ctx):
- * Initialize the SHA256 context ${ctx}.
- */
-void SHA256_Init(SHA256_CTX *);
-
-/**
- * SHA256_Update(ctx, in, len):
- * Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
- */
-void SHA256_Update(SHA256_CTX *, const void *, size_t);
-
-/**
- * SHA256_Final(digest, ctx):
- * Output the SHA256 hash of the data input to the context ${ctx} into the
- * buffer ${digest}.
- */
-void SHA256_Final(uint8_t[32], SHA256_CTX *);
-#endif
-
-/**
- * SHA256_Buf(in, len, digest):
- * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
- */
-void SHA256_Buf(const void *, size_t, uint8_t[32]);
-
-/* Context structure for HMAC-SHA256 operations. */
-typedef struct {
-	SHA256_CTX ictx;
-	SHA256_CTX octx;
-} HMAC_SHA256_CTX;
-
-/**
- * HMAC_SHA256_Init(ctx, K, Klen):
- * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
- * ${K}.
- */
-void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
-
-/**
- * HMAC_SHA256_Update(ctx, in, len):
- * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
- */
-void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
-
-/**
- * HMAC_SHA256_Final(digest, ctx):
- * Output the HMAC-SHA256 of the data input to the context ${ctx} into the
- * buffer ${digest}.
- */
-void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *);
-
-/**
- * HMAC_SHA256_Buf(K, Klen, in, len, digest):
- * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
- * length ${Klen}, and write the result to ${digest}.
- */
-void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]);
-
-/**
- * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
- * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
- * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
- */
-void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
-    uint64_t, uint8_t *, size_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* !_SHA256_H_ */
--- a/algo/yespower/sha256_p.c
+++ b/algo/yespower/sha256_p.c
@@ -0,0 +1,218 @@
+/*-
+ * Copyright 2005,2007,2009 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+
+#include <stdint.h>
+#include <string.h>
+
+#include "sysendian.h"
+
+#include "sha256_p.h"
+#include "compat.h"
+
+
+/* Elementary SHA256 bit functions (not expanded anywhere in this file). */
+#define Ch(x, y, z)	((x & (y ^ z)) ^ z)
+#define Maj(x, y, z)	((x & (y | z)) | (y & z))
+#define SHR(x, n)	(x >> n)
+#define ROTR(x, n)	((x >> n) | (x << (32 - n)))
+#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/* One SHA256 round; relies on t0 and t1 being declared by the user. */
+#define RND(a, b, c, d, e, f, g, h, k)			\
+	t0 = h + S1(e) + Ch(e, f, g) + k;		\
+	t1 = S0(a) + Maj(a, b, c);			\
+	d += t0;					\
+	h  = t0 + t1;
+
+/* Round i on state array S: indices rotate mod 8 instead of moving data. */
+#define RNDr(S, W, i, k)			\
+	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
+	    S[(66 - i) % 8], S[(67 - i) % 8],	\
+	    S[(68 - i) % 8], S[(69 - i) % 8],	\
+	    S[(70 - i) % 8], S[(71 - i) % 8],	\
+	    W[i] + k)
+
+/*
+static unsigned char PAD[64] = {
+	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+*/
+/**
+ * SHA256_Buf(in, len, digest):
+ * One-shot SHA256: hash ${len} bytes at ${in}, store 32 bytes in ${digest}.
+ */
+void
+SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
+{
+	SHA256_CTX sha;
+	SHA256_Init( &sha );
+	SHA256_Update( &sha, in, len );
+	SHA256_Final( digest, &sha );
+}
+
+/**
+ * HMAC_SHA256_Buf(K, Klen, in, len, digest):
+ * One-shot HMAC-SHA256: authenticate ${len} bytes at ${in} with the
+ * ${Klen}-byte key ${K} and store the 32-byte result in ${digest}.
+ */
+void
+HMAC_SHA256_Buf( const void * K, size_t Klen, const void * in, size_t len,
+                 uint8_t digest[32] )
+{
+	HMAC_SHA256_CTX hmac;
+
+	HMAC_SHA256_Init( &hmac, K, Klen );
+	HMAC_SHA256_Update( &hmac, in, len );
+	HMAC_SHA256_Final( digest, &hmac );
+}
+
+/* Key an HMAC-SHA256 context: absorb the ipad and opad key blocks. */
+void
+HMAC_SHA256_Init( HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen )
+{
+	unsigned char ipad[64];
+	unsigned char opad[64];
+	unsigned char keydigest[32];
+	const unsigned char * key = _K;
+	size_t n;
+
+	/* A key longer than 64 bytes is replaced by its SHA256 digest. */
+	if ( Klen > 64 )
+	{
+		SHA256_Init( &ctx->ictx );
+		SHA256_Update( &ctx->ictx, key, Klen );
+		SHA256_Final( keydigest, &ctx->ictx );
+		key = keydigest;
+		Klen = 32;
+	}
+
+	/* Build both padded key blocks: key ^ 0x36.. and key ^ 0x5c.. */
+	memset( ipad, 0x36, 64 );
+	memset( opad, 0x5c, 64 );
+	for ( n = 0; n < Klen; n++ )
+	{
+		ipad[n] ^= key[n];
+		opad[n] ^= key[n];
+	}
+
+	/* Inner hash begins with the ipad block, outer hash with the opad block. */
+	SHA256_Init( &ctx->ictx );
+	SHA256_Update( &ctx->ictx, ipad, 64 );
+	SHA256_Init( &ctx->octx );
+	SHA256_Update( &ctx->octx, opad, 64 );
+}
+
+/*
+ * Absorb ${len} bytes from ${in} into the inner SHA256 state of ${ctx}.
+ */
+void
+HMAC_SHA256_Update( HMAC_SHA256_CTX * ctx, const void * in, size_t len )
+{
+	SHA256_Update( &ctx->ictx, in, len );
+}
+
+/*
+ * Finish the HMAC: ${digest} receives the outer SHA256 taken over the
+ * 32-byte inner digest.  SHA256_Final is called on both states in ${ctx}.
+ */
+void
+HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX * ctx )
+{
+	unsigned char inner[32];
+	SHA256_CTX * const in_sha  = &ctx->ictx;
+	SHA256_CTX * const out_sha = &ctx->octx;
+
+	/* Close the inner hash over the message ... */
+	SHA256_Final( inner, in_sha );
+
+	/* ... then feed its digest to the outer hash and close that too. */
+	SHA256_Update( out_sha, inner, 32 );
+	SHA256_Final( digest, out_sha );
+}
+
+/**
+ * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
+ * Derive ${dkLen} bytes into ${buf} with PBKDF2 (PRF = HMAC-SHA256) from
+ * the password, the salt and ${c} iterations.  dkLen <= 32 * (2^32 - 1).
+ */
+void
+PBKDF2_SHA256( const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
+               size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen )
+{
+	HMAC_SHA256_CTX salted, work;
+	uint8_t _ALIGN(128) block[32];
+	uint8_t _ALIGN(128) u[32];
+	uint8_t index_be[4];
+	size_t blk, off;
+	size_t take;
+	uint64_t round;
+	int b;
+
+	/* Key the HMAC and absorb the salt once; every block starts from it. */
+	HMAC_SHA256_Init( &salted, passwd, passwdlen );
+	HMAC_SHA256_Update( &salted, salt, saltlen );
+
+	/* One 32-byte output block per pass. */
+	for ( blk = 0; blk * 32 < dkLen; blk++ )
+	{
+		off = blk * 32;
+
+		/* The block counter is 1-based and encoded big-endian. */
+		be32enc( index_be, (uint32_t)( blk + 1 ) );
+
+		/* U_1 = PRF(P, S || INT(blk + 1)), from a copy of the salted state. */
+		memcpy( &work, &salted, sizeof(HMAC_SHA256_CTX) );
+		HMAC_SHA256_Update( &work, index_be, 4 );
+		HMAC_SHA256_Final( u, &work );
+
+		/* T_i starts as U_1 and accumulates U_2 .. U_c by xor. */
+		memcpy( block, u, 32 );
+		for ( round = 2; round <= c; round++ )
+		{
+			/* U_j = PRF(P, U_{j-1}) */
+			HMAC_SHA256_Init( &work, passwd, passwdlen );
+			HMAC_SHA256_Update( &work, u, 32 );
+			HMAC_SHA256_Final( u, &work );
+
+			for ( b = 0; b < 32; b++ )
+				block[b] ^= u[b];
+		}
+
+		/* Copy out the block; the last one may be shorter than 32 bytes. */
+		take = dkLen - off;
+		if ( take > 32 )
+			take = 32;
+		memcpy( buf + off, block, take );
+	}
+}
--- a/algo/yespower/sha256_p.c.sha
+++ b/algo/yespower/sha256_p.c.sha
@@ -1,496 +0,0 @@
-/*-
- * Copyright 2005,2007,2009 Colin Percival
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/types.h>
-
-#include <stdint.h>
-#include <string.h>
-
-#include "sysendian.h"
-
-#include "sha256_p.h"
-#include "compat.h"
-
-/*
- * Encode a length len/4 vector of (uint32_t) into a length len vector of
- * (unsigned char) in big-endian form.  Assumes len is a multiple of 4.
- */
-static void
-be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
-{
-	size_t i;
-
-	for (i = 0; i < len / 4; i++)
-		be32enc(dst + i * 4, src[i]);
-}
-
-/*
- * Decode a big-endian length len vector of (unsigned char) into a length
- * len/4 vector of (uint32_t).  Assumes len is a multiple of 4.
- */
-static void
-be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
-{
-	size_t i;
-
-	for (i = 0; i < len / 4; i++)
-		dst[i] = be32dec(src + i * 4);
-}
-
-/* Elementary functions used by SHA256 */
-#define Ch(x, y, z)	((x & (y ^ z)) ^ z)
-#define Maj(x, y, z)	((x & (y | z)) | (y & z))
-#define SHR(x, n)	(x >> n)
-#define ROTR(x, n)	((x >> n) | (x << (32 - n)))
-#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
-#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
-#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
-#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
-
-/* SHA256 round function */
-#define RND(a, b, c, d, e, f, g, h, k)			\
-	t0 = h + S1(e) + Ch(e, f, g) + k;		\
-	t1 = S0(a) + Maj(a, b, c);			\
-	d += t0;					\
-	h  = t0 + t1;
-
-/* Adjusted round function for rotating state */
-#define RNDr(S, W, i, k)			\
-	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
-	    S[(66 - i) % 8], S[(67 - i) % 8],	\
-	    S[(68 - i) % 8], S[(69 - i) % 8],	\
-	    S[(70 - i) % 8], S[(71 - i) % 8],	\
-	    W[i] + k)
-
-/*
- * SHA256 block compression function.  The 256-bit state is transformed via
- * the 512-bit input block to produce a new state.
- */
-static void
-SHA256_Transform_p(uint32_t * state, const unsigned char block[64])
-{
-	uint32_t _ALIGN(128) W[64], S[8];
-	uint32_t t0, t1;
-	int i;
-
-	/* 1. Prepare message schedule W. */
-	be32dec_vect(W, block, 64);
-	for (i = 16; i < 64; i++)
-		W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
-
-	/* 2. Initialize working variables. */
-	memcpy(S, state, 32);
-
-	/* 3. Mix. */
-	RNDr(S, W, 0, 0x428a2f98);
-	RNDr(S, W, 1, 0x71374491);
-	RNDr(S, W, 2, 0xb5c0fbcf);
-	RNDr(S, W, 3, 0xe9b5dba5);
-	RNDr(S, W, 4, 0x3956c25b);
-	RNDr(S, W, 5, 0x59f111f1);
-	RNDr(S, W, 6, 0x923f82a4);
-	RNDr(S, W, 7, 0xab1c5ed5);
-	RNDr(S, W, 8, 0xd807aa98);
-	RNDr(S, W, 9, 0x12835b01);
-	RNDr(S, W, 10, 0x243185be);
-	RNDr(S, W, 11, 0x550c7dc3);
-	RNDr(S, W, 12, 0x72be5d74);
-	RNDr(S, W, 13, 0x80deb1fe);
-	RNDr(S, W, 14, 0x9bdc06a7);
-	RNDr(S, W, 15, 0xc19bf174);
-	RNDr(S, W, 16, 0xe49b69c1);
-	RNDr(S, W, 17, 0xefbe4786);
-	RNDr(S, W, 18, 0x0fc19dc6);
-	RNDr(S, W, 19, 0x240ca1cc);
-	RNDr(S, W, 20, 0x2de92c6f);
-	RNDr(S, W, 21, 0x4a7484aa);
-	RNDr(S, W, 22, 0x5cb0a9dc);
-	RNDr(S, W, 23, 0x76f988da);
-	RNDr(S, W, 24, 0x983e5152);
-	RNDr(S, W, 25, 0xa831c66d);
-	RNDr(S, W, 26, 0xb00327c8);
-	RNDr(S, W, 27, 0xbf597fc7);
-	RNDr(S, W, 28, 0xc6e00bf3);
-	RNDr(S, W, 29, 0xd5a79147);
-	RNDr(S, W, 30, 0x06ca6351);
-	RNDr(S, W, 31, 0x14292967);
-	RNDr(S, W, 32, 0x27b70a85);
-	RNDr(S, W, 33, 0x2e1b2138);
-	RNDr(S, W, 34, 0x4d2c6dfc);
-	RNDr(S, W, 35, 0x53380d13);
-	RNDr(S, W, 36, 0x650a7354);
-	RNDr(S, W, 37, 0x766a0abb);
-	RNDr(S, W, 38, 0x81c2c92e);
-	RNDr(S, W, 39, 0x92722c85);
-	RNDr(S, W, 40, 0xa2bfe8a1);
-	RNDr(S, W, 41, 0xa81a664b);
-	RNDr(S, W, 42, 0xc24b8b70);
-	RNDr(S, W, 43, 0xc76c51a3);
-	RNDr(S, W, 44, 0xd192e819);
-	RNDr(S, W, 45, 0xd6990624);
-	RNDr(S, W, 46, 0xf40e3585);
-	RNDr(S, W, 47, 0x106aa070);
-	RNDr(S, W, 48, 0x19a4c116);
-	RNDr(S, W, 49, 0x1e376c08);
-	RNDr(S, W, 50, 0x2748774c);
-	RNDr(S, W, 51, 0x34b0bcb5);
-	RNDr(S, W, 52, 0x391c0cb3);
-	RNDr(S, W, 53, 0x4ed8aa4a);
-	RNDr(S, W, 54, 0x5b9cca4f);
-	RNDr(S, W, 55, 0x682e6ff3);
-	RNDr(S, W, 56, 0x748f82ee);
-	RNDr(S, W, 57, 0x78a5636f);
-	RNDr(S, W, 58, 0x84c87814);
-	RNDr(S, W, 59, 0x8cc70208);
-	RNDr(S, W, 60, 0x90befffa);
-	RNDr(S, W, 61, 0xa4506ceb);
-	RNDr(S, W, 62, 0xbef9a3f7);
-	RNDr(S, W, 63, 0xc67178f2);
-
-	/* 4. Mix local working variables into global state */
-	for (i = 0; i < 8; i++)
-		state[i] += S[i];
-#if 0
-	/* Clean the stack. */
-	memset(W, 0, 256);
-	memset(S, 0, 32);
-	t0 = t1 = 0;
-#endif
-}
-
-static unsigned char PAD[64] = {
-	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-// only called by SHA256_Final_p
-/* Add padding and terminating bit-count. */
-static void
-SHA256_Pad_p(SHA256_CTX_p * ctx)
-{
-	unsigned char len[8];
-	uint32_t r, plen;
-
-	/*
-	 * Convert length to a vector of bytes -- we do this now rather
-	 * than later because the length will change after we pad.
-	 */
-	be32enc_vect(len, ctx->count, 8);
-
-	/* Add 1--64 bytes so that the resulting length is 56 mod 64 */
-	r = (ctx->count[1] >> 3) & 0x3f;
-	plen = (r < 56) ? (56 - r) : (120 - r);
-	SHA256_Update_p(ctx, PAD, (size_t)plen);
-	/* Add the terminating bit-count */
-	SHA256_Update_p(ctx, len, 8);
-}
-
-/* SHA-256 initialization.  Begins a SHA-256 operation. */
-void
-SHA256_Init_p(SHA256_CTX_p * ctx)
-{
-	/* Zero bits processed so far */
-	ctx->count[0] = ctx->count[1] = 0;
-
-	/* Magic initialization constants */
-	ctx->state[0] = 0x6A09E667;
-	ctx->state[1] = 0xBB67AE85;
-	ctx->state[2] = 0x3C6EF372;
-	ctx->state[3] = 0xA54FF53A;
-	ctx->state[4] = 0x510E527F;
-	ctx->state[5] = 0x9B05688C;
-	ctx->state[6] = 0x1F83D9AB;
-	ctx->state[7] = 0x5BE0CD19;
-}
-
-/* Add bytes into the hash */
-void
-SHA256_Update_p(SHA256_CTX_p * ctx, const void *in, size_t len)
-{
-	uint32_t bitlen[2];
-	uint32_t r;
-	const unsigned char *src = in;
-
-	/* Number of bytes left in the buffer from previous updates */
-	r = (ctx->count[1] >> 3) & 0x3f;
-
-	/* Convert the length into a number of bits */
-	bitlen[1] = ((uint32_t)len) << 3;
-	bitlen[0] = (uint32_t)(len >> 29);
-
-	/* Update number of bits */
-	if ((ctx->count[1] += bitlen[1]) < bitlen[1])
-		ctx->count[0]++;
-	ctx->count[0] += bitlen[0];
-
-	/* Handle the case where we don't need to perform any transforms */
-	if (len < 64 - r) {
-		memcpy(&ctx->buf[r], src, len);
-		return;
-	}
-
-	/* Finish the current block */
-	memcpy(&ctx->buf[r], src, 64 - r);
-        SHA256_Transform_p(ctx->state, ctx->buf);
-	src += 64 - r;
-	len -= 64 - r;
-
-	/* Perform complete blocks */
-	while (len >= 64) {
-		SHA256_Transform_p(ctx->state, src);
-		src += 64;
-		len -= 64;
-	}
-
-	/* Copy left over data into buffer */
-	memcpy(ctx->buf, src, len);
-}
-
-/*
- * SHA-256 finalization.  Pads the input data, exports the hash value,
- * and clears the context state.
- */
-void
-SHA256_Final_p(unsigned char digest[32], SHA256_CTX_p * ctx)
-{
-	/* Add padding */
-	SHA256_Pad_p(ctx);
-
-	/* Write the hash */
-	be32enc_vect(digest, ctx->state, 32);
-
-	/* Clear the context state */
-	memset((void *)ctx, 0, sizeof(*ctx));
-}
-
-/**
- * SHA256_Buf(in, len, digest):
- * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
- */
-void
-SHA256_Buf_p(const void * in, size_t len, uint8_t digest[32])
-{
-//        SHA256_CTX_p ctx;
-//        uint32_t tmp32[72];
-
-#if defined(__SHA__)
-        SHA256_CTX ctx;
-        SHA256_Init(&ctx);
-        SHA256_Update(&ctx, in, len);
-        SHA256_Final(digest, &ctx);
-#else
-        SHA256_CTX_p ctx;
-	SHA256_Init_p(&ctx);
-        SHA256_Update_p(&ctx, in, len);
-        SHA256_Final_p(digest, &ctx);
-#endif
-
-        /* Clean the stack. */
-//      insecure_memzero(&ctx, sizeof(SHA256_CTX));
-//      insecure_memzero(tmp32, 288);
-}
-
-/**
- * HMAC_SHA256_Buf(K, Klen, in, len, digest):
- * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
- * length ${Klen}, and write the result to ${digest}.
- */
-void
-HMAC_SHA256_Buf_p(const void * K, size_t Klen, const void * in, size_t len,
-    uint8_t digest[32])
-{
-        HMAC_SHA256_CTX_p ctx;
-//        uint32_t tmp32[72];
-//        uint8_t tmp8[96];
-
-        HMAC_SHA256_Init_p(&ctx, K, Klen);
-        HMAC_SHA256_Update_p(&ctx, in, len);
-        HMAC_SHA256_Final_p(digest, &ctx);
-
-        /* Clean the stack. */
-//        insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
-//        insecure_memzero(tmp32, 288);
-//        insecure_memzero(tmp8, 96);
-}
-
-/* Initialize an HMAC-SHA256 operation with the given key. */
-void
-HMAC_SHA256_Init_p(HMAC_SHA256_CTX_p * ctx, const void * _K, size_t Klen)
-{
-	unsigned char pad[64];
-	unsigned char khash[32];
-	const unsigned char * K = _K;
-	size_t i;
-
-	/* If Klen > 64, the key is really SHA256(K). */
-	if (Klen > 64) {
-#if defined(__SHA__)
-		SHA256_Init(&ctx->ictx);
-		SHA256_Update(&ctx->ictx, K, Klen);
-		SHA256_Final(khash, &ctx->ictx);
-#else
-                SHA256_Init_p(&ctx->ictx);
-                SHA256_Update_p(&ctx->ictx, K, Klen);
-                SHA256_Final_p(khash, &ctx->ictx);
-#endif
-		K = khash;
-		Klen = 32;
-	}
-
-	/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-#if defined(__SHA__)
-        SHA256_Init(&ctx->ictx);
-#else
-        SHA256_Init_p(&ctx->ictx);
-#endif
-	memset(pad, 0x36, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-#if defined(__SHA__)
-	SHA256_Update(&ctx->ictx, pad, 64);
-#else
-        SHA256_Update_p(&ctx->ictx, pad, 64);
-#endif
-
-	/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-#if defined(__SHA__)
-	SHA256_Init(&ctx->octx);
-#else
-        SHA256_Init_p(&ctx->octx);
-#endif
-	memset(pad, 0x5c, 64);
-	for (i = 0; i < Klen; i++)
-		pad[i] ^= K[i];
-#if defined(__SHA__)
-	SHA256_Update(&ctx->octx, pad, 64);
-#else
-        SHA256_Update_p(&ctx->octx, pad, 64);
-#endif
-
-	/* Clean the stack. */
-	//memset(khash, 0, 32);
-}
-
-/* Add bytes to the HMAC-SHA256 operation. */
-void
-HMAC_SHA256_Update_p(HMAC_SHA256_CTX_p * ctx, const void *in, size_t len)
-{
-
-	/* Feed data to the inner SHA256 operation. */
-#if defined(__SHA__)
-	SHA256_Update(&ctx->ictx, in, len);
-#else
-        SHA256_Update_p(&ctx->ictx, in, len);
-#endif
-}
-
-/* Finish an HMAC-SHA256 operation. */
-void
-HMAC_SHA256_Final_p(unsigned char digest[32], HMAC_SHA256_CTX_p * ctx)
-{
-	unsigned char ihash[32];
-
-#if defined(__SHA__)
-	/* Finish the inner SHA256 operation. */
-	SHA256_Final(ihash, &ctx->ictx);
-
-	/* Feed the inner hash to the outer SHA256 operation. */
-	SHA256_Update(&ctx->octx, ihash, 32);
-
-	/* Finish the outer SHA256 operation. */
-	SHA256_Final(digest, &ctx->octx);
-#else
-        /* Finish the inner SHA256 operation. */
-        SHA256_Final_p(ihash, &ctx->ictx);
-
-        /* Feed the inner hash to the outer SHA256 operation. */
-        SHA256_Update_p(&ctx->octx, ihash, 32);
-
-        /* Finish the outer SHA256 operation. */
-        SHA256_Final_p(digest, &ctx->octx);
-#endif
-
-	/* Clean the stack. */
-	//memset(ihash, 0, 32);
-}
-
-/**
- * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
- * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
- * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
- */
-void
-PBKDF2_SHA256_p(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
-    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
-{
-	HMAC_SHA256_CTX_p PShctx, hctx;
-	uint8_t _ALIGN(128) T[32];
-	uint8_t _ALIGN(128) U[32];
-	uint8_t ivec[4];
-	size_t i, clen;
-	uint64_t j;
-	int k;
-
-	/* Compute HMAC state after processing P and S. */
-	HMAC_SHA256_Init_p(&PShctx, passwd, passwdlen);
-	HMAC_SHA256_Update_p(&PShctx, salt, saltlen);
-
-	/* Iterate through the blocks. */
-	for (i = 0; i * 32 < dkLen; i++) {
-		/* Generate INT(i + 1). */
-		be32enc(ivec, (uint32_t)(i + 1));
-
-		/* Compute U_1 = PRF(P, S || INT(i)). */
-		memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_p));
-		HMAC_SHA256_Update_p(&hctx, ivec, 4);
-		HMAC_SHA256_Final_p(U, &hctx);
-
-		/* T_i = U_1 ... */
-		memcpy(T, U, 32);
-
-		for (j = 2; j <= c; j++) {
-			/* Compute U_j. */
-			HMAC_SHA256_Init_p(&hctx, passwd, passwdlen);
-			HMAC_SHA256_Update_p(&hctx, U, 32);
-			HMAC_SHA256_Final_p(U, &hctx);
-
-			/* ... xor U_j ... */
-			for (k = 0; k < 32; k++)
-				T[k] ^= U[k];
-		}
-
-		/* Copy as many bytes as necessary into buf. */
-		clen = dkLen - i * 32;
-		if (clen > 32)
-			clen = 32;
-		memcpy(&buf[i * 32], T, clen);
-	}
-
-	/* Clean PShctx, since we never called _Final on it. */
-	//memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y));
-}
--- a/algo/yespower/sha256_p.h.sha
+++ b/algo/yespower/sha256_p.h.sha
@@ -33,45 +33,24 @@
 #include <stdint.h>
 #include <openssl/sha.h>

-typedef struct SHA256Context {
-	uint32_t state[8];
-	uint32_t count[2];
-	unsigned char buf[64];
-} SHA256_CTX_p;
-
-/*
 typedef struct HMAC_SHA256Context {
-	SHA256_CTX_Y ictx;
-	SHA256_CTX_Y octx;
-} HMAC_SHA256_CTX_Y;
-*/
-
-typedef struct HMAC_SHA256Context {
-#if defined(__SHA__)
        SHA256_CTX ictx;
        SHA256_CTX octx;
-#else
-        SHA256_CTX_p ictx;
-        SHA256_CTX_p octx;
-#endif
-} HMAC_SHA256_CTX_p;
+} HMAC_SHA256_CTX;

-void	SHA256_Init_p(SHA256_CTX_p *);
-void	SHA256_Update_p(SHA256_CTX_p *, const void *, size_t);
-void	SHA256_Final_p(unsigned char [32], SHA256_CTX_p *);
-void    SHA256_Buf_p(const void * in, size_t len, uint8_t digest[32]);
-void	HMAC_SHA256_Init_p(HMAC_SHA256_CTX_p *, const void *, size_t);
-void	HMAC_SHA256_Update_p(HMAC_SHA256_CTX_p *, const void *, size_t);
-void	HMAC_SHA256_Final_p(unsigned char [32], HMAC_SHA256_CTX_p *);
-void    HMAC_SHA256_Buf_p(const void * K, size_t Klen, const void * in,
-	size_t len, uint8_t digest[32]);
+void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] );
+void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t );
+void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t );
+void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * );
+void HMAC_SHA256_Buf( const void * K, size_t Klen, const void * in,
+                      size_t len, uint8_t digest[32] );

 /**
 * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
 * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
 * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
 */
-void	PBKDF2_SHA256_p(const uint8_t *, size_t, const uint8_t *, size_t,
-    uint64_t, uint8_t *, size_t);
+void PBKDF2_SHA256( const uint8_t *, size_t, const uint8_t *, size_t,
+                    uint64_t, uint8_t *, size_t);

 #endif /* !_SHA256_H_ */
--- a/algo/yespower/yespower-opt.c
+++ b/algo/yespower/yespower-opt.c
@@ -62,6 +62,7 @@
 #warning "Note: building generic code for non-x86.  That's OK."
 #endif
 */
+
 /*
 * The SSE4 code version has fewer instructions than the generic SSE2 version,
 * but all of the instructions are SIMD, thereby wasting the scalar execution
@@ -96,7 +97,7 @@
 #include <string.h>

 #include "insecure_memzero.h"
-#include "sha256.h"
+#include "sha256_p.h"
 #include "sysendian.h"

 #include "yespower.h"
--- a/algo/yespower/yespower-opt.c.sha
+++ b/algo/yespower/yespower-opt.c.sha
--- a/algo/yespower/yespower-ref.c
+++ b/algo/yespower/yespower-ref.c
@@ -51,7 +51,7 @@
 #include <stdlib.h>
 #include <string.h>

-#include "sha256.h"
+#include "sha256_p.h"
 #include "sysendian.h"

 #include "yespower.h"
@@ -534,11 +534,11 @@ int yespower(yespower_local_t *local,

 		if (pers) {
 			HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen,
 			    (uint8_t *)sha256);
 			SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst);
 		}
 	} else {
-		HMAC_SHA256_Buf((uint8_t *)B + B_size - 64, 64,
+		HMAC_SHA256_Buf((uint8_t *)B + B_size - 64, 64,
 		    sha256, sizeof(sha256), (uint8_t *)dst);
 	}

--- a/algo/yespower/yespower.c
+++ b/algo/yespower/yespower.c
@@ -38,7 +38,7 @@ void yespower_hash( const char *input, char *output, uint32_t len )
 }

 int scanhash_yespower( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done )
+                       uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t _ALIGN(64) vhash[8];
        uint32_t _ALIGN(64) endiandata[20];
@@ -48,6 +48,7 @@ int scanhash_yespower( int thr_id, struct work *work, uint32_t max_nonce,
        const uint32_t Htarg = ptarget[7];
        const uint32_t first_nonce = pdata[19];
        uint32_t n = first_nonce;
+        /* int */ thr_id = mythr->id;  // thr_id arg is deprecated

        for (int k = 0; k < 19; k++)
                be32enc(&endiandata[k], pdata[k]);
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -99,20 +99,73 @@
 #include <memory.h>
 #include <stdbool.h>

-// 64 bit seems completely useless
+// Integer mirrors of the SIMD utilities; a rotate count of 0 is well defined.
+#define ror_64( x, c ) \
+   (uint64_t)( ( (uint64_t)(x) >> ((c)&63) ) | ( (uint64_t)(x) << ((64-(c))&63) ) )
+#define rol_64( x, c ) \
+   (uint64_t)( ( (uint64_t)(x) << ((c)&63) ) | ( (uint64_t)(x) >> ((64-(c))&63) ) )
+#define ror_32( x, c ) \
+   (uint32_t)( ( (uint32_t)(x) >> ((c)&31) ) | ( (uint32_t)(x) << ((32-(c))&31) ) )
+#define rol_32( x, c ) \
+   (uint32_t)( ( (uint32_t)(x) << ((c)&31) ) | ( (uint32_t)(x) >> ((32-(c))&31) ) )
+#define ror_16( x, c ) \
+      (uint16_t)( ( (uint16_t)(x) >> (c) ) | ( (uint16_t)(x) << (16-(c)) ) )
+#define rol_16( x, c ) \
+      (uint16_t)( ( (uint16_t)(x) << (c) ) | ( (uint16_t)(x) >> (16-(c)) ) )
+#define ror_8( x, c ) \
+      (uint8_t) ( ( (uint8_t) (x) >> (c) ) | ( (uint8_t) (x) << ( 8-(c)) ) )
+#define rol_8( x, c ) \
+      (uint8_t) ( ( (uint8_t) (x) << (c) ) | ( (uint8_t) (x) >> ( 8-(c)) ) )
+
+#define bswap_64( x )      __builtin_bswap64(x)
+#define bswap_32( x )      __builtin_bswap32(x)
+
+// 128 bit integer
+//
+// Int128 uses two 64 bit GPRs to hold the data. The main benefits are
+// for 128 bit arithmetic. Vectors are preferred when 128 bit arith
+// is not required. int128 also works better with other integer sizes.
+// Vectors benefit from wider registers. 
+//
+// Use typecasting for conversion to/from 128 bit vector:
+// __m128i v128 = (__m128i)my_int128;
+// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
+// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
+
+#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
+
+// Test this before using int128.
+#define GCC_INT128 1
+
+// Familiar looking type names
+typedef          __int128  int128_t;
+typedef unsigned __int128 uint128_t;
+
+// No real need or use.
+#define i128_neg1        (uint128_t)(-1LL)
+
+// Extract selected 64 bit half of 128 bit integer.
+// A generic macro with a selector argument can't be encoded as a statement
+// function and would require a branch.
+#define i128_hi64( x )    (uint64_t)( (uint128_t)(x) >> 64 )
+#define i128_lo64( x )    (uint64_t)( (uint128_t)(x) << 64 >> 64 )
+
+// Not much need for this but it fills a gap. A count of 0 is well defined.
+#define ror_128( x, c ) \
+   ( ( (uint128_t)(x) >> ((c)&127) ) | ( (uint128_t)(x) << ((128-(c))&127) ) )
+#define rol_128( x, c ) \
+   ( ( (uint128_t)(x) << ((c)&127) ) | ( (uint128_t)(x) >> ((128-(c))&127) ) )
+
+#endif  // INT128

 ////////////////////////////////////////////////////////////////
 //
-//         64 bit MMX vectors.
+//               64 bit MMX vectors.
 //
 // There are rumours MMX wil be removed. Although casting with int64
 // works there is likely some overhead to move the data to An MMX register
 // and back.
-// Byte swap and rotation may be more efficient using an MMX shuffle
-// except that it won't compile due to a "target specific option mismatch"
-// with "inlining failed in call to always inline". MMX was designed for
-// 32 bit CPUs and might not work on 64 bit CPUs where the CPU has full
-// support for 64 bit operations without vectoring.  
+// Byte swap and rotation may be more efficient using an MMX shuffle.
 //
 // Universal 64 bit overlay
 union _m64v
@@ -165,6 +218,7 @@ typedef union _m64_v16 m64_v16;
 #define casti_m64(p,i) (((__m64*)(p))[(i)])


+
 // cast all arguments as the're likely uint64_t

 // Bitwise not: ~(a)
@@ -173,6 +227,7 @@ typedef union _m64_v16 m64_v16;
 // Unary negate elements
 #define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, (__m64)v )
 #define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, (__m64)v )
+#define mm64_negate_8(  v ) _mm_sub_pi8(  m64_zero, (__m64)v )

 // Rotate bits in packed elements of 64 bit vector
 #define mm64_rol_32( a, n ) \
@@ -206,15 +261,32 @@ typedef union _m64_v16 m64_v16;
 #if defined(__SSSE3__)

 // Endian byte swap packed elements
+// A vectorized version of the u64 bswap, use when data already in MMX reg.
+#define mm64_bswap_64( v ) \
+    _mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 0,1,2,3,4,5,6,7 ) )
+
 #define mm64_bswap_32( v ) \
    _mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 4,5,6,7,  0,1,2,3 ) )

 #define mm64_bswap_16( v ) \
    _mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 6,7,  4,5,  2,3,  0,1 ) );

+#else
+
+#define mm64_bswap_64( v ) \
+       (__m64)__builtin_bswap64( (uint64_t)v )
+
+// Byte swap each 32 bit element; halves are taken by value from the u64 cast.
+#define mm64_bswap_32( v ) \
+   _mm_set_pi32( __builtin_bswap32( (uint32_t)( (uint64_t)(v) >> 32 ) ), \
+                 __builtin_bswap32( (uint32_t)( (uint64_t)(v) ) ) )
+
 #endif

 // Invert vector: {3,2,1,0} -> {0,1,2,3}
+// Invert_64 is the same as bswap64
+// Invert_32 is the same as swap32
+
 #define mm64_invert_16( v ) _mm_shuffle_pi16( (__m64)v, 0x1b )

 #if defined(__SSSE3__)
@@ -237,6 +309,12 @@ static inline void memset_zero_64( __m64 *src, int n )
 static inline void memset_64( __m64 *dst, const __m64 a,  int n )
 {   for ( int i = 0; i < n; i++ ) dst[i] = a; }

+// The b is for broadcast of src; don't use in hybrid hash, interleave.
+static inline void mem_bcpy_32( __m64 *dst, const uint32_t src, int n )
+{
+   memset_64( dst, _mm_set1_pi32( src ), n );
+}
+

 //////////////////////////////////////////////////////////////////
 //
@@ -644,57 +722,57 @@ do { \

 #define mm128_ror1x64_256( v1, v2 ) \
 do { \
-   __m128i t  = _mm_srli_si128( v1, 8 ) | _mm_slli_si128( v2, 24 ); \
-           v2 = _mm_srli_si128( v2, 8 ) | _mm_slli_si128( v1, 24 ); \
+   __m128i t  = _mm_srli_si128( v1, 8 ) | _mm_slli_si128( v2, 8 ); \
+           v2 = _mm_srli_si128( v2, 8 ) | _mm_slli_si128( v1, 8 ); \
           v1 = t; \
 } while(0)

 #define mm128_rol1x64_256( v1, v2 ) \
 do { \
-   __m128i t  = _mm_slli_si128( v1, 8 ) | _mm_srli_si128( v2, 24 ); \
-           v2 = _mm_slli_si128( v2, 8 ) | _mm_srli_si128( v1, 24 ); \
+   __m128i t  = _mm_slli_si128( v1, 8 ) | _mm_srli_si128( v2, 8 ); \
+           v2 = _mm_slli_si128( v2, 8 ) | _mm_srli_si128( v1, 8 ); \
           v1 = t; \
 } while(0)

 #define mm128_ror1x32_256( v1, v2 ) \
 do { \
-   __m128i t  = _mm_srli_si128( v1, 4 ) | _mm_slli_si128( v2, 28 ); \
-           v2 = _mm_srli_si128( v2, 4 ) | _mm_slli_si128( v1, 28 ); \
+   __m128i t  = _mm_srli_si128( v1, 4 ) | _mm_slli_si128( v2, 12 ); \
+           v2 = _mm_srli_si128( v2, 4 ) | _mm_slli_si128( v1, 12 ); \
           v1 = t; \
 } while(0)

 #define mm128_rol1x32_256( v1, v2 ) \
 do { \
-   __m128i t  = _mm_slli_si128( v1, 4 ) | _mm_srli_si128( v2, 28 ); \
-           v2 = _mm_slli_si128( v2, 4 ) | _mm_srli_si128( v1, 28 ); \
+   __m128i t  = _mm_slli_si128( v1, 4 ) | _mm_srli_si128( v2, 12 ); \
+           v2 = _mm_slli_si128( v2, 4 ) | _mm_srli_si128( v1, 12 ); \
           v1 = t; \
 } while(0)

 #define mm128_ror1x16_256( v1, v2 ) \
 do { \
-   __m128i t  = _mm_srli_si128( v1, 2 ) | _mm_slli_si128( v2, 30 ); \
-           v2 = _mm_srli_si128( v2, 2 ) | _mm_slli_si128( v1, 30 ); \
+   __m128i t  = _mm_srli_si128( v1, 2 ) | _mm_slli_si128( v2, 14 ); \
+           v2 = _mm_srli_si128( v2, 2 ) | _mm_slli_si128( v1, 14 ); \
           v1 = t; \
 } while(0)

 #define mm128_rol1x16_256( v1, v2 ) \
 do { \
-   __m128i t  = _mm_slli_si128( v1, 2 ) | _mm_srli_si128( v2, 30 ); \
-           v2 = _mm_slli_si128( v2, 2 ) | _mm_srli_si128( v1, 30 ); \
+   __m128i t  = _mm_slli_si128( v1, 2 ) | _mm_srli_si128( v2, 14 ); \
+           v2 = _mm_slli_si128( v2, 2 ) | _mm_srli_si128( v1, 14 ); \
           v1 = t; \
 } while(0)

 #define mm128_ror1x8_256( v1, v2 ) \
 do { \
-   __m128i t  = _mm_srli_si128( v1, 1 ) | _mm_slli_si128( v2, 31 ); \
-           v2 = _mm_srli_si128( v2, 1 ) | _mm_slli_si128( v1, 31 ); \
+   __m128i t  = _mm_srli_si128( v1, 1 ) | _mm_slli_si128( v2, 15 ); \
+           v2 = _mm_srli_si128( v2, 1 ) | _mm_slli_si128( v1, 15 ); \
           v1 = t; \
 } while(0)

 #define mm128_rol1x8_256( v1, v2 ) \
 do { \
-   __m128i t  = _mm_slli_si128( v1, 1 ) | _mm_srli_si128( v2, 31 ); \
-           v2 = _mm_slli_si128( v2, 1 ) | _mm_srli_si128( v1, 31 ); \
+   __m128i t  = _mm_slli_si128( v1, 1 ) | _mm_srli_si128( v2, 15 ); \
+           v2 = _mm_slli_si128( v2, 1 ) | _mm_srli_si128( v1, 15 ); \
           v1 = t; \
 } while(0)

@@ -1919,6 +1997,7 @@ static inline __m64 mmx_compile_test( __m64 a )
    m = _mm_shuffle_pi8( m, (__m64)0x0102030405060708 );
    i = (uint64_t) mm64_ror_32( (__m64)i, 7 );
    casti_m64( n, 2 ) = m;
+    m = (__m64)__builtin_bswap64( (uint64_t)m );
    return a;
 }

--- a/22
+++ b/22
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.1.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.2.4.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.9.1'
-PACKAGE_STRING='cpuminer-opt 3.9.1'
+PACKAGE_VERSION='3.9.2.4'
+PACKAGE_STRING='cpuminer-opt 3.9.2.4'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.9.1 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.9.2.4 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.9.1:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.9.2.4:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.9.1
+cpuminer-opt configure 3.9.2.4
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.9.1, which was
+It was created by cpuminer-opt $as_me 3.9.2.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.9.1'
+ VERSION='3.9.2.4'


 cat >>confdefs.h <<_ACEOF
@@ -5884,7 +5884,7 @@ fi


 # GC2 for GNU static
-if test "x$OS" = "xWindows_NT" ; then
+if test "x$have_win32" = "xtrue" ; then
   # MinGW
   { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
 $as_echo_n "checking for pthread_create in -lpthread... " >&6; }
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.9.1, which was
+This file was extended by cpuminer-opt $as_me 3.9.2.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.9.1
+cpuminer-opt config.status 3.9.2.4
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.9.1])
+AC_INIT([cpuminer-opt], [3.9.2.4])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
@@ -106,7 +106,7 @@ fi
 AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)

 # GC2 for GNU static
-if test "x$OS" = "xWindows_NT" ; then
+if test "x$have_win32" = "xtrue" ; then
   # MinGW
   AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",[])
 else
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -105,10 +105,12 @@ enum algos opt_algo = ALGO_NULL;
 int opt_scrypt_n = 0;
 int opt_pluck_n = 128;
 int opt_n_threads = 0;
-#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
-__int128_t opt_affinity = -1LL;
+// Windows doesn't support 128 bit affinity mask.
+#if defined(__linux) && defined(GCC_INT128)  
+#define AFFINITY_USES_UINT128 1
+uint128_t opt_affinity = -1LL;
 #else
-int64_t opt_affinity = -1LL;
+uint64_t opt_affinity = -1LL;
 #endif
 int opt_priority = 0;
 int num_cpus = 1;
@@ -203,7 +205,8 @@ static inline void drop_policy(void)
 #define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */
 #endif

-#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
+// Linux affinity can use int128.
+#if AFFINITY_USES_UINT128
 static void affine_to_cpu_mask( int id, unsigned __int128 mask )
 #else
 static void affine_to_cpu_mask( int id, unsigned long long mask )
@@ -216,7 +219,7 @@ static void affine_to_cpu_mask( int id, unsigned long long mask )
   for ( uint8_t i = 0; i < ncpus; i++ ) 
   {
      // cpu mask
-#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
+#if AFFINITY_USES_UINT128
      if( ( mask & ( (unsigned __int128)1ULL << i ) ) )  CPU_SET( i, &set );
 #else
      if( (ncpus > 64) || ( mask & (1ULL << i) ) )  CPU_SET( i, &set );
@@ -237,6 +240,7 @@ static void affine_to_cpu_mask( int id, unsigned long long mask )
 #elif defined(WIN32) /* Windows */
 static inline void drop_policy(void) { }

+// Windows CPU groups to manage more than 64 CPUs.
 static void affine_to_cpu_mask( int id, unsigned long mask )
 {
   bool success;
@@ -263,7 +267,7 @@ static void affine_to_cpu_mask( int id, unsigned long mask )
 	      break;

  	   cpu -= cpus;
-         }
+   }

 	if (opt_debug)
 	applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)", id, cpu, group, (1ULL << cpu));
@@ -847,7 +851,8 @@ static int share_result( int result, struct work *work, const char *reason )
   float rate;
   char rate_s[8] = {0};
   double sharediff = work ? work->sharediff : stratum.sharediff;
-   bool solved = result && (net_diff > 0.0 ) && ( sharediff >= net_diff );
+   bool solved = result && accepted_share_count && (net_diff > 0.0 )
+	         && ( sharediff >= net_diff );
   char sol[32] = {0};
   int i;

@@ -857,15 +862,17 @@ static int share_result( int result, struct work *work, const char *reason )
       hashcount += thr_hashcount[i];
       hashrate += thr_hashrates[i];
   }
+   solved = result && ( (uint64_t)hashcount > 0 )  && (net_diff > 0.0 )
+                                             && ( sharediff >= net_diff );
   result ? accepted_share_count++ : rejected_share_count++;

   if ( solved )
   {
      solved_block_count++;
      if ( use_colors )
-         sprintf( sol, CL_GRN " Solved" CL_WHT " %d", solved_block_count );   
+         sprintf( sol, CL_GRN " Solved: %d" CL_WHT, solved_block_count );   
      else
-         sprintf( sol, " Solved %d", solved_block_count ); 
+         sprintf( sol, ", Solved: %d", solved_block_count ); 
   }

   pthread_mutex_unlock(&stats_lock);
@@ -1839,26 +1846,42 @@ static void *miner_thread( void *userdata )
   }
   else
 */
+
   if ( num_cpus > 1 )
   {
-      if ( (opt_affinity == -1LL) && (opt_n_threads) > 1 ) 
-      {
-         if (opt_debug)
-            applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
-                   thr_id, thr_id % num_cpus, ( 1ULL << (thr_id % num_cpus) ) );
-#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
-         affine_to_cpu_mask( thr_id,
-                             (unsigned __int128)1LL << (thr_id % num_cpus) );
+#if AFFINITY_USES_UINT128
+       // Default affinity
+       if ( (opt_affinity == i128_neg1 ) && opt_n_threads > 1 )
+       {  
+         if ( opt_debug )
+            applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
+                    thr_id, thr_id % num_cpus,
+	                 i128_hi64( (uint128_t)1ULL << (thr_id % num_cpus) ),
+		              i128_lo64( (uint128_t)1ULL << (thr_id % num_cpus) ) );
+         affine_to_cpu_mask( thr_id, (uint128_t)1ULL << (thr_id % num_cpus) );
+       }
 #else
-         affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
-#endif
-      }
-      else if (opt_affinity != -1)
-      {
+       if ( (opt_affinity == -1LL) && opt_n_threads > 1 ) 
+       {
         if (opt_debug)
-             applog( LOG_DEBUG, "Binding thread %d to cpu mask %x",
-                                 thr_id, opt_affinity);
-         affine_to_cpu_mask( thr_id, opt_affinity );
+            applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
+                thr_id, thr_id % num_cpus, 1LL << (thr_id % num_cpus)) ;
+         affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
+       }
+#endif
+      else   // Custom affinity
+      {
+#if AFFINITY_USES_UINT128
+         if (opt_debug)
+             applog( LOG_DEBUG, "Binding thread %d to mask %016llx %016llx",
+                                thr_id, i128_hi64( opt_affinity ), 
+                                        i128_lo64( opt_affinity ) );
+#else
+         if (opt_debug)
+             applog( LOG_DEBUG, "Binding thread %d to mask %016llx",
+                                 thr_id, opt_affinity );
+#endif
+      affine_to_cpu_mask( thr_id, opt_affinity );
      }
   }

@@ -2894,13 +2917,21 @@ void parse_arg(int key, char *arg )
 		break;
 	case 1020:
 		p = strstr(arg, "0x");
-		if (p)
-			ul = strtoul(p, NULL, 16);
+		if ( p )
+			ul = strtoull( p, NULL, 16 );
 		else
-			ul = atol(arg);
-		if (ul > (1UL<<num_cpus)-1)
-			ul = -1;
-		opt_affinity = ul;
+			ul = atoll( arg );
+//		if ( ul > ( 1ULL << num_cpus ) - 1ULL )
+//			ul = -1LL;
+#if AFFINITY_USES_UINT128
+// replicate the low 64 bits to make a full 128 bit mask if there are more
+// than 64 CPUs, otherwise zero extend the upper half.
+                opt_affinity = (uint128_t)ul;
+                if ( num_cpus > 64 )
+                   opt_affinity = (opt_affinity << 64 ) | (uint128_t)ul;
+#else
+                   opt_affinity = ul;
+#endif
 		break;
 	case 1021:
 		v = atoi(arg);
@@ -3299,20 +3330,18 @@ int main(int argc, char *argv[])
 	}

 	if (!rpc_userpass)
-        {
+   {
 		rpc_userpass = (char*) malloc(strlen(rpc_user) + strlen(rpc_pass) + 2);
-                if (rpc_userpass)
-	           sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
-                else
-                   return 1;
+      if (rpc_userpass)
+          sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
+       else
+         return 1;
 	}

-        // All options must be set before starting the gate
-        if ( !register_algo_gate( opt_algo, &algo_gate ) )
-           exit(1);
+   // All options must be set before starting the gate
+   if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);

-        if ( !check_cpu_capability() )
-           exit(1);
+   if ( !check_cpu_capability() ) exit(1);

 	pthread_mutex_init(&stats_lock, NULL);
 	pthread_mutex_init(&g_work_lock, NULL);
@@ -3325,7 +3354,7 @@ int main(int argc, char *argv[])
 	        ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL)
 	        : CURL_GLOBAL_ALL;
 	if (curl_global_init(flags))
-        {
+   {
 		applog(LOG_ERR, "CURL initialization failed");
 		return 1;
 	}
@@ -3384,6 +3413,8 @@ int main(int argc, char *argv[])
   if ( num_cpus != opt_n_threads )   
     applog( LOG_INFO,"%u CPU cores available, %u miner threads selected.",
             num_cpus, opt_n_threads );
+
+// To be reviewed
   if ( opt_affinity != -1 )
   {
      if ( num_cpus > 64 )
--- a/interleave.h
+++ b/interleave.h
@@ -43,8 +43,127 @@
 //
 // AVX512: 4x128, 8x64, 16x32
 //
-// Interleaving and deinterleaving is done in blocks of 16*16, 32*32,
-// or 64*64 bytes for SSE2, AVX2 and AVX512 vectors respectively.
+// Interleaving and deinterleaving is done in blocks of 8*8, 16*16, 32*32,
+// or 64*64 bytes for MMX, SSE2, AVX2 and AVX512 vectors respectively.
+
+//////////////////////////////////////////////////////
+// 
+//          MMX 64 bit vectors
+
+#define mm64_put_32( s0, s1 ) \
+  _mm_set_pi32( *((const uint32_t*)(s1)), *((const uint32_t*)(s0)) )
+// put: pack words *s0 (low) and *s1 (high); get: pack s[i0] (low) and s[i1] (high).
+#define mm64_get_32( s, i0, i1 ) \
+  _mm_set_pi32( ((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
+
+// Interleave 2 lanes 32 bits at a time: each __m64 of d holds one s0 word (low) and one s1 word (high). len is bits per lane.
+static inline void mm64_interleave_2x32( void *d, const void *s0,
+                                         const void *s1, int len )
+{
+  casti_m64( d, 0 ) = mm64_put_32( s0    , s1     );
+  casti_m64( d, 1 ) = mm64_put_32( s0+  4, s1+  4 );
+  casti_m64( d, 2 ) = mm64_put_32( s0+  8, s1+  8 );
+  casti_m64( d, 3 ) = mm64_put_32( s0+ 12, s1+ 12 );
+  casti_m64( d, 4 ) = mm64_put_32( s0+ 16, s1+ 16 );
+  casti_m64( d, 5 ) = mm64_put_32( s0+ 20, s1+ 20 );
+  casti_m64( d, 6 ) = mm64_put_32( s0+ 24, s1+ 24 );
+  casti_m64( d, 7 ) = mm64_put_32( s0+ 28, s1+ 28 );
+
+  if ( len <= 256 ) return;   // 8 words (256 bits) of each lane are done
+
+  casti_m64( d, 8 ) = mm64_put_32( s0+ 32, s1+ 32 );
+  casti_m64( d, 9 ) = mm64_put_32( s0+ 36, s1+ 36 );
+  casti_m64( d,10 ) = mm64_put_32( s0+ 40, s1+ 40 );
+  casti_m64( d,11 ) = mm64_put_32( s0+ 44, s1+ 44 );
+  casti_m64( d,12 ) = mm64_put_32( s0+ 48, s1+ 48 );
+  casti_m64( d,13 ) = mm64_put_32( s0+ 52, s1+ 52 );
+  casti_m64( d,14 ) = mm64_put_32( s0+ 56, s1+ 56 );
+  casti_m64( d,15 ) = mm64_put_32( s0+ 60, s1+ 60 );
+
+  if ( len <= 512 ) return;   // 16 words (512 bits) of each lane are done
+
+  casti_m64( d,16 ) = mm64_put_32( s0+ 64, s1+ 64 );
+  casti_m64( d,17 ) = mm64_put_32( s0+ 68, s1+ 68 );
+  casti_m64( d,18 ) = mm64_put_32( s0+ 72, s1+ 72 );
+  casti_m64( d,19 ) = mm64_put_32( s0+ 76, s1+ 76 );
+
+  if ( len <= 640 ) return;   // 20 words (640 bits) of each lane are done
+
+  casti_m64( d,20 ) = mm64_put_32( s0+ 80, s1+ 80 );
+  casti_m64( d,21 ) = mm64_put_32( s0+ 84, s1+ 84 );
+  casti_m64( d,22 ) = mm64_put_32( s0+ 88, s1+ 88 );
+  casti_m64( d,23 ) = mm64_put_32( s0+ 92, s1+ 92 );
+  casti_m64( d,24 ) = mm64_put_32( s0+ 96, s1+ 96 );
+  casti_m64( d,25 ) = mm64_put_32( s0+100, s1+100 );
+  casti_m64( d,26 ) = mm64_put_32( s0+104, s1+104 );
+  casti_m64( d,27 ) = mm64_put_32( s0+108, s1+108 );
+  casti_m64( d,28 ) = mm64_put_32( s0+112, s1+112 );
+  casti_m64( d,29 ) = mm64_put_32( s0+116, s1+116 );
+  casti_m64( d,30 ) = mm64_put_32( s0+120, s1+120 );
+  casti_m64( d,31 ) = mm64_put_32( s0+124, s1+124 );  // 32 words (1024 bits) done
+}
+
+static inline void mm64_deinterleave_2x32( void *d00, void *d01,
+                                    const int n, const void *s, int len )
+{  // Split 2x32 interleaved s: even words go to d00, odd words to d01. len is bits per lane; n is not used.
+   casti_m64( d00,0 ) = mm64_get_32( s,  0,  2 );
+   casti_m64( d01,0 ) = mm64_get_32( s,  1,  3 );
+   casti_m64( d00,1 ) = mm64_get_32( s,  4,  6 );
+   casti_m64( d01,1 ) = mm64_get_32( s,  5,  7 );
+   casti_m64( d00,2 ) = mm64_get_32( s,  8, 10 );
+   casti_m64( d01,2 ) = mm64_get_32( s,  9, 11 );
+   casti_m64( d00,3 ) = mm64_get_32( s, 12, 14 );
+   casti_m64( d01,3 ) = mm64_get_32( s, 13, 15 );
+
+   if ( len <= 256 ) return;   // 8 words (256 bits) written to each output
+
+   casti_m64( d00,4 ) = mm64_get_32( s, 16, 18 );
+   casti_m64( d01,4 ) = mm64_get_32( s, 17, 19 );
+   casti_m64( d00,5 ) = mm64_get_32( s, 20, 22 );
+   casti_m64( d01,5 ) = mm64_get_32( s, 21, 23 );
+   casti_m64( d00,6 ) = mm64_get_32( s, 24, 26 );
+   casti_m64( d01,6 ) = mm64_get_32( s, 25, 27 );
+   casti_m64( d00,7 ) = mm64_get_32( s, 28, 30 );
+   casti_m64( d01,7 ) = mm64_get_32( s, 29, 31 );
+
+   if ( len <= 512 ) return;   // 16 words (512 bits) written to each output
+
+   casti_m64( d00,8 ) = mm64_get_32( s, 32, 34 );
+   casti_m64( d01,8 ) = mm64_get_32( s, 33, 35 );
+   casti_m64( d00,9 ) = mm64_get_32( s, 36, 38 );
+   casti_m64( d01,9 ) = mm64_get_32( s, 37, 39 );
+
+   if ( len <= 640 ) return;   // 20 words (640 bits) written to each output
+
+   casti_m64( d00,10 ) = mm64_get_32( s, 40, 42 );
+   casti_m64( d01,10 ) = mm64_get_32( s, 41, 43 );
+   casti_m64( d00,11 ) = mm64_get_32( s, 44, 46 );
+   casti_m64( d01,11 ) = mm64_get_32( s, 45, 47 );
+   casti_m64( d00,12 ) = mm64_get_32( s, 48, 50 );
+   casti_m64( d01,12 ) = mm64_get_32( s, 49, 51 );
+   casti_m64( d00,13 ) = mm64_get_32( s, 52, 54 );
+   casti_m64( d01,13 ) = mm64_get_32( s, 53, 55 );
+   casti_m64( d00,14 ) = mm64_get_32( s, 56, 58 );
+   casti_m64( d01,14 ) = mm64_get_32( s, 57, 59 );
+   casti_m64( d00,15 ) = mm64_get_32( s, 60, 62 );
+   casti_m64( d01,15 ) = mm64_get_32( s, 61, 63 );   // 32 words (1024 bits) done
+}
+
+static inline void mm64_extract_lane_2x32( void *d, const void *s,
+                                         const int lane, const int bit_len )
+{ // Copy one lane (0 or 1) of 2x32 interleaved s to d; word k of a lane is s[2*k+lane].
+  casti_m64( d, 0 ) = mm64_get_32( s, lane   , lane+ 2 );
+  casti_m64( d, 1 ) = mm64_get_32( s, lane+ 4, lane+ 6 );
+  casti_m64( d, 2 ) = mm64_get_32( s, lane+ 8, lane+10 );
+  casti_m64( d, 3 ) = mm64_get_32( s, lane+12, lane+14 );
+  // 8 words (256 bits) extracted; continue only for a 512 bit lane.
+  if ( bit_len <= 256 ) return;
+  casti_m64( d, 4 ) = mm64_get_32( s, lane+16, lane+18 );
+  casti_m64( d, 5 ) = mm64_get_32( s, lane+20, lane+22 );
+  casti_m64( d, 6 ) = mm64_get_32( s, lane+24, lane+26 );
+  casti_m64( d, 7 ) = mm64_get_32( s, lane+28, lane+30 );
+  // bit_len == 512: 16 words extracted, stride 2 matches mm64_deinterleave_2x32
+}


 ///////////////////////////////////////////////////////////////
@@ -356,6 +475,36 @@ static inline void mm256_interleave_8x32x256( void *d, const void *s00,
                                       s04+28, s05+28, s06+28, s07+28 );
 }

+static inline void mm256_be_interleave_8x32x256( void *d, const void *s00,
+       const void *s01, const void *s02, const void *s03, const void *s04,
+       const void *s05, const void *s06, const void *s07 )
+{  // 8x32 interleave of 256 bits per lane; every row goes through mm256_bswap_32.
+   __m256i row = mm256_put_32( s00,    s01,    s02,    s03,
+                               s04,    s05,    s06,    s07    );
+   casti_m256i( d, 0 ) = mm256_bswap_32( row );
+   row = mm256_put_32( s00+ 4, s01+ 4, s02+ 4, s03+ 4,
+                       s04+ 4, s05+ 4, s06+ 4, s07+ 4 );
+   casti_m256i( d, 1 ) = mm256_bswap_32( row );
+   row = mm256_put_32( s00+ 8, s01+ 8, s02+ 8, s03+ 8,
+                       s04+ 8, s05+ 8, s06+ 8, s07+ 8 );
+   casti_m256i( d, 2 ) = mm256_bswap_32( row );
+   row = mm256_put_32( s00+12, s01+12, s02+12, s03+12,
+                       s04+12, s05+12, s06+12, s07+12 );
+   casti_m256i( d, 3 ) = mm256_bswap_32( row );
+   row = mm256_put_32( s00+16, s01+16, s02+16, s03+16,
+                       s04+16, s05+16, s06+16, s07+16 );
+   casti_m256i( d, 4 ) = mm256_bswap_32( row );
+   row = mm256_put_32( s00+20, s01+20, s02+20, s03+20,
+                       s04+20, s05+20, s06+20, s07+20 );
+   casti_m256i( d, 5 ) = mm256_bswap_32( row );
+   row = mm256_put_32( s00+24, s01+24, s02+24, s03+24,
+                       s04+24, s05+24, s06+24, s07+24 );
+   casti_m256i( d, 6 ) = mm256_bswap_32( row );
+   row = mm256_put_32( s00+28, s01+28, s02+28, s03+28,
+                       s04+28, s05+28, s06+28, s07+28 );
+   casti_m256i( d, 7 ) = mm256_bswap_32( row );
+}
+
 static inline void mm256_interleave_8x32x128( void *d, const void *s00,
     const void *s01, const void *s02, const void *s03, const void *s04,
     const void *s05, const void *s06, const void *s07 )
@@ -370,6 +519,24 @@ static inline void mm256_interleave_8x32x128( void *d, const void *s00,
                                       s04+12, s05+12, s06+12, s07+12 );
 }

+static inline void mm256_be_interleave_8x32x128( void *d, const void *s00,
+       const void *s01, const void *s02, const void *s03, const void *s04,
+       const void *s05, const void *s06, const void *s07 )
+{  // 8x32 interleave of 128 bits per lane; every row goes through mm256_bswap_32.
+   __m256i row = mm256_put_32( s00,    s01,    s02,    s03,
+                               s04,    s05,    s06,    s07    );
+   casti_m256i( d, 0 ) = mm256_bswap_32( row );
+   row = mm256_put_32( s00+ 4, s01+ 4, s02+ 4, s03+ 4,
+                       s04+ 4, s05+ 4, s06+ 4, s07+ 4 );
+   casti_m256i( d, 1 ) = mm256_bswap_32( row );
+   row = mm256_put_32( s00+ 8, s01+ 8, s02+ 8, s03+ 8,
+                       s04+ 8, s05+ 8, s06+ 8, s07+ 8 );
+   casti_m256i( d, 2 ) = mm256_bswap_32( row );
+   row = mm256_put_32( s00+12, s01+12, s02+12, s03+12,
+                       s04+12, s05+12, s06+12, s07+12 );
+   casti_m256i( d, 3 ) = mm256_bswap_32( row );
+}
+
 // can be called directly for 32 byte hash using AVX2
 static inline void mm256_deinterleave_8x32x256( void *d00, void *d01,
 	       void *d02, void *d03, void *d04, void *d05, void *d06,
@@ -394,6 +561,21 @@ static inline void mm256_interleave_4x64x256( void *d, const void *s0,
  casti_m256i( d,3 ) = mm256_put_64( s0+24, s1+24, s2+24, s3+24 );
 }

+// Interleave 4 lanes of 256 bits in 64 bit units, applying mm256_bswap_32 to
+// each row on the way. "be" is a bit of a misnomer, but it is nice and short.
+static inline void mm256_be_interleave_4x64x256( void *d, const void *s0,
+                       const void *s1, const void *s2, const void *s3 )
+{
+  __m256i row = mm256_put_64( s0,    s1,    s2,    s3    );
+  casti_m256i( d,0 ) = mm256_bswap_32( row );
+  row = mm256_put_64( s0+ 8, s1+ 8, s2+ 8, s3+ 8 );
+  casti_m256i( d,1 ) = mm256_bswap_32( row );
+  row = mm256_put_64( s0+16, s1+16, s2+16, s3+16 );
+  casti_m256i( d,2 ) = mm256_bswap_32( row );
+  row = mm256_put_64( s0+24, s1+24, s2+24, s3+24 );
+  casti_m256i( d,3 ) = mm256_bswap_32( row );
+}
+
 static inline void mm256_interleave_4x64x128( void *d, const void *s0,
                       const void *s1, const void *s2, const void *s3 )
 {
@@ -401,6 +583,14 @@ static inline void mm256_interleave_4x64x128( void *d, const void *s0,
  casti_m256i( d,1 ) = mm256_put_64( s0+ 8, s1+ 8, s2+ 8, s3+ 8 );
 }

+static inline void mm256_be_interleave_4x64x128( void *d, const void *s0,
+                       const void *s1, const void *s2, const void *s3 )
+{  // 4x64 interleave of 128 bits per lane; each row goes through mm256_bswap_32.
+  __m256i row = mm256_put_64( s0,    s1,    s2,    s3    );
+  casti_m256i( d,0 ) = mm256_bswap_32( row );
+  row = mm256_put_64( s0+ 8, s1+ 8, s2+ 8, s3+ 8 );
+  casti_m256i( d,1 ) = mm256_bswap_32( row );
+}

 // 4 lanes of 256 bits using 64 bit interleaving (standard final hash size)
 static inline void mm256_deinterleave_4x64x256( void *d0, void *d1, void *d2,
@@ -496,6 +686,28 @@ static inline void mm256_interleave_8x32( void *d, const void *s0,
   // bit_len == 1024
 }

+static inline void mm256_be_interleave_8x32( void *d, const void *s0,
+        const void *s1, const void *s2, const void *s3, const void *s4,
+        const void *s5, const void *s6, const void *s7, int bit_len )
+{
+   // The first 256 bits of every lane are always interleaved.
+   mm256_be_interleave_8x32x256( d, s0, s1, s2, s3, s4, s5, s6, s7 );
+   if ( bit_len > 256 )   // second 256 bit slice of every lane
+      mm256_be_interleave_8x32x256( d+256, s0+32, s1+32, s2+32, s3+32,
+                                           s4+32, s5+32, s6+32, s7+32 );
+   if ( bit_len <= 512 ) return;
+   if ( bit_len > 640 )
+   {  // bit_len == 1024: two more full 256 bit slices
+      mm256_be_interleave_8x32x256( d+512, s0+64, s1+64, s2+64, s3+64,
+                                           s4+64, s5+64, s6+64, s7+64 );
+      mm256_be_interleave_8x32x256( d+768, s0+96, s1+96, s2+96, s3+96,
+                                           s4+96, s5+96, s6+96, s7+96 );
+   }
+   else   // bit_len <= 640: only a 128 bit tail remains
+      mm256_be_interleave_8x32x128( d+512, s0+64, s1+64, s2+64, s3+64,
+                                           s4+64, s5+64, s6+64, s7+64 );
+}
+
 /*
 // Slower but it works with 32 bit data
 // bit_len must be multiple of 32
@@ -595,6 +807,23 @@ static inline void mm256_interleave_4x64( void *d, const void *s0,
  mm256_interleave_4x64x256( d+384, s0+96, s1+96, s2+96, s3+96 );
 }

+static inline void mm256_be_interleave_4x64( void *d, const void *s0,
+            const void *s1, const void *s2, const void *s3, int bit_len )
+{
+  // The first 256 bits of every lane are always interleaved.
+  mm256_be_interleave_4x64x256( d, s0, s1, s2, s3 );
+  if ( bit_len > 256 )
+    mm256_be_interleave_4x64x256( d+128, s0+32, s1+32, s2+32, s3+32 );
+  if ( bit_len <= 512 ) return;
+  if ( bit_len > 640 )
+  {  // bit_len == 1024
+    mm256_be_interleave_4x64x256( d+256, s0+64, s1+64, s2+64, s3+64 );
+    mm256_be_interleave_4x64x256( d+384, s0+96, s1+96, s2+96, s3+96 );
+  }
+  else  // bit_len <= 640: only a 128 bit tail remains
+    mm256_be_interleave_4x64x128( d+256, s0+64, s1+64, s2+64, s3+64 );
+}
+
 /*
 // Slower version
 // bit_len must be multiple of 64
@@ -676,7 +905,9 @@ static inline void mm256_extract_lane_4x64( void *d, const void *s,

 // Convert from 4x32 SSE2 interleaving to 4x64 AVX2.
 // Can't do it in place
-static inline void mm256_reinterleave_4x64( void *dst, void *src, int  bit_len )
+#define mm256_reinterleave_4x64 mm256_reinterleave_4x32_4x64
+static inline void mm256_reinterleave_4x32_4x64( void *dst, void *src,
+	                                         int  bit_len )
 {
   __m256i* d = (__m256i*)dst;
   uint32_t *s = (uint32_t*)src;
@@ -736,7 +967,9 @@ static inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,

 // Convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
 // bit_len must be multiple of 64
-static inline void mm256_reinterleave_4x32( void *dst, void *src, int  bit_len )
+#define mm256_reinterleave_4x32 mm256_reinterleave_4x64_4x32
+static inline void mm256_reinterleave_4x64_4x32( void *dst, void *src,
+	                                         int  bit_len )
 {
   __m256i  *d = (__m256i*)dst;
   uint32_t *s = (uint32_t*)src;
@@ -862,7 +1095,8 @@ static inline void mm_reinterleave_4x32( void *dst, void *src, int  bit_len )
 }
 */

-static inline void mm256_interleave_2x128( const void *d, const void *s0,
+#define mm256_interleave_2x128 mm256_interleave_1x128
+static inline void mm256_interleave_1x128( const void *d, const void *s0,
 	                                   void *s1, const int bit_len )
 {
  casti_m256i( d, 0 ) = mm256_put_64( s0   , s0+ 8, s1   , s1+ 8 );
@@ -879,7 +1113,8 @@ static inline void mm256_interleave_2x128( const void *d, const void *s0,
  // bit_len == 1024
 }

-static inline void mm256_deinterleave_2x128( void *d0, void *d1, void *s,
+#define mm256_deinterleave_2x128 mm256_deinterleave_1x128
+static inline void mm256_deinterleave_1x128( void *d0, void *d1, void *s,
                                             int bit_len )
 {
   mm256_deinterleave_2x128x256( d0, d1, 0, s );
@@ -1078,38 +1313,38 @@ static inline void mm512_deinterleave_16x32x512( void *d00, void *d01,
                void *d12, void *d13, void *d14, void *d15, const int n,
 		const void *s )
 {
- casti_m512i(d00,n) = mm512_get_32( s,   0,  16,  32,  48,  64,  80,  96, 112,
-		                       128, 144, 160, 176, 192, 208, 224, 240 );
- casti_m512i(d01,n) = mm512_get_32( s,   1,  17,  33,  49,  65,  81,  97, 113,
-		                       129, 145, 161, 177, 193, 209, 225, 241 );
- casti_m512i(d02,n) = mm512_get_32( s,   2,  18,  34,  50,  66,  82,  98, 114,
-				       130, 146, 162, 178, 194, 210, 226, 242 );
- casti_m512i(d03,n) = mm512_get_32( s,   3,  19,  35,  51,  67,  83,  99, 115,
-                                       131, 147, 163, 179, 195, 211, 227, 243 );
- casti_m512i(d04,n) = mm512_get_32( s,   4,  20,  36,  52,  68,  84, 100, 116,
-		                       132, 148, 164, 180, 196, 212, 228, 244 );
- casti_m512i(d05,n) = mm512_get_32( s,   5,  21,  37,  53,  69,  85, 101, 117,
-                                       133, 149, 165, 181, 197, 213, 229, 245 );
- casti_m512i(d06,n) = mm512_get_32( s,   6,  22,  38,  54,  70,  86, 102, 118,
-                                       134, 150, 166, 182, 198, 214, 230, 246 );
- casti_m512i(d07,n) = mm512_get_32( s,   7,  23,  39,  55,  71,  87, 103, 119,
-		                       135, 151, 167, 183, 199, 215, 231, 247 );
- casti_m512i(d08,n) = mm512_get_32( s,   8,  24,  40,  56,  72,  88, 104, 120,
-		                       136, 152, 168, 184, 200, 216, 232, 248 );
- casti_m512i(d09,n) = mm512_get_32( s,   9,  25,  41,  57,  73,  89, 105, 121,
-		                       137, 153, 169, 185, 201, 217, 233, 249 );
- casti_m512i(d10,n) = mm512_get_32( s,  10,  26,  42,  58,  74,  90, 106, 122,
-		                       138, 154, 170, 186, 202, 218, 234, 250 );
- casti_m512i(d11,n) = mm512_get_32( s,  11,  27,  43,  59,  75,  91, 107, 123,
-		                       139, 155, 171, 187, 203, 219, 235, 251 );
- casti_m512i(d12,n) = mm512_get_32( s,  12,  28,  44,  60,  76,  92, 108, 124,
-		                       140, 156, 172, 188, 204, 220, 236, 252 );
- casti_m512i(d13,n) = mm512_get_32( s,  13,  29,  45,  61,  77,  93, 109, 125,
-		                       141, 157, 173, 189, 205, 221, 237, 253 );
- casti_m512i(d14,n) = mm512_get_32( s,  14,  30,  46,  62,  78,  94, 110, 126,
-		                       142, 158, 174, 190, 206, 222, 238, 254 );
- casti_m512i(d15,n) = mm512_get_32( s,  15,  31,  47,  63,  79,  95, 111, 127,
-		                       143, 159, 175, 191, 207, 223, 239, 255 );
+   casti_m512i(d00,n) = mm512_get_32( s,  0, 16, 32, 48, 64, 80, 96,112,
+  		                        128,144,160,176,192,208,224,240 );
+   casti_m512i(d01,n) = mm512_get_32( s,  1, 17, 33, 49, 65, 81, 97,113,
+  		                        129,145,161,177,193,209,225,241 );
+   casti_m512i(d02,n) = mm512_get_32( s,  2, 18, 34, 50, 66, 82, 98,114,
+  				        130,146,162,178,194,210,226,242 );
+   casti_m512i(d03,n) = mm512_get_32( s,  3, 19, 35, 51, 67, 83, 99,115,
+                                        131,147,163,179,195,211,227,243 );
+   casti_m512i(d04,n) = mm512_get_32( s,  4, 20, 36, 52, 68, 84,100,116,
+		                        132,148,164,180,196,212,228,244 );
+   casti_m512i(d05,n) = mm512_get_32( s,  5, 21, 37, 53, 69, 85,101,117,
+                                        133,149,165,181,197,213,229,245 );
+   casti_m512i(d06,n) = mm512_get_32( s,  6, 22, 38, 54, 70, 86,102,118,
+                                        134,150,166,182,198,214,230,246 );
+   casti_m512i(d07,n) = mm512_get_32( s,  7, 23, 39, 55, 71, 87,103,119,
+		                        135,151,167,183,199,215,231,247 );
+   casti_m512i(d08,n) = mm512_get_32( s,  8, 24, 40, 56, 72, 88,104,120,
+		                        136,152,168,184,200,216,232,248 );
+   casti_m512i(d09,n) = mm512_get_32( s,  9, 25, 41, 57, 73, 89,105,121,
+		                        137,153,169,185,201,217,233,249 );
+   casti_m512i(d10,n) = mm512_get_32( s, 10, 26, 42, 58, 74, 90,106,122,
+		                        138,154,170,186,202,218,234,250 );
+   casti_m512i(d11,n) = mm512_get_32( s, 11, 27, 43, 59, 75, 91,107,123,
+		                        139,155,171,187,203,219,235,251 );
+   casti_m512i(d12,n) = mm512_get_32( s, 12, 28, 44, 60, 76, 92,108,124,
+		                        140,156,172,188,204,220,236,252 );
+   casti_m512i(d13,n) = mm512_get_32( s, 13, 29, 45, 61, 77, 93,109,125,
+		                        141,157,173,189,205,221,237,253 );
+   casti_m512i(d14,n) = mm512_get_32( s, 14, 30, 46, 62, 78, 94,110,126,
+	                                142,158,174,190,206,222,238,254 );
+   casti_m512i(d15,n) = mm512_get_32( s, 15, 31, 47, 63, 79, 95,111,127,
+           	                        143,159,175,191,207,223,239,255 );
 }

 static inline void mm512_interleave_8x64x512( void *d, const void *s0,
@@ -1363,6 +1598,99 @@ static inline void mm512_deinterleave_4x128( void *d0, void *d1, void *d2,
   mm512_deinterleave_4x128x512( d0, d1, d2, d3, 1, s+256 );
 }

+// Convert one 8x64 interleaved buffer into two 4x128 interleaved buffers.
+//
+// The source is read as rows of eight 64 bit words: src[ 8*w + l ] is
+// word w of column l. Columns 0..3 are written to dst0 and columns 4..7
+// to dst1. Each 512 bit destination vector k holds, for its four columns
+// in ascending order, the word pair ( 2k, 2k+1 ) of that column, which is
+// the exact inverse of the 4x128 -> 8x64 conversion.
+//
+// bit_len <= 512 writes 4 vectors to each destination, anything larger
+// writes 8 (1024 bits per column).
+//
+// NOTE(review): destinations are stored through __m512i pointers, so they
+// are presumably expected to be 64 byte aligned - confirm with callers.
+static inline void mm512_reinterleave_8x64_4x128( void *dst0, void *dst1,
+                                              const void *src, int  bit_len )
+{
+   __m512i *d0 = (__m512i*)dst0;
+   __m512i *d1 = (__m512i*)dst1;
+   const uint64_t *s = (const uint64_t*)src;
+   const int nvec = ( bit_len <= 512 ) ? 4 : 8;
+
+   for ( int k = 0; k < nvec; k++ )
+   {
+      // Two consecutive source rows (16 words) feed one vector of each
+      // destination: w[0..7] is row 2k, w[8..15] is row 2k+1.
+      const uint64_t *w = s + 16*k;
+
+      // _mm512_set_epi64 takes its arguments from the highest element down.
+      d0[k] = _mm512_set_epi64( w[11], w[ 3], w[10], w[ 2],
+                                w[ 9], w[ 1], w[ 8], w[ 0] );
+      d1[k] = _mm512_set_epi64( w[15], w[ 7], w[14], w[ 6],
+                                w[13], w[ 5], w[12], w[ 4] );
+   }
+}
+
+// Convert two 4x128 interleaved buffers into one 8x64 interleaved buffer.
+//
+// Each source is read as groups of eight 64 bit words, where group k holds
+// the word pair ( 2k, 2k+1 ) of four columns: s[ 8*k + 2*c + j ] is word
+// 2k+j of column c. src0 supplies output columns 0..3 and src1 supplies
+// output columns 4..7. Output vector w holds word w of all eight columns.
+//
+// bit_len <= 512 writes d[0..7], anything larger writes d[0..15]
+// (1024 bits per column).
+//
+// NOTE(review): dst is stored through a __m512i pointer, so it is
+// presumably expected to be 64 byte aligned - confirm with callers.
+static inline void mm512_reinterleave_4x128_8x64( void *dst, const void *src0,
+                                              const void *src1, int  bit_len )
+{
+   __m512i *d = (__m512i*)dst;
+   const uint64_t *s0 = (const uint64_t*)src0;
+   const uint64_t *s1 = (const uint64_t*)src1;
+   const int ngroups = ( bit_len <= 512 ) ? 4 : 8;
+
+   for ( int k = 0; k < ngroups; k++ )
+   {
+      const uint64_t *lo = s0 + 8*k;   // columns 0..3, words 2k and 2k+1
+      const uint64_t *hi = s1 + 8*k;   // columns 4..7, words 2k and 2k+1
+
+      // Even source elements are word 2k, odd elements are word 2k+1.
+      // _mm512_set_epi64 takes its arguments from the highest element down.
+      d[ 2*k   ] = _mm512_set_epi64( hi[6], hi[4], hi[2], hi[0],
+                                     lo[6], lo[4], lo[2], lo[0] );
+      d[ 2*k+1 ] = _mm512_set_epi64( hi[7], hi[5], hi[3], hi[1],
+                                     lo[7], lo[5], lo[3], lo[1] );
+   }
+}
+
 static inline void mm512_extract_lane_4x128( void *d, const void *s,
                                            const int lane, const int bit_len )
 {
--- a/miner.h
+++ b/miner.h
@@ -538,6 +538,7 @@ enum algos {
        ALGO_SCRYPTJANE,
        ALGO_SHA256D,
        ALGO_SHA256T,
+        ALGO_SHA256Q,
        ALGO_SHAVITE3,    
        ALGO_SKEIN,       
        ALGO_SKEIN2,      
@@ -625,6 +626,7 @@ static const char* const algo_names[] = {
        "scryptjane",
        "sha256d",
        "sha256t",
+        "sha256q",
        "shavite3",
        "skein",
        "skein2",
@@ -774,7 +776,8 @@ Options:\n\
                          scryptjane:nf\n\
                          sha256d       Double SHA-256\n\
                          sha256t       Triple SHA-256, Onecoin (OC)\n\
-                          shavite3      Shavite3\n\
+                          sha256q       Quad SHA-256, Pyrite (PYE)\n\
+			  shavite3      Shavite3\n\
                          skein         Skein+Sha (Skeincoin)\n\
                          skein2        Double Skein (Woodcoin)\n\
                          skunk         Signatum (SIGT)\n\
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -19,7 +19,7 @@ export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/open
 ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h

 # edit configure to fix pthread lib name for Windows.
-sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
+#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac

 # make release directory and copy selected DLLs.
 mkdir release
Author	SHA1	Message	Date
Jay D Dee	7fec680835	v3.9.2.4	2019-06-07 23:30:38 -04:00
Jay D Dee	1b0a5aadf6	v3.9.2.3	2019-06-05 12:20:04 -04:00
Jay D Dee	0a3c52810e	v3.9.2.2	2019-06-04 17:14:03 -04:00
Jay D Dee	4d4386a374	v3.9.2.1	2019-06-04 16:56:44 -04:00
Jay D Dee	ce259b915a	v3.9.2	2019-06-03 21:36:33 -04:00
Jay D Dee	02202ab803	v3.9.1.1	2019-05-31 13:20:12 -04:00