From ce259b915a714bb8fce482d0e6e78934df232f56 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Mon, 3 Jun 2019 21:36:33 -0400 Subject: [PATCH] v3.9.2 --- Makefile.am | 7 +- README.txt | 4 +- RELEASE_NOTES | 13 +- algo-gate-api.c | 1 + algo/bmw/bmw-hash-4way.h | 71 +- .../{bmw-hash-4way.c => bmw256-hash-4way.c} | 1095 ++++++++-------- algo/bmw/bmw512-hash-4way.c | 1109 ++++++++++++++++ algo/lyra2/allium-4way.c | 41 +- algo/lyra2/allium.c | 3 +- algo/lyra2/lyra2-gate.h | 22 +- algo/lyra2/lyra2h-4way.c | 3 +- algo/lyra2/lyra2h.c | 3 +- algo/lyra2/lyra2re.c | 5 +- algo/lyra2/lyra2rev2-4way.c | 3 +- algo/lyra2/lyra2rev2.c | 3 +- algo/lyra2/lyra2rev3-4way.c | 1 - algo/lyra2/lyra2z-4way.c | 75 +- algo/lyra2/lyra2z.c | 3 +- algo/lyra2/lyra2z330.c | 3 +- algo/lyra2/phi2.c | 68 +- algo/lyra2/sponge.h | 4 +- algo/sha/sha256q-4way.c | 219 ++++ algo/sha/sha256q.c | 113 ++ algo/sha/sha256t-4way.c | 6 +- algo/sha/sha256t-gate.c | 26 +- algo/sha/sha256t-gate.h | 12 +- algo/sha/sha256t.c | 15 +- algo/shavite/shavite-hash-2way.c | 24 +- algo/x17/sonoa-4way.c | 38 +- algo/x17/sonoa-gate.c | 2 +- algo/x17/sonoa-gate.h | 2 +- algo/x17/x17-4way.c | 11 +- algo/x17/xevan-4way.c | 316 ++--- algo/x17/xevan-gate.c | 2 +- algo/x17/xevan-gate.h | 6 +- algo/x17/xevan.c | 4 +- algo/yescrypt/sha256_Y.c | 24 +- algo/yescrypt/sha256_Y.h | 8 +- algo/yescrypt/yescrypt-simd.c | 10 +- algo/yescrypt/yescrypt.c | 3 +- algo/yespower/sha256-avx2.c | 646 ---------- algo/yespower/sha256.c | 680 ---------- algo/yespower/sha256.c.new | 672 ---------- algo/yespower/sha256.h | 129 -- algo/yespower/sha256.h.new | 134 -- algo/yespower/sha256_p.c | 218 ++++ algo/yespower/sha256_p.c.sha | 496 ------- algo/yespower/{sha256_p.h.sha => sha256_p.h} | 39 +- algo/yespower/yespower-opt.c | 5 +- algo/yespower/yespower-opt.c.sha | 1147 ----------------- algo/yespower/yespower-ref.c | 5 +- algo/yespower/yespower.c | 3 +- avxdefs.h | 32 +- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 9 +- interleave.h | 281 +++- miner.h | 5 +- 58 files changed, 2969 insertions(+), 4932 deletions(-) rename algo/bmw/{bmw-hash-4way.c => bmw256-hash-4way.c} (66%) create mode 100644 algo/bmw/bmw512-hash-4way.c create mode 100644 algo/sha/sha256q-4way.c create mode 100644 algo/sha/sha256q.c delete mode 100644 algo/yespower/sha256-avx2.c delete mode 100644 algo/yespower/sha256.c delete mode 100644 algo/yespower/sha256.c.new delete mode 100644 algo/yespower/sha256.h delete mode 100644 algo/yespower/sha256.h.new create mode 100644 algo/yespower/sha256_p.c delete mode 100644 algo/yespower/sha256_p.c.sha rename algo/yespower/{sha256_p.h.sha => sha256_p.h} (66%) delete mode 100644 algo/yespower/yespower-opt.c.sha diff --git a/Makefile.am b/Makefile.am index 96a22d2..e0f5a3a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -68,7 +68,8 @@ cpuminer_SOURCES = \ algo/blake/pentablake-4way.c \ algo/blake/pentablake.c \ algo/bmw/sph_bmw.c \ - algo/bmw/bmw-hash-4way.c \ + algo/bmw/bmw256-hash-4way.c \ + algo/bmw/bmw512-hash-4way.c \ algo/bmw/bmw256.c \ algo/cryptonight/cryptolight.c \ algo/cryptonight/cryptonight-common.c\ @@ -166,6 +167,8 @@ cpuminer_SOURCES = \ algo/sha/sha256t-gate.c \ algo/sha/sha256t-4way.c \ algo/sha/sha256t.c \ + algo/sha/sha256q-4way.c \ + algo/sha/sha256q.c \ algo/shabal/sph_shabal.c \ algo/shabal/shabal-hash-4way.c \ algo/shavite/sph_shavite.c \ @@ -262,7 +265,7 @@ cpuminer_SOURCES = \ algo/yescrypt/sha256_Y.c \ algo/yescrypt/yescrypt-best.c \ algo/yespower/yespower.c \ - algo/yespower/sha256.c \ + algo/yespower/sha256_p.c \ algo/yespower/yespower-opt.c 
disable_flags = diff --git a/README.txt b/README.txt index ebe9be9..d7bca62 100644 --- a/README.txt +++ b/README.txt @@ -12,7 +12,7 @@ the software, don't use it. Choose the exe that best matches you CPU's features or use trial and error to find the fastest one that doesn't crash. Pay attention to the features listed at cpuminer startup to ensure you are mining at -optimum speed using all the available features. +optimum speed using the best available features. Architecture names and compile options used are only provided for Intel Core series. Even the newest Pentium and Celeron CPUs are often missing @@ -22,8 +22,6 @@ AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not supported by cpuminer-opt due to an incompatible implementation of SSE2 on these CPUs. Some algos may crash the miner with an invalid instruction. Users are recommended to use an unoptimized miner such as cpuminer-multi. -Changes in v3.8.4 may have improved compatibility with some of these CPUs. - Exe name Compile flags Arch name diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 049dc9b..c4dac86 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -33,11 +33,20 @@ Requirements Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not supported. -64 bit Linux or Windows operating system. Apple is not supported. +64 bit Linux or Windows operating system. Apple and Android are not supported. Change Log ---------- +v3.9.2 + +Added sha256q algo. +Yespower now uses openssl SHA256, but no observable hash rate increase +on Ryzen. +Ongoing rearchitecting. +Lyra2z now hashes 8-way on CPUs with AVX2. +Lyra2 (all including phi2) now runs optimized code with SSE2. + v3.9.1.1 Fixed lyra2v3 AVX and below. @@ -45,7 +54,7 @@ Fixed lyra2v3 AVX and below. Compiling on Windows using Cygwin now works. Simply use "./build.sh" just like on Linux. It isn't portable therefore the binaries package will continue to use the existing procedure. -The Cygwin procedfure will be documented in more detail later and will +The Cygwin procedure will be documented in more detail later and will include a list of packages that need to be installed. 
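As a point of reference for the "Added sha256q algo" entry in the v3.9.2 change log above: sha256q is a quadruple SHA-256 chain, the four-pass analogue of the existing sha256t (triple) algo. The scalar sketch below only illustrates that idea using OpenSSL's one-shot SHA256() helper; the function name and the 80-byte block-header input length are assumptions for illustration, not taken from this patch. The code actually added here is the vectorized algo/sha/sha256q.c and algo/sha/sha256q-4way.c.

#include <stdint.h>
#include <string.h>
#include <openssl/sha.h>

/* Illustrative scalar sha256q: four chained SHA-256 passes over an
   assumed 80-byte block header, producing a 32-byte hash. Not the
   vectorized implementation added by this patch. */
static void sha256q_hash_sketch( const void *input, void *output )
{
    uint8_t hash[32];
    SHA256( input, 80, hash );   /* pass 1: hash the raw header */
    SHA256( hash, 32, hash );    /* pass 2 */
    SHA256( hash, 32, hash );    /* pass 3 */
    SHA256( hash, 32, hash );    /* pass 4 */
    memcpy( output, hash, 32 );
}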
v3.9.1 diff --git a/algo-gate-api.c b/algo-gate-api.c index 9948e08..b3c1a1e 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -210,6 +210,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break; case ALGO_SHA256D: register_sha256d_algo ( gate ); break; case ALGO_SHA256T: register_sha256t_algo ( gate ); break; + case ALGO_SHA256Q: register_sha256q_algo ( gate ); break; case ALGO_SHAVITE3: register_shavite_algo ( gate ); break; case ALGO_SKEIN: register_skein_algo ( gate ); break; case ALGO_SKEIN2: register_skein2_algo ( gate ); break; diff --git a/algo/bmw/bmw-hash-4way.h b/algo/bmw/bmw-hash-4way.h index 1bd3098..f9a1fa7 100644 --- a/algo/bmw/bmw-hash-4way.h +++ b/algo/bmw/bmw-hash-4way.h @@ -41,7 +41,6 @@ extern "C"{ #endif #include -#ifdef __AVX2__ #include "algo/sha/sph_types.h" #include "avxdefs.h" @@ -50,6 +49,10 @@ extern "C"{ #define SPH_SIZE_bmw512 512 +#if defined(__SSE2__) + +// BMW-256 4 way 32 + typedef struct { __m128i buf[64]; __m128i H[16]; @@ -59,6 +62,60 @@ typedef struct { typedef bmw_4way_small_context bmw256_4way_context; +void bmw256_4way_init(void *cc); + +void bmw256_4way(void *cc, const void *data, size_t len); + +void bmw256_4way_close(void *cc, void *dst); + +void bmw256_4way_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif // __SSE2__ + +#if defined(__AVX2__) + +// BMW-256 8 way 32 + +typedef struct { + __m256i buf[64]; + __m256i H[16]; + size_t ptr; + uint32_t bit_count; // assume bit_count fits in 32 bits +} bmw_8way_small_context __attribute__ ((aligned (64))); + +typedef bmw_8way_small_context bmw256_8way_context; + +void bmw256_8way_init( bmw256_8way_context *ctx ); +void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len ); +void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ); + +#endif + + +#if defined(__SSE2__) + +// BMW-512 2 way 64 + +typedef struct { + __m128i buf[16]; + __m128i H[16]; + size_t ptr; + uint64_t bit_count; +} bmw_2way_big_context __attribute__ ((aligned (64))); + +typedef bmw_2way_big_context bmw512_2way_context; + +void bmw512_2way_init( bmw512_2way_context *ctx ); +void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len ); +void bmw512_2way_close( bmw512_2way_context *ctx, void *dst ); + +#endif // __SSE2__ + +#if defined(__AVX2__) + +// BMW-512 4 way 64 + typedef struct { __m256i buf[16]; __m256i H[16]; @@ -68,14 +125,6 @@ typedef struct { typedef bmw_4way_big_context bmw512_4way_context; -void bmw256_4way_init(void *cc); - -void bmw256_4way(void *cc, const void *data, size_t len); - -void bmw256_4way_close(void *cc, void *dst); - -void bmw256_4way_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); void bmw512_4way_init(void *cc); @@ -86,10 +135,10 @@ void bmw512_4way_close(void *cc, void *dst); void bmw512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); -#endif +#endif // __AVX2__ #ifdef __cplusplus } #endif -#endif +#endif // BMW_HASH_H__ diff --git a/algo/bmw/bmw-hash-4way.c b/algo/bmw/bmw256-hash-4way.c similarity index 66% rename from algo/bmw/bmw-hash-4way.c rename to algo/bmw/bmw256-hash-4way.c index a6881da..be7c5db 100644 --- a/algo/bmw/bmw-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -35,8 +35,6 @@ #include #include "bmw-hash-4way.h" -#if defined(__AVX2__) - #ifdef __cplusplus extern "C"{ #endif @@ -47,33 +45,21 @@ extern "C"{ #define LPAR ( -// BMW256 +#if defined(__SSE2__) -static const sph_u32 IV256[] = { - SPH_C32(0x40414243), 
SPH_C32(0x44454647), - SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F), - SPH_C32(0x50515253), SPH_C32(0x54555657), - SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F), - SPH_C32(0x60616263), SPH_C32(0x64656667), - SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F), - SPH_C32(0x70717273), SPH_C32(0x74757677), - SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F) +// BMW-256 4 way 32 + +static const uint32_t IV256[] = { + 0x40414243, 0x44454647, + 0x48494A4B, 0x4C4D4E4F, + 0x50515253, 0x54555657, + 0x58595A5B, 0x5C5D5E5F, + 0x60616263, 0x64656667, + 0x68696A6B, 0x6C6D6E6F, + 0x70717273, 0x74757677, + 0x78797A7B, 0x7C7D7E7F }; -// BMW512 -static const sph_u64 IV512[] = { - SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), - SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F), - SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF), - SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF), - SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF), - SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF), - SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF), - SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF) -}; - -// BMW256 - #define ss0(x) \ _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \ _mm_slli_epi32( (x), 3) ), \ @@ -173,108 +159,6 @@ static const sph_u64 IV512[] = { ss5( qt[ (i)- 1 ] ) ) ) ) ), \ add_elt_s( M, H, (i)-16 ) ) -// BMW512 - -#define sb0(x) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \ - _mm256_slli_epi64( (x), 3) ), \ - _mm256_xor_si256( mm256_rol_64( (x), 4), \ - mm256_rol_64( (x), 37) ) ) - -#define sb1(x) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \ - _mm256_slli_epi64( (x), 2) ), \ - _mm256_xor_si256( mm256_rol_64( (x), 13), \ - mm256_rol_64( (x), 43) ) ) - -#define sb2(x) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \ - _mm256_slli_epi64( (x), 1) ), \ - _mm256_xor_si256( mm256_rol_64( (x), 19), \ - mm256_rol_64( (x), 53) ) ) - -#define sb3(x) \ - _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \ - _mm256_slli_epi64( (x), 2) ), \ - _mm256_xor_si256( mm256_rol_64( (x), 28), \ - mm256_rol_64( (x), 59) ) ) - -#define sb4(x) \ - _mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) ) - -#define sb5(x) \ - _mm256_xor_si256( (x), _mm256_srli_epi64( (x), 2 ) ) - -#define rb1(x) mm256_rol_64( x, 5 ) -#define rb2(x) mm256_rol_64( x, 11 ) -#define rb3(x) mm256_rol_64( x, 27 ) -#define rb4(x) mm256_rol_64( x, 32 ) -#define rb5(x) mm256_rol_64( x, 37 ) -#define rb6(x) mm256_rol_64( x, 43 ) -#define rb7(x) mm256_rol_64( x, 53 ) - -#define rol_off_64( M, j, off ) \ - mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) - -#define add_elt_b( M, H, j ) \ - _mm256_xor_si256( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \ - rol_off_64( M, j, 3 ) ), \ - rol_off_64( M, j, 10 ) ), \ - _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) - -#define expand1b( qt, M, H, i ) \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( sb1( qt[ (i)-16 ] ), \ - sb2( qt[ (i)-15 ] ) ), \ - _mm256_add_epi64( sb3( qt[ (i)-14 ] ), \ - sb0( qt[ (i)-13 ] ) ) ), \ - _mm256_add_epi64( \ - _mm256_add_epi64( sb1( qt[ (i)-12 ] ), \ - sb2( qt[ (i)-11 ] ) ), \ - _mm256_add_epi64( sb3( qt[ (i)-10 ] ), \ - sb0( qt[ (i)- 9 ] ) ) ) ), \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( sb1( qt[ (i)- 8 ] ), \ - sb2( qt[ (i)- 7 ] ) ), \ - _mm256_add_epi64( sb3( 
qt[ (i)- 6 ] ), \ - sb0( qt[ (i)- 5 ] ) ) ), \ - _mm256_add_epi64( \ - _mm256_add_epi64( sb1( qt[ (i)- 4 ] ), \ - sb2( qt[ (i)- 3 ] ) ), \ - _mm256_add_epi64( sb3( qt[ (i)- 2 ] ), \ - sb0( qt[ (i)- 1 ] ) ) ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) - -#define expand2b( qt, M, H, i) \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ) ), \ - _mm256_add_epi64( qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ) ), \ - _mm256_add_epi64( \ - _mm256_add_epi64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ) ), \ - _mm256_add_epi64( qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ) ) ), \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ) ), \ - _mm256_add_epi64( qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ) ), \ - _mm256_add_epi64( \ - _mm256_add_epi64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ) ), \ - _mm256_add_epi64( sb4( qt[ (i)- 2 ] ), \ - sb5( qt[ (i)- 1 ] ) ) ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) - -// BMW256 - #define Ws0 \ _mm_add_epi32( \ _mm_add_epi32( \ @@ -567,301 +451,6 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] ) _mm_xor_si128( qt[22], qt[15] ) ) ); } -// BMW512 - -#define Wb0 \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) - -#define Wb1 \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define Wb2 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define Wb3 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) - -#define Wb4 \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) - -#define Wb5 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define Wb6 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) - -#define Wb7 \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ), \ - _mm256_xor_si256( M[14], 
H[14] ) ) - -#define Wb8 \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define Wb9 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[14], H[14] ) ) - -#define Wb10 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[15], H[15] ) ) - -#define Wb11 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ) - -#define Wb12 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ) - -#define Wb13 \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ) - -#define Wb14 \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_xor_si256( M[12], H[12] ) ) - -#define Wb15 \ - _mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[ 4], H[4] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_xor_si256( M[13], H[13] ) ) - -void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) -{ - __m256i qt[32], xl, xh; - - qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] ); - qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] ); - qt[ 2] = _mm256_add_epi64( sb2( Wb2 ), H[ 3] ); - qt[ 3] = _mm256_add_epi64( sb3( Wb3 ), H[ 4] ); - qt[ 4] = _mm256_add_epi64( sb4( Wb4 ), H[ 5] ); - qt[ 5] = _mm256_add_epi64( sb0( Wb5 ), H[ 6] ); - qt[ 6] = _mm256_add_epi64( sb1( Wb6 ), H[ 7] ); - qt[ 7] = _mm256_add_epi64( sb2( Wb7 ), H[ 8] ); - qt[ 8] = _mm256_add_epi64( sb3( Wb8 ), H[ 9] ); - qt[ 9] = _mm256_add_epi64( sb4( Wb9 ), H[10] ); - qt[10] = _mm256_add_epi64( sb0( Wb10), H[11] ); - qt[11] = _mm256_add_epi64( sb1( Wb11), H[12] ); - qt[12] = _mm256_add_epi64( sb2( Wb12), H[13] ); - qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] ); - qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] ); - qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] ); - qt[16] = expand1b( qt, M, H, 16 ); - qt[17] = expand1b( qt, M, H, 17 ); - qt[18] = expand2b( qt, M, H, 18 ); - qt[19] = expand2b( qt, M, H, 19 ); - qt[20] = expand2b( qt, M, H, 20 ); - qt[21] = expand2b( qt, M, H, 21 ); - qt[22] = expand2b( qt, M, H, 22 ); - 
qt[23] = expand2b( qt, M, H, 23 ); - qt[24] = expand2b( qt, M, H, 24 ); - qt[25] = expand2b( qt, M, H, 25 ); - qt[26] = expand2b( qt, M, H, 26 ); - qt[27] = expand2b( qt, M, H, 27 ); - qt[28] = expand2b( qt, M, H, 28 ); - qt[29] = expand2b( qt, M, H, 29 ); - qt[30] = expand2b( qt, M, H, 30 ); - qt[31] = expand2b( qt, M, H, 31 ); - - xl = _mm256_xor_si256( - _mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ), - _mm256_xor_si256( qt[18], qt[19] ) ), - _mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ), - _mm256_xor_si256( qt[22], qt[23] ) ) ); - xh = _mm256_xor_si256( xl, - _mm256_xor_si256( - _mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ), - _mm256_xor_si256( qt[26], qt[27] ) ), - _mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ), - _mm256_xor_si256( qt[30], qt[31] ) ))); - - dH[ 0] = _mm256_add_epi64( - _mm256_xor_si256( M[0], - _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ), - _mm256_srli_epi64( qt[16], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] )); - dH[ 1] = _mm256_add_epi64( - _mm256_xor_si256( M[1], - _mm256_xor_si256( _mm256_srli_epi64( xh, 7 ), - _mm256_slli_epi64( qt[17], 8 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] )); - dH[ 2] = _mm256_add_epi64( - _mm256_xor_si256( M[2], - _mm256_xor_si256( _mm256_srli_epi64( xh, 5 ), - _mm256_slli_epi64( qt[18], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] )); - dH[ 3] = _mm256_add_epi64( - _mm256_xor_si256( M[3], - _mm256_xor_si256( _mm256_srli_epi64( xh, 1 ), - _mm256_slli_epi64( qt[19], 5 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] )); - dH[ 4] = _mm256_add_epi64( - _mm256_xor_si256( M[4], - _mm256_xor_si256( _mm256_srli_epi64( xh, 3 ), - _mm256_slli_epi64( qt[20], 0 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] )); - dH[ 5] = _mm256_add_epi64( - _mm256_xor_si256( M[5], - _mm256_xor_si256( _mm256_slli_epi64( xh, 6 ), - _mm256_srli_epi64( qt[21], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] )); - dH[ 6] = _mm256_add_epi64( - _mm256_xor_si256( M[6], - _mm256_xor_si256( _mm256_srli_epi64( xh, 4 ), - _mm256_slli_epi64( qt[22], 6 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] )); - dH[ 7] = _mm256_add_epi64( - _mm256_xor_si256( M[7], - _mm256_xor_si256( _mm256_srli_epi64( xh, 11 ), - _mm256_slli_epi64( qt[23], 2 ) ) ), - _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] )); - dH[ 8] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[4], 9 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )), - _mm256_xor_si256( _mm256_slli_epi64( xl, 8 ), - _mm256_xor_si256( qt[23], qt[ 8] ) ) ); - dH[ 9] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[5], 10 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )), - _mm256_xor_si256( _mm256_srli_epi64( xl, 6 ), - _mm256_xor_si256( qt[16], qt[ 9] ) ) ); - dH[10] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[6], 11 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )), - _mm256_xor_si256( _mm256_slli_epi64( xl, 6 ), - _mm256_xor_si256( qt[17], qt[10] ) ) ); - dH[11] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[7], 12 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )), - _mm256_xor_si256( _mm256_slli_epi64( xl, 4 ), - _mm256_xor_si256( qt[18], qt[11] ) ) ); - dH[12] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[0], 13 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )), - _mm256_xor_si256( _mm256_srli_epi64( xl, 3 ), - _mm256_xor_si256( qt[19], 
qt[12] ) ) ); - dH[13] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[1], 14 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )), - _mm256_xor_si256( _mm256_srli_epi64( xl, 4 ), - _mm256_xor_si256( qt[20], qt[13] ) ) ); - dH[14] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[2], 15 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )), - _mm256_xor_si256( _mm256_srli_epi64( xl, 7 ), - _mm256_xor_si256( qt[21], qt[14] ) ) ); - dH[15] = _mm256_add_epi64( _mm256_add_epi64( - mm256_rol_64( dH[3], 16 ), - _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )), - _mm256_xor_si256( _mm256_srli_epi64( xl, 2 ), - _mm256_xor_si256( qt[22], qt[15] ) ) ); -} - -// BMW256 - static const uint32_t final_s[16][4] = { { 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0 }, @@ -988,129 +577,6 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, casti_m128i( dst, u ) = h1[v]; } -// BMW512 - -static const __m256i final_b[16] = -{ - { 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0, - 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 }, - { 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1, - 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 }, - { 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2, - 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 }, - { 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3, - 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 }, - { 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4, - 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 }, - { 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5, - 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 }, - { 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6, - 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 }, - { 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7, - 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 }, - { 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8, - 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 }, - { 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9, - 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 }, - { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, - 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa }, - { 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab, - 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab }, - { 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac, - 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac }, - { 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad, - 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad }, - { 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae, - 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae }, - { 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf, - 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf } -}; - -static void -bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv ) -{ - for ( int i = 0; i < 16; i++ ) - sc->H[i] = _mm256_set1_epi64x( iv[i] ); - sc->ptr = 0; - sc->bit_count = 0; -} - -static void -bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len ) -{ - __m256i *vdata = (__m256i*)data; - __m256i *buf; - __m256i htmp[16]; - __m256i *h1, *h2; - size_t ptr; - const int buf_size = 128; // bytes of one lane, compatible with len - - sc->bit_count += (sph_u64)len << 3; - buf = sc->buf; - ptr = sc->ptr; - h1 = sc->H; - h2 = htmp; - while ( len > 0 ) - { - size_t clen; - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( buf + (ptr>>3), vdata, clen >> 3 ); - vdata = vdata + (clen>>3); - len -= clen; - ptr += clen; - if ( ptr == buf_size ) - { - __m256i *ht; - compress_big( buf, h1, h2 ); - ht = h1; - h1 = h2; - h2 = ht; - ptr = 0; - } - } - sc->ptr = ptr; - if ( h1 != sc->H ) - memcpy_256( sc->H, h1, 16 ); -} - -static void -bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n, - void *dst, size_t out_size_w64) -{ - __m256i *buf; - __m256i h1[16], h2[16], *h; - size_t ptr, u, v; - 
unsigned z; - const int buf_size = 128; // bytes of one lane, compatible with len - - buf = sc->buf; - ptr = sc->ptr; - z = 0x80 >> n; - buf[ ptr>>3 ] = _mm256_set1_epi64x( z ); - ptr += 8; - h = sc->H; - - if ( ptr > (buf_size - 8) ) - { - memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); - compress_big( buf, h, h1 ); - ptr = 0; - h = h1; - } - memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 ); - buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( sc->bit_count + n ); - compress_big( buf, h, h2 ); - for ( u = 0; u < 16; u ++ ) - buf[u] = h2[u]; - compress_big( buf, final_b, h1 ); - for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) - casti_m256i(dst,u) = h1[v]; -} - -// BMW256 - void bmw256_4way_init(void *cc) { @@ -1135,34 +601,543 @@ bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) bmw32_4way_close(cc, ub, n, dst, 8); } -// BMW512 +#endif // __SSE2__ -void -bmw512_4way_init(void *cc) +#if defined(__AVX2__) + +// BMW-256 8 way 32 + +// copied from bmw512 4 way. +// change sizes to 32, macro names from b to s, shift constants. +// all the XORs ae good. + + +#define s8s0(x) \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi32( (x), 1), \ + _mm256_slli_epi32( (x), 3) ), \ + _mm256_xor_si256( mm256_rol_32( (x), 4), \ + mm256_rol_32( (x), 19) ) ) + +#define s8s1(x) \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi32( (x), 1), \ + _mm256_slli_epi32( (x), 2) ), \ + _mm256_xor_si256( mm256_rol_32( (x), 8), \ + mm256_rol_32( (x), 23) ) ) + +#define s8s2(x) \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi32( (x), 2), \ + _mm256_slli_epi32( (x), 1) ), \ + _mm256_xor_si256( mm256_rol_32( (x), 12), \ + mm256_rol_32( (x), 25) ) ) + +#define s8s3(x) \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi32( (x), 2), \ + _mm256_slli_epi32( (x), 2) ), \ + _mm256_xor_si256( mm256_rol_32( (x), 15), \ + mm256_rol_32( (x), 29) ) ) + +#define s8s4(x) \ + _mm256_xor_si256( (x), _mm256_srli_epi32( (x), 1 ) ) + +#define s8s5(x) \ + _mm256_xor_si256( (x), _mm256_srli_epi32( (x), 2 ) ) + +#define r8s1(x) mm256_rol_32( x, 3 ) +#define r8s2(x) mm256_rol_32( x, 7 ) +#define r8s3(x) mm256_rol_32( x, 13 ) +#define r8s4(x) mm256_rol_32( x, 16 ) +#define r8s5(x) mm256_rol_32( x, 19 ) +#define r8s6(x) mm256_rol_32( x, 23 ) +#define r8s7(x) mm256_rol_32( x, 27 ) + +#define mm256_rol_off_32( M, j, off ) \ + mm256_rol_32( M[ ( (j) + (off) ) & 0xF ] , \ + ( ( (j) + (off) ) & 0xF ) + 1 ) + +#define add_elt_s8( M, H, j ) \ + _mm256_xor_si256( \ + _mm256_add_epi32( \ + _mm256_sub_epi32( _mm256_add_epi32( mm256_rol_off_32( M, j, 0 ), \ + mm256_rol_off_32( M, j, 3 ) ), \ + mm256_rol_off_32( M, j, 10 ) ), \ + _mm256_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \ + H[ ( (j)+7 ) & 0xF ] ) + +#define expand1s8( qt, M, H, i ) \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( s8s1( qt[ (i)-16 ] ), \ + s8s2( qt[ (i)-15 ] ) ), \ + _mm256_add_epi32( s8s3( qt[ (i)-14 ] ), \ + s8s0( qt[ (i)-13 ] ) ) ), \ + _mm256_add_epi32( \ + _mm256_add_epi32( s8s1( qt[ (i)-12 ] ), \ + s8s2( qt[ (i)-11 ] ) ), \ + _mm256_add_epi32( s8s3( qt[ (i)-10 ] ), \ + s8s0( qt[ (i)- 9 ] ) ) ) ), \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( s8s1( qt[ (i)- 8 ] ), \ + s8s2( qt[ (i)- 7 ] ) ), \ + _mm256_add_epi32( s8s3( qt[ (i)- 6 ] ), \ + s8s0( qt[ (i)- 5 ] ) ) ), \ + _mm256_add_epi32( \ + _mm256_add_epi32( s8s1( qt[ (i)- 4 ] ), \ + s8s2( qt[ (i)- 3 ] ) ), \ + _mm256_add_epi32( s8s3( qt[ (i)- 2 ] ), \ + s8s0( qt[ (i)- 
1 ] ) ) ) ) ), \ + add_elt_s8( M, H, (i)-16 ) ) + +#define expand2s8( qt, M, H, i) \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( qt[ (i)-16 ], r8s1( qt[ (i)-15 ] ) ), \ + _mm256_add_epi32( qt[ (i)-14 ], r8s2( qt[ (i)-13 ] ) ) ), \ + _mm256_add_epi32( \ + _mm256_add_epi32( qt[ (i)-12 ], r8s3( qt[ (i)-11 ] ) ), \ + _mm256_add_epi32( qt[ (i)-10 ], r8s4( qt[ (i)- 9 ] ) ) ) ), \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( qt[ (i)- 8 ], r8s5( qt[ (i)- 7 ] ) ), \ + _mm256_add_epi32( qt[ (i)- 6 ], r8s6( qt[ (i)- 5 ] ) ) ), \ + _mm256_add_epi32( \ + _mm256_add_epi32( qt[ (i)- 4 ], r8s7( qt[ (i)- 3 ] ) ), \ + _mm256_add_epi32( s8s4( qt[ (i)- 2 ] ), \ + s8s5( qt[ (i)- 1 ] ) ) ) ) ), \ + add_elt_s8( M, H, (i)-16 ) ) + + +#define W8s0 \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_xor_si256( M[13], H[13] ) ), \ + _mm256_xor_si256( M[14], H[14] ) ) + +#define W8s1 \ + _mm256_sub_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 6], H[ 6] ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ), \ + _mm256_xor_si256( M[14], H[14] ) ), \ + _mm256_xor_si256( M[15], H[15] ) ) + +#define W8s2 \ + _mm256_add_epi32( \ + _mm256_sub_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_xor_si256( M[12], H[12] ) ), \ + _mm256_xor_si256( M[15], H[15] ) ) + +#define W8s3 \ + _mm256_add_epi32( \ + _mm256_sub_epi32( \ + _mm256_add_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 1], H[ 1] ) ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_xor_si256( M[13], H[13] ) ) + +#define W8s4 \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ), \ + _mm256_xor_si256( M[14], H[14] ) ) + +#define W8s5 \ + _mm256_add_epi32( \ + _mm256_sub_epi32( \ + _mm256_add_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_xor_si256( M[12], H[12] ) ), \ + _mm256_xor_si256( M[15], H[15] ) ) + +#define W8s6 \ + _mm256_add_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 4], H[ 4] ), \ + _mm256_xor_si256( M[ 0], H[ 0] ) ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ), \ + _mm256_xor_si256( M[13], H[13] ) ) + +#define W8s7 \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[12], H[12] ) ), \ + _mm256_xor_si256( M[14], H[14] ) ) + +#define W8s8 \ + _mm256_sub_epi32( \ + _mm256_add_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_xor_si256( M[13], H[13] ) ), \ + _mm256_xor_si256( M[15], H[15] ) ) + +#define W8s9 \ + _mm256_add_epi32( \ + _mm256_sub_epi32( \ + _mm256_add_epi32( 
\ + _mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[14], H[14] ) ) + +#define W8s10 \ + _mm256_add_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \ + _mm256_xor_si256( M[ 1], H[ 1] ) ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[15], H[15] ) ) + +#define W8s11 \ + _mm256_add_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \ + _mm256_xor_si256( M[ 0], H[ 0] ) ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ) + +#define W8s12 \ + _mm256_add_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ) + +#define W8s13 \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( \ + _mm256_add_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ) + +#define W8s14 \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_add_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ), \ + _mm256_xor_si256( M[12], H[12] ) ) + +#define W8s15 \ + _mm256_add_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( \ + _mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \ + _mm256_xor_si256( M[ 4], H[4] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_xor_si256( M[13], H[13] ) ) + +void compress_small_8way( const __m256i *M, const __m256i H[16], + __m256i dH[16] ) { - bmw64_4way_init(cc, IV512); + __m256i qt[32], xl, xh; + + qt[ 0] = _mm256_add_epi32( s8s0( W8s0 ), H[ 1] ); + qt[ 1] = _mm256_add_epi32( s8s1( W8s1 ), H[ 2] ); + qt[ 2] = _mm256_add_epi32( s8s2( W8s2 ), H[ 3] ); + qt[ 3] = _mm256_add_epi32( s8s3( W8s3 ), H[ 4] ); + qt[ 4] = _mm256_add_epi32( s8s4( W8s4 ), H[ 5] ); + qt[ 5] = _mm256_add_epi32( s8s0( W8s5 ), H[ 6] ); + qt[ 6] = _mm256_add_epi32( s8s1( W8s6 ), H[ 7] ); + qt[ 7] = _mm256_add_epi32( s8s2( W8s7 ), H[ 8] ); + qt[ 8] = _mm256_add_epi32( s8s3( W8s8 ), H[ 9] ); + qt[ 9] = _mm256_add_epi32( s8s4( W8s9 ), H[10] ); + qt[10] = _mm256_add_epi32( s8s0( W8s10), H[11] ); + qt[11] = _mm256_add_epi32( s8s1( W8s11), H[12] ); + qt[12] = _mm256_add_epi32( s8s2( W8s12), H[13] ); + qt[13] = _mm256_add_epi32( s8s3( W8s13), H[14] ); + qt[14] = _mm256_add_epi32( s8s4( W8s14), H[15] ); + qt[15] = _mm256_add_epi32( s8s0( W8s15), H[ 0] ); + qt[16] = expand1s8( qt, M, H, 16 ); + qt[17] = expand1s8( qt, M, H, 17 ); + qt[18] = expand2s8( qt, M, H, 18 ); + qt[19] = expand2s8( qt, M, H, 19 ); + qt[20] = expand2s8( qt, M, H, 20 ); + qt[21] = expand2s8( qt, M, H, 21 ); + qt[22] = expand2s8( qt, M, H, 22 ); + qt[23] = expand2s8( qt, M, H, 23 ); + qt[24] = expand2s8( qt, M, H, 24 ); + qt[25] = expand2s8( qt, M, H, 25 ); + qt[26] = expand2s8( qt, M, H, 26 ); + qt[27] = expand2s8( qt, M, H, 27 ); + qt[28] = expand2s8( qt, M, H, 28 ); + qt[29] = expand2s8( qt, M, H, 29 ); + qt[30] = expand2s8( qt, M, H, 30 ); + 
qt[31] = expand2s8( qt, M, H, 31 ); + + xl = _mm256_xor_si256( + _mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ), + _mm256_xor_si256( qt[18], qt[19] ) ), + _mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ), + _mm256_xor_si256( qt[22], qt[23] ) ) ); + xh = _mm256_xor_si256( xl, + _mm256_xor_si256( + _mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ), + _mm256_xor_si256( qt[26], qt[27] ) ), + _mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ), + _mm256_xor_si256( qt[30], qt[31] ) ))); + + dH[ 0] = _mm256_add_epi32( + _mm256_xor_si256( M[0], + _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ), + _mm256_srli_epi32( qt[16], 5 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] )); + dH[ 1] = _mm256_add_epi32( + _mm256_xor_si256( M[1], + _mm256_xor_si256( _mm256_srli_epi32( xh, 7 ), + _mm256_slli_epi32( qt[17], 8 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] )); + dH[ 2] = _mm256_add_epi32( + _mm256_xor_si256( M[2], + _mm256_xor_si256( _mm256_srli_epi32( xh, 5 ), + _mm256_slli_epi32( qt[18], 5 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] )); + dH[ 3] = _mm256_add_epi32( + _mm256_xor_si256( M[3], + _mm256_xor_si256( _mm256_srli_epi32( xh, 1 ), + _mm256_slli_epi32( qt[19], 5 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] )); + dH[ 4] = _mm256_add_epi32( + _mm256_xor_si256( M[4], + _mm256_xor_si256( _mm256_srli_epi32( xh, 3 ), + _mm256_slli_epi32( qt[20], 0 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] )); + dH[ 5] = _mm256_add_epi32( + _mm256_xor_si256( M[5], + _mm256_xor_si256( _mm256_slli_epi32( xh, 6 ), + _mm256_srli_epi32( qt[21], 6 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] )); + dH[ 6] = _mm256_add_epi32( + _mm256_xor_si256( M[6], + _mm256_xor_si256( _mm256_srli_epi32( xh, 4 ), + _mm256_slli_epi32( qt[22], 6 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] )); + dH[ 7] = _mm256_add_epi32( + _mm256_xor_si256( M[7], + _mm256_xor_si256( _mm256_srli_epi32( xh, 11 ), + _mm256_slli_epi32( qt[23], 2 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] )); + dH[ 8] = _mm256_add_epi32( _mm256_add_epi32( + mm256_rol_32( dH[4], 9 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )), + _mm256_xor_si256( _mm256_slli_epi32( xl, 8 ), + _mm256_xor_si256( qt[23], qt[ 8] ) ) ); + dH[ 9] = _mm256_add_epi32( _mm256_add_epi32( + mm256_rol_32( dH[5], 10 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )), + _mm256_xor_si256( _mm256_srli_epi32( xl, 6 ), + _mm256_xor_si256( qt[16], qt[ 9] ) ) ); + dH[10] = _mm256_add_epi32( _mm256_add_epi32( + mm256_rol_32( dH[6], 11 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )), + _mm256_xor_si256( _mm256_slli_epi32( xl, 6 ), + _mm256_xor_si256( qt[17], qt[10] ) ) ); + dH[11] = _mm256_add_epi32( _mm256_add_epi32( + mm256_rol_32( dH[7], 12 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )), + _mm256_xor_si256( _mm256_slli_epi32( xl, 4 ), + _mm256_xor_si256( qt[18], qt[11] ) ) ); + dH[12] = _mm256_add_epi32( _mm256_add_epi32( + mm256_rol_32( dH[0], 13 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )), + _mm256_xor_si256( _mm256_srli_epi32( xl, 3 ), + _mm256_xor_si256( qt[19], qt[12] ) ) ); + dH[13] = _mm256_add_epi32( _mm256_add_epi32( + mm256_rol_32( dH[1], 14 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )), + _mm256_xor_si256( _mm256_srli_epi32( xl, 4 ), + _mm256_xor_si256( qt[20], qt[13] ) ) ); + dH[14] = _mm256_add_epi32( _mm256_add_epi32( + 
mm256_rol_32( dH[2], 15 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )), + _mm256_xor_si256( _mm256_srli_epi32( xl, 7 ), + _mm256_xor_si256( qt[21], qt[14] ) ) ); + dH[15] = _mm256_add_epi32( _mm256_add_epi32( + mm256_rol_32( dH[3], 16 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )), + _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ), + _mm256_xor_si256( qt[22], qt[15] ) ) ); } -void -bmw512_4way(void *cc, const void *data, size_t len) +static const __m256i final_s8[16] = { - bmw64_4way(cc, data, len); + { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0, + 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 }, + { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1, + 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 }, + { 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2, + 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 }, + { 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3, + 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 }, + { 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4, + 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 }, + { 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5, + 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 }, + { 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6, + 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 }, + { 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7, + 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 }, + { 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8, + 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 }, + { 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9, + 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 }, + { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, + 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa }, + { 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab, + 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab }, + { 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac, + 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac }, + { 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad, + 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad }, + { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae, + 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae }, + { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf, + 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf } +}; + +void bmw256_8way_init( bmw256_8way_context *ctx ) +{ + ctx->H[ 0] = _mm256_set1_epi64x( IV256[ 0] ); + ctx->H[ 1] = _mm256_set1_epi64x( IV256[ 1] ); + ctx->H[ 2] = _mm256_set1_epi64x( IV256[ 2] ); + ctx->H[ 3] = _mm256_set1_epi64x( IV256[ 3] ); + ctx->H[ 4] = _mm256_set1_epi64x( IV256[ 4] ); + ctx->H[ 5] = _mm256_set1_epi64x( IV256[ 5] ); + ctx->H[ 6] = _mm256_set1_epi64x( IV256[ 6] ); + ctx->H[ 7] = _mm256_set1_epi64x( IV256[ 7] ); + ctx->H[ 8] = _mm256_set1_epi64x( IV256[ 8] ); + ctx->H[ 9] = _mm256_set1_epi64x( IV256[ 9] ); + ctx->H[10] = _mm256_set1_epi64x( IV256[10] ); + ctx->H[11] = _mm256_set1_epi64x( IV256[11] ); + ctx->H[12] = _mm256_set1_epi64x( IV256[12] ); + ctx->H[13] = _mm256_set1_epi64x( IV256[13] ); + ctx->H[14] = _mm256_set1_epi64x( IV256[14] ); + ctx->H[15] = _mm256_set1_epi64x( IV256[15] ); + ctx->ptr = 0; + ctx->bit_count = 0; + } -void -bmw512_4way_close(void *cc, void *dst) +void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len ) { - bmw512_4way_addbits_and_close(cc, 0, 0, dst); + __m256i *vdata = (__m256i*)data; + __m256i *buf; + __m256i htmp[16]; + __m256i *h1, *h2; + size_t ptr; + const int buf_size = 64; // bytes of one lane, compatible with len + + ctx->bit_count += len << 3; + buf = ctx->buf; + ptr = ctx->ptr; + h1 = ctx->H; + h2 = htmp; + while ( len > 0 ) + { + size_t clen; + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_256( buf + (ptr>>3), vdata, clen >> 3 ); + vdata = vdata + (clen>>3); + len -= clen; + ptr += clen; + if ( ptr == buf_size ) + { + __m256i *ht; + compress_small_8way( buf, h1, h2 ); + ht = h1; 
+ h1 = h2; + h2 = ht; + ptr = 0; + } + } + ctx->ptr = ptr; + if ( h1 != ctx->H ) + memcpy_256( ctx->H, h1, 16 ); } -void -bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ) { - bmw64_4way_close(cc, ub, n, dst, 8); + __m256i *buf; + __m256i h1[16], h2[16], *h; + size_t ptr, u, v; +// unsigned z; + const int buf_size = 64; // bytes of one lane, compatible with len + + buf = ctx->buf; + ptr = ctx->ptr; + buf[ ptr>>3 ] = _mm256_set1_epi32( 0x80 ); + ptr += 8; + h = ctx->H; + + if ( ptr > (buf_size - 8) ) + { + memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); + compress_small_8way( buf, h, h1 ); + ptr = 0; + h = h1; + } + memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 ); + buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( ctx->bit_count ); + compress_small_8way( buf, h, h2 ); + + for ( u = 0; u < 16; u ++ ) + buf[u] = h2[u]; + + compress_small_8way( buf, final_s8, h1 ); + for (u = 0, v = 16 - 8; u < 8; u ++, v ++) + casti_m256i(dst,u) = h1[v]; } + +#endif // __AVX2__ + #ifdef __cplusplus } #endif -#endif // __AVX2__ diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c new file mode 100644 index 0000000..c272e4a --- /dev/null +++ b/algo/bmw/bmw512-hash-4way.c @@ -0,0 +1,1109 @@ +/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * BMW implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include +#include +#include "bmw-hash-4way.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#define LPAR ( + +static const sph_u64 IV512[] = { + SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), + SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F), + SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF), + SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF), + SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF), + SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF), + SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF), + SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF) +}; + +#if defined(__SSE2__) + +// BMW-512 2 way 64 + + +#define s2b0(x) \ + _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \ + _mm_slli_epi64( (x), 3) ), \ + _mm_xor_si128( mm128_rol_64( (x), 4), \ + mm128_rol_64( (x), 37) ) ) + +#define s2b1(x) \ + _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \ + _mm_slli_epi64( (x), 2) ), \ + _mm_xor_si128( mm128_rol_64( (x), 13), \ + mm128_rol_64( (x), 43) ) ) + +#define s2b2(x) \ + _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 2), \ + _mm_slli_epi64( (x), 1) ), \ + _mm_xor_si128( mm128_rol_64( (x), 19), \ + mm128_rol_64( (x), 53) ) ) + +#define s2b3(x) \ + _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 2), \ + _mm_slli_epi64( (x), 2) ), \ + _mm_xor_si128( mm128_rol_64( (x), 28), \ + mm128_rol_64( (x), 59) ) ) + +#define s2b4(x) \ + _mm_xor_si128( (x), _mm_srli_epi64( (x), 1 ) ) + +#define s2b5(x) \ + _mm_xor_si128( (x), _mm_srli_epi64( (x), 2 ) ) + + +#define r2b1(x) mm128_rol_64( x, 5 ) +#define r2b2(x) mm128_rol_64( x, 11 ) +#define r2b3(x) mm128_rol_64( x, 27 ) +#define r2b4(x) mm128_rol_64( x, 32 ) +#define r2b5(x) mm128_rol_64( x, 37 ) +#define r2b6(x) mm128_rol_64( x, 43 ) +#define r2b7(x) mm128_rol_64( x, 53 ) + +#define mm128_rol_off_64( M, j, off ) \ + mm128_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ + ( ( (j) + (off) ) & 0xF ) + 1 ) + +#define add_elt_2b( M, H, j ) \ + _mm_xor_si128( \ + _mm_add_epi64( \ + _mm_sub_epi64( _mm_add_epi64( mm128_rol_off_64( M, j, 0 ), \ + mm128_rol_off_64( M, j, 3 ) ), \ + mm128_rol_off_64( M, j, 10 ) ), \ + _mm_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ + H[ ( (j)+7 ) & 0xF ] ) + + +#define expand1_2b( qt, M, H, i ) \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( s2b1( qt[ (i)-16 ] ), \ + s2b2( qt[ (i)-15 ] ) ), \ + _mm_add_epi64( s2b3( qt[ (i)-14 ] ), \ + s2b0( qt[ (i)-13 ] ) ) ), \ + _mm_add_epi64( \ + _mm_add_epi64( s2b1( qt[ (i)-12 ] ), \ + s2b2( qt[ (i)-11 ] ) ), \ + _mm_add_epi64( s2b3( qt[ (i)-10 ] ), \ + s2b0( qt[ (i)- 9 ] ) ) ) ), \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( s2b1( qt[ (i)- 8 ] ), \ + s2b2( qt[ (i)- 7 ] ) ), \ + _mm_add_epi64( s2b3( qt[ (i)- 6 ] ), \ + s2b0( qt[ (i)- 5 ] ) ) ), \ + _mm_add_epi64( \ + _mm_add_epi64( s2b1( qt[ (i)- 4 ] ), \ + s2b2( qt[ (i)- 3 ] ) ), \ + _mm_add_epi64( s2b3( qt[ (i)- 2 ] ), \ + s2b0( qt[ (i)- 1 ] ) ) ) ) ), \ + add_elt_2b( M, H, (i)-16 ) ) + +#define expand2_2b( qt, M, H, i) \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( qt[ (i)-16 ], r2b1( qt[ (i)-15 ] ) ), \ + _mm_add_epi64( qt[ (i)-14 ], r2b2( qt[ (i)-13 ] ) ) ), \ + _mm_add_epi64( \ + _mm_add_epi64( qt[ (i)-12 ], r2b3( qt[ (i)-11 ] ) ), \ + _mm_add_epi64( qt[ (i)-10 
], r2b4( qt[ (i)- 9 ] ) ) ) ), \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( qt[ (i)- 8 ], r2b5( qt[ (i)- 7 ] ) ), \ + _mm_add_epi64( qt[ (i)- 6 ], r2b6( qt[ (i)- 5 ] ) ) ), \ + _mm_add_epi64( \ + _mm_add_epi64( qt[ (i)- 4 ], r2b7( qt[ (i)- 3 ] ) ), \ + _mm_add_epi64( s2b4( qt[ (i)- 2 ] ), \ + s2b5( qt[ (i)- 1 ] ) ) ) ) ), \ + add_elt_2b( M, H, (i)-16 ) ) + + +#define W2b0 \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[ 5], H[ 5] ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_xor_si128( M[10], H[10] ) ), \ + _mm_xor_si128( M[13], H[13] ) ), \ + _mm_xor_si128( M[14], H[14] ) ) + +#define W2b1 \ + _mm_sub_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[ 6], H[ 6] ), \ + _mm_xor_si128( M[ 8], H[ 8] ) ), \ + _mm_xor_si128( M[11], H[11] ) ), \ + _mm_xor_si128( M[14], H[14] ) ), \ + _mm_xor_si128( M[15], H[15] ) ) + +#define W2b2 \ + _mm_add_epi64( \ + _mm_sub_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ), \ + _mm_xor_si128( M[12], H[12] ) ), \ + _mm_xor_si128( M[15], H[15] ) ) + +#define W2b3 \ + _mm_add_epi64( \ + _mm_sub_epi64( \ + _mm_add_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \ + _mm_xor_si128( M[ 1], H[ 1] ) ), \ + _mm_xor_si128( M[ 8], H[ 8] ) ), \ + _mm_xor_si128( M[10], H[10] ) ), \ + _mm_xor_si128( M[13], H[13] ) ) + +#define W2b4 \ + _mm_sub_epi64( \ + _mm_sub_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \ + _mm_xor_si128( M[ 2], H[ 2] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ), \ + _mm_xor_si128( M[11], H[11] ) ), \ + _mm_xor_si128( M[14], H[14] ) ) + +#define W2b5 \ + _mm_add_epi64( \ + _mm_sub_epi64( \ + _mm_add_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[ 3], H[ 3] ), \ + _mm_xor_si128( M[ 2], H[ 2] ) ), \ + _mm_xor_si128( M[10], H[10] ) ), \ + _mm_xor_si128( M[12], H[12] ) ), \ + _mm_xor_si128( M[15], H[15] ) ) + +#define W2b6 \ + _mm_add_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[ 4], H[ 4] ), \ + _mm_xor_si128( M[ 0], H[ 0] ) ), \ + _mm_xor_si128( M[ 3], H[ 3] ) ), \ + _mm_xor_si128( M[11], H[11] ) ), \ + _mm_xor_si128( M[13], H[13] ) ) + +#define W2b7 \ + _mm_sub_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \ + _mm_xor_si128( M[ 4], H[ 4] ) ), \ + _mm_xor_si128( M[ 5], H[ 5] ) ), \ + _mm_xor_si128( M[12], H[12] ) ), \ + _mm_xor_si128( M[14], H[14] ) ) + +#define W2b8 \ + _mm_sub_epi64( \ + _mm_add_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[ 2], H[ 2] ), \ + _mm_xor_si128( M[ 5], H[ 5] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_xor_si128( M[13], H[13] ) ), \ + _mm_xor_si128( M[15], H[15] ) ) + +#define W2b9 \ + _mm_add_epi64( \ + _mm_sub_epi64( \ + _mm_add_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \ + _mm_xor_si128( M[ 3], H[ 3] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_xor_si128( M[14], H[14] ) ) + +#define W2b10 \ + _mm_add_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[ 8], H[ 8] ), \ + _mm_xor_si128( M[ 1], H[ 1] ) ), \ + _mm_xor_si128( M[ 4], H[ 4] ) ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_xor_si128( M[15], H[15] ) ) + +#define W2b11 \ + _mm_add_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[ 8], H[ 8] ), \ + _mm_xor_si128( M[ 0], H[ 0] ) ), \ + _mm_xor_si128( M[ 2], H[ 2] ) ), \ + 
_mm_xor_si128( M[ 5], H[ 5] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ) + +#define W2b12 \ + _mm_add_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( \ + _mm_add_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \ + _mm_xor_si128( M[ 3], H[ 3] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ), \ + _mm_xor_si128( M[10], H[10] ) ) + +#define W2b13 \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( \ + _mm_add_epi64( _mm_xor_si128( M[ 2], H[ 2] ), \ + _mm_xor_si128( M[ 4], H[ 4] ) ), \ + _mm_xor_si128( M[ 7], H[ 7] ) ), \ + _mm_xor_si128( M[10], H[10] ) ), \ + _mm_xor_si128( M[11], H[11] ) ) + +#define W2b14 \ + _mm_sub_epi64( \ + _mm_sub_epi64( \ + _mm_add_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[ 3], H[ 3] ), \ + _mm_xor_si128( M[ 5], H[ 5] ) ), \ + _mm_xor_si128( M[ 8], H[ 8] ) ), \ + _mm_xor_si128( M[11], H[11] ) ), \ + _mm_xor_si128( M[12], H[12] ) ) + +#define W2b15 \ + _mm_add_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( \ + _mm_sub_epi64( _mm_xor_si128( M[12], H[12] ), \ + _mm_xor_si128( M[ 4], H[4] ) ), \ + _mm_xor_si128( M[ 6], H[ 6] ) ), \ + _mm_xor_si128( M[ 9], H[ 9] ) ), \ + _mm_xor_si128( M[13], H[13] ) ) + + +void compress_big_2way( const __m128i *M, const __m128i H[16], + __m128i dH[16] ) +{ + __m128i qt[32], xl, xh; + + qt[ 0] = _mm_add_epi64( s2b0( W2b0 ), H[ 1] ); + qt[ 1] = _mm_add_epi64( s2b1( W2b1 ), H[ 2] ); + qt[ 2] = _mm_add_epi64( s2b2( W2b2 ), H[ 3] ); + qt[ 3] = _mm_add_epi64( s2b3( W2b3 ), H[ 4] ); + qt[ 4] = _mm_add_epi64( s2b4( W2b4 ), H[ 5] ); + qt[ 5] = _mm_add_epi64( s2b0( W2b5 ), H[ 6] ); + qt[ 6] = _mm_add_epi64( s2b1( W2b6 ), H[ 7] ); + qt[ 7] = _mm_add_epi64( s2b2( W2b7 ), H[ 8] ); + qt[ 8] = _mm_add_epi64( s2b3( W2b8 ), H[ 9] ); + qt[ 9] = _mm_add_epi64( s2b4( W2b9 ), H[10] ); + qt[10] = _mm_add_epi64( s2b0( W2b10), H[11] ); + qt[11] = _mm_add_epi64( s2b1( W2b11), H[12] ); + qt[12] = _mm_add_epi64( s2b2( W2b12), H[13] ); + qt[13] = _mm_add_epi64( s2b3( W2b13), H[14] ); + qt[14] = _mm_add_epi64( s2b4( W2b14), H[15] ); + qt[15] = _mm_add_epi64( s2b0( W2b15), H[ 0] ); + qt[16] = expand1_2b( qt, M, H, 16 ); + qt[17] = expand1_2b( qt, M, H, 17 ); + qt[18] = expand2_2b( qt, M, H, 18 ); + qt[19] = expand2_2b( qt, M, H, 19 ); + qt[20] = expand2_2b( qt, M, H, 20 ); + qt[21] = expand2_2b( qt, M, H, 21 ); + qt[22] = expand2_2b( qt, M, H, 22 ); + qt[23] = expand2_2b( qt, M, H, 23 ); + qt[24] = expand2_2b( qt, M, H, 24 ); + qt[25] = expand2_2b( qt, M, H, 25 ); + qt[26] = expand2_2b( qt, M, H, 26 ); + qt[27] = expand2_2b( qt, M, H, 27 ); + qt[28] = expand2_2b( qt, M, H, 28 ); + qt[29] = expand2_2b( qt, M, H, 29 ); + qt[30] = expand2_2b( qt, M, H, 30 ); + qt[31] = expand2_2b( qt, M, H, 31 ); + + xl = _mm_xor_si128( + _mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ), + _mm_xor_si128( qt[18], qt[19] ) ), + _mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ), + _mm_xor_si128( qt[22], qt[23] ) ) ); + xh = _mm_xor_si128( xl, + _mm_xor_si128( + _mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ), + _mm_xor_si128( qt[26], qt[27] ) ), + _mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ), + _mm_xor_si128( qt[30], qt[31] ) ) ) ); + + dH[ 0] = _mm_add_epi64( + _mm_xor_si128( M[0], + _mm_xor_si128( _mm_slli_epi64( xh, 5 ), + _mm_srli_epi64( qt[16], 5 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ) ); + dH[ 1] = _mm_add_epi64( + _mm_xor_si128( M[1], + _mm_xor_si128( _mm_srli_epi64( xh, 7 ), + _mm_slli_epi64( qt[17], 8 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ) ); + dH[ 2] = _mm_add_epi64( + _mm_xor_si128( M[2], + _mm_xor_si128( _mm_srli_epi64( xh, 
5 ), + _mm_slli_epi64( qt[18], 5 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ) ); + dH[ 3] = _mm_add_epi64( + _mm_xor_si128( M[3], + _mm_xor_si128( _mm_srli_epi64( xh, 1 ), + _mm_slli_epi64( qt[19], 5 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ) ); + dH[ 4] = _mm_add_epi64( + _mm_xor_si128( M[4], + _mm_xor_si128( _mm_srli_epi64( xh, 3 ), + _mm_slli_epi64( qt[20], 0 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ) ); + dH[ 5] = _mm_add_epi64( + _mm_xor_si128( M[5], + _mm_xor_si128( _mm_slli_epi64( xh, 6 ), + _mm_srli_epi64( qt[21], 6 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ) ); + dH[ 6] = _mm_add_epi64( + _mm_xor_si128( M[6], + _mm_xor_si128( _mm_srli_epi64( xh, 4 ), + _mm_slli_epi64( qt[22], 6 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ) ); + dH[ 7] = _mm_add_epi64( + _mm_xor_si128( M[7], + _mm_xor_si128( _mm_srli_epi64( xh, 11 ), + _mm_slli_epi64( qt[23], 2 ) ) ), + _mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ) ); + dH[ 8] = _mm_add_epi64( _mm_add_epi64( + mm128_rol_64( dH[4], 9 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] ) ), + _mm_xor_si128( _mm_slli_epi64( xl, 8 ), + _mm_xor_si128( qt[23], qt[ 8] ) ) ); + dH[ 9] = _mm_add_epi64( _mm_add_epi64( + mm128_rol_64( dH[5], 10 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] ) ), + _mm_xor_si128( _mm_srli_epi64( xl, 6 ), + _mm_xor_si128( qt[16], qt[ 9] ) ) ); + dH[10] = _mm_add_epi64( _mm_add_epi64( + mm128_rol_64( dH[6], 11 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] ) ), + _mm_xor_si128( _mm_slli_epi64( xl, 6 ), + _mm_xor_si128( qt[17], qt[10] ) ) ); + dH[11] = _mm_add_epi64( _mm_add_epi64( + mm128_rol_64( dH[7], 12 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )), + _mm_xor_si128( _mm_slli_epi64( xl, 4 ), + _mm_xor_si128( qt[18], qt[11] ) ) ); + dH[12] = _mm_add_epi64( _mm_add_epi64( + mm128_rol_64( dH[0], 13 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] ) ), + _mm_xor_si128( _mm_srli_epi64( xl, 3 ), + _mm_xor_si128( qt[19], qt[12] ) ) ); + dH[13] = _mm_add_epi64( _mm_add_epi64( + mm128_rol_64( dH[1], 14 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] ) ), + _mm_xor_si128( _mm_srli_epi64( xl, 4 ), + _mm_xor_si128( qt[20], qt[13] ) ) ); + dH[14] = _mm_add_epi64( _mm_add_epi64( + mm128_rol_64( dH[2], 15 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] ) ), + _mm_xor_si128( _mm_srli_epi64( xl, 7 ), + _mm_xor_si128( qt[21], qt[14] ) ) ); + dH[15] = _mm_add_epi64( _mm_add_epi64( + mm128_rol_64( dH[3], 16 ), + _mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] ) ), + _mm_xor_si128( _mm_srli_epi64( xl, 2 ), + _mm_xor_si128( qt[22], qt[15] ) ) ); +} + +static const __m128i final_b2[16] = +{ + { 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 }, + { 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 }, + { 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 }, + { 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 }, + { 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 }, + { 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 }, + { 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 }, + { 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 }, + { 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 }, + { 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 }, + { 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 }, + { 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 }, + { 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 }, + { 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 }, + { 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 }, + { 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf } +}; + +void bmw512_2way_init( bmw_2way_big_context *ctx ) +{ + ctx->H[ 0] = _mm_set1_epi64x( 
IV512[ 0] ); + ctx->H[ 1] = _mm_set1_epi64x( IV512[ 1] ); + ctx->H[ 2] = _mm_set1_epi64x( IV512[ 2] ); + ctx->H[ 3] = _mm_set1_epi64x( IV512[ 3] ); + ctx->H[ 4] = _mm_set1_epi64x( IV512[ 4] ); + ctx->H[ 5] = _mm_set1_epi64x( IV512[ 5] ); + ctx->H[ 6] = _mm_set1_epi64x( IV512[ 6] ); + ctx->H[ 7] = _mm_set1_epi64x( IV512[ 7] ); + ctx->H[ 8] = _mm_set1_epi64x( IV512[ 8] ); + ctx->H[ 9] = _mm_set1_epi64x( IV512[ 9] ); + ctx->H[10] = _mm_set1_epi64x( IV512[10] ); + ctx->H[11] = _mm_set1_epi64x( IV512[11] ); + ctx->H[12] = _mm_set1_epi64x( IV512[12] ); + ctx->H[13] = _mm_set1_epi64x( IV512[13] ); + ctx->H[14] = _mm_set1_epi64x( IV512[14] ); + ctx->H[15] = _mm_set1_epi64x( IV512[15] ); + ctx->ptr = 0; + ctx->bit_count = 0; +} + +void bmw512_2way( bmw_2way_big_context *ctx, const void *data, size_t len ) +{ + __m128i *buf = (__m128i*)ctx->buf; + __m128i htmp[16]; + __m128i *h1 = ctx->H; + __m128i *h2 = htmp; + size_t blen = len << 1; + size_t ptr = ctx->ptr; + size_t bptr = ctx->ptr << 1; + size_t vptr = ctx->ptr >> 3; +// const int buf_size = 128; // bytes of one lane, compatible with len + + ctx->bit_count += len << 3; + while ( blen > 0 ) + { + size_t clen = (sizeof ctx->buf ) - bptr; + if ( clen > blen ) + clen = blen; + memcpy( buf + vptr, data, clen ); + bptr += clen; + vptr = bptr >> 4; + data = (const unsigned char *)data + clen; + blen -= clen; + if ( ptr == (sizeof ctx->buf ) ) + { + __m128i *ht; + compress_big_2way( buf, h1, h2 ); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + ctx->ptr = ptr; + if ( h1 != ctx->H ) + memcpy_128( ctx->H, h1, 16 ); +} + +void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) +{ + __m128i h1[16], h2[16], *h; + __m128i *buf = (__m128i*)ctx->buf; + size_t vptr = ctx->ptr >> 3; +// unsigned bit_len = ( (unsigned)(ctx->ptr) << 1 ); + + buf[ vptr++ ] = _mm_set1_epi64x( 0x80 ); + h = ctx->H; + + if ( vptr == 16 ) + { + compress_big_2way( buf, h, h1 ); + vptr = 0; + h = h1; + } + memset_zero_128( buf + vptr, 16 - vptr - 1 ); + buf[ 15 ] = _mm_set1_epi64x( ctx->bit_count ); + compress_big_2way( buf, h, h2 ); + memcpy_128( buf, h2, 16 ); + compress_big_2way( buf, final_b2, h1 ); + memcpy( (__m128i*)dst, h1+16, 8 ); +} + +#endif // __SSE2__ + + + +#if defined(__AVX2__) + +// BMW-512 4 way 64 + + +#define sb0(x) \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \ + _mm256_slli_epi64( (x), 3) ), \ + _mm256_xor_si256( mm256_rol_64( (x), 4), \ + mm256_rol_64( (x), 37) ) ) + +#define sb1(x) \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \ + _mm256_slli_epi64( (x), 2) ), \ + _mm256_xor_si256( mm256_rol_64( (x), 13), \ + mm256_rol_64( (x), 43) ) ) + +#define sb2(x) \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \ + _mm256_slli_epi64( (x), 1) ), \ + _mm256_xor_si256( mm256_rol_64( (x), 19), \ + mm256_rol_64( (x), 53) ) ) + +#define sb3(x) \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \ + _mm256_slli_epi64( (x), 2) ), \ + _mm256_xor_si256( mm256_rol_64( (x), 28), \ + mm256_rol_64( (x), 59) ) ) + +#define sb4(x) \ + _mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) ) + +#define sb5(x) \ + _mm256_xor_si256( (x), _mm256_srli_epi64( (x), 2 ) ) + +#define rb1(x) mm256_rol_64( x, 5 ) +#define rb2(x) mm256_rol_64( x, 11 ) +#define rb3(x) mm256_rol_64( x, 27 ) +#define rb4(x) mm256_rol_64( x, 32 ) +#define rb5(x) mm256_rol_64( x, 37 ) +#define rb6(x) mm256_rol_64( x, 43 ) +#define rb7(x) mm256_rol_64( x, 53 ) + +#define rol_off_64( M, j, off ) \ + mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] 
, \ + ( ( (j) + (off) ) & 0xF ) + 1 ) + +#define add_elt_b( M, H, j ) \ + _mm256_xor_si256( \ + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \ + rol_off_64( M, j, 3 ) ), \ + rol_off_64( M, j, 10 ) ), \ + _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ + H[ ( (j)+7 ) & 0xF ] ) + +#define expand1b( qt, M, H, i ) \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( sb1( qt[ (i)-16 ] ), \ + sb2( qt[ (i)-15 ] ) ), \ + _mm256_add_epi64( sb3( qt[ (i)-14 ] ), \ + sb0( qt[ (i)-13 ] ) ) ), \ + _mm256_add_epi64( \ + _mm256_add_epi64( sb1( qt[ (i)-12 ] ), \ + sb2( qt[ (i)-11 ] ) ), \ + _mm256_add_epi64( sb3( qt[ (i)-10 ] ), \ + sb0( qt[ (i)- 9 ] ) ) ) ), \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( sb1( qt[ (i)- 8 ] ), \ + sb2( qt[ (i)- 7 ] ) ), \ + _mm256_add_epi64( sb3( qt[ (i)- 6 ] ), \ + sb0( qt[ (i)- 5 ] ) ) ), \ + _mm256_add_epi64( \ + _mm256_add_epi64( sb1( qt[ (i)- 4 ] ), \ + sb2( qt[ (i)- 3 ] ) ), \ + _mm256_add_epi64( sb3( qt[ (i)- 2 ] ), \ + sb0( qt[ (i)- 1 ] ) ) ) ) ), \ + add_elt_b( M, H, (i)-16 ) ) + +#define expand2b( qt, M, H, i) \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ) ), \ + _mm256_add_epi64( qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ) ), \ + _mm256_add_epi64( \ + _mm256_add_epi64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ) ), \ + _mm256_add_epi64( qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ) ) ), \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ) ), \ + _mm256_add_epi64( qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ) ), \ + _mm256_add_epi64( \ + _mm256_add_epi64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ) ), \ + _mm256_add_epi64( sb4( qt[ (i)- 2 ] ), \ + sb5( qt[ (i)- 1 ] ) ) ) ) ), \ + add_elt_b( M, H, (i)-16 ) ) + + +#define Wb0 \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_xor_si256( M[13], H[13] ) ), \ + _mm256_xor_si256( M[14], H[14] ) ) + +#define Wb1 \ + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ), \ + _mm256_xor_si256( M[14], H[14] ) ), \ + _mm256_xor_si256( M[15], H[15] ) ) + +#define Wb2 \ + _mm256_add_epi64( \ + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_xor_si256( M[12], H[12] ) ), \ + _mm256_xor_si256( M[15], H[15] ) ) + +#define Wb3 \ + _mm256_add_epi64( \ + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 1], H[ 1] ) ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_xor_si256( M[13], H[13] ) ) + +#define Wb4 \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ), \ + _mm256_xor_si256( M[14], H[14] ) ) + +#define Wb5 \ + _mm256_add_epi64( \ + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + 
_mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_xor_si256( M[12], H[12] ) ), \ + _mm256_xor_si256( M[15], H[15] ) ) + +#define Wb6 \ + _mm256_add_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \ + _mm256_xor_si256( M[ 0], H[ 0] ) ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ), \ + _mm256_xor_si256( M[13], H[13] ) ) + +#define Wb7 \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[12], H[12] ) ), \ + _mm256_xor_si256( M[14], H[14] ) ) + +#define Wb8 \ + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_xor_si256( M[13], H[13] ) ), \ + _mm256_xor_si256( M[15], H[15] ) ) + +#define Wb9 \ + _mm256_add_epi64( \ + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[14], H[14] ) ) + +#define Wb10 \ + _mm256_add_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ + _mm256_xor_si256( M[ 1], H[ 1] ) ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[15], H[15] ) ) + +#define Wb11 \ + _mm256_add_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ + _mm256_xor_si256( M[ 0], H[ 0] ) ), \ + _mm256_xor_si256( M[ 2], H[ 2] ) ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ) + +#define Wb12 \ + _mm256_add_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ + _mm256_xor_si256( M[ 3], H[ 3] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ) + +#define Wb13 \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( \ + _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ + _mm256_xor_si256( M[ 4], H[ 4] ) ), \ + _mm256_xor_si256( M[ 7], H[ 7] ) ), \ + _mm256_xor_si256( M[10], H[10] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ) + +#define Wb14 \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_add_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ + _mm256_xor_si256( M[ 5], H[ 5] ) ), \ + _mm256_xor_si256( M[ 8], H[ 8] ) ), \ + _mm256_xor_si256( M[11], H[11] ) ), \ + _mm256_xor_si256( M[12], H[12] ) ) + +#define Wb15 \ + _mm256_add_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( \ + _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ + _mm256_xor_si256( M[ 4], H[4] ) ), \ + _mm256_xor_si256( M[ 6], H[ 6] ) ), \ + _mm256_xor_si256( M[ 9], H[ 9] ) ), \ + _mm256_xor_si256( M[13], H[13] ) ) + +void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) +{ + __m256i qt[32], xl, xh; + + qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] ); + qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] ); + qt[ 2] = _mm256_add_epi64( sb2( Wb2 ), H[ 3] ); + qt[ 3] = _mm256_add_epi64( sb3( Wb3 ), H[ 4] ); + qt[ 4] = _mm256_add_epi64( sb4( Wb4 ), H[ 5] ); + qt[ 5] = _mm256_add_epi64( sb0( Wb5 ), H[ 6] ); + qt[ 6] = _mm256_add_epi64( sb1( Wb6 ), H[ 7] ); + qt[ 7] = 
_mm256_add_epi64( sb2( Wb7 ), H[ 8] ); + qt[ 8] = _mm256_add_epi64( sb3( Wb8 ), H[ 9] ); + qt[ 9] = _mm256_add_epi64( sb4( Wb9 ), H[10] ); + qt[10] = _mm256_add_epi64( sb0( Wb10), H[11] ); + qt[11] = _mm256_add_epi64( sb1( Wb11), H[12] ); + qt[12] = _mm256_add_epi64( sb2( Wb12), H[13] ); + qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] ); + qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] ); + qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] ); + qt[16] = expand1b( qt, M, H, 16 ); + qt[17] = expand1b( qt, M, H, 17 ); + qt[18] = expand2b( qt, M, H, 18 ); + qt[19] = expand2b( qt, M, H, 19 ); + qt[20] = expand2b( qt, M, H, 20 ); + qt[21] = expand2b( qt, M, H, 21 ); + qt[22] = expand2b( qt, M, H, 22 ); + qt[23] = expand2b( qt, M, H, 23 ); + qt[24] = expand2b( qt, M, H, 24 ); + qt[25] = expand2b( qt, M, H, 25 ); + qt[26] = expand2b( qt, M, H, 26 ); + qt[27] = expand2b( qt, M, H, 27 ); + qt[28] = expand2b( qt, M, H, 28 ); + qt[29] = expand2b( qt, M, H, 29 ); + qt[30] = expand2b( qt, M, H, 30 ); + qt[31] = expand2b( qt, M, H, 31 ); + + xl = _mm256_xor_si256( + _mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ), + _mm256_xor_si256( qt[18], qt[19] ) ), + _mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ), + _mm256_xor_si256( qt[22], qt[23] ) ) ); + xh = _mm256_xor_si256( xl, + _mm256_xor_si256( + _mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ), + _mm256_xor_si256( qt[26], qt[27] ) ), + _mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ), + _mm256_xor_si256( qt[30], qt[31] ) ))); + + dH[ 0] = _mm256_add_epi64( + _mm256_xor_si256( M[0], + _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ), + _mm256_srli_epi64( qt[16], 5 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] )); + dH[ 1] = _mm256_add_epi64( + _mm256_xor_si256( M[1], + _mm256_xor_si256( _mm256_srli_epi64( xh, 7 ), + _mm256_slli_epi64( qt[17], 8 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] )); + dH[ 2] = _mm256_add_epi64( + _mm256_xor_si256( M[2], + _mm256_xor_si256( _mm256_srli_epi64( xh, 5 ), + _mm256_slli_epi64( qt[18], 5 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] )); + dH[ 3] = _mm256_add_epi64( + _mm256_xor_si256( M[3], + _mm256_xor_si256( _mm256_srli_epi64( xh, 1 ), + _mm256_slli_epi64( qt[19], 5 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] )); + dH[ 4] = _mm256_add_epi64( + _mm256_xor_si256( M[4], + _mm256_xor_si256( _mm256_srli_epi64( xh, 3 ), + _mm256_slli_epi64( qt[20], 0 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] )); + dH[ 5] = _mm256_add_epi64( + _mm256_xor_si256( M[5], + _mm256_xor_si256( _mm256_slli_epi64( xh, 6 ), + _mm256_srli_epi64( qt[21], 6 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] )); + dH[ 6] = _mm256_add_epi64( + _mm256_xor_si256( M[6], + _mm256_xor_si256( _mm256_srli_epi64( xh, 4 ), + _mm256_slli_epi64( qt[22], 6 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] )); + dH[ 7] = _mm256_add_epi64( + _mm256_xor_si256( M[7], + _mm256_xor_si256( _mm256_srli_epi64( xh, 11 ), + _mm256_slli_epi64( qt[23], 2 ) ) ), + _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] )); + dH[ 8] = _mm256_add_epi64( _mm256_add_epi64( + mm256_rol_64( dH[4], 9 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )), + _mm256_xor_si256( _mm256_slli_epi64( xl, 8 ), + _mm256_xor_si256( qt[23], qt[ 8] ) ) ); + dH[ 9] = _mm256_add_epi64( _mm256_add_epi64( + mm256_rol_64( dH[5], 10 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )), + _mm256_xor_si256( _mm256_srli_epi64( xl, 6 ), + 
_mm256_xor_si256( qt[16], qt[ 9] ) ) ); + dH[10] = _mm256_add_epi64( _mm256_add_epi64( + mm256_rol_64( dH[6], 11 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )), + _mm256_xor_si256( _mm256_slli_epi64( xl, 6 ), + _mm256_xor_si256( qt[17], qt[10] ) ) ); + dH[11] = _mm256_add_epi64( _mm256_add_epi64( + mm256_rol_64( dH[7], 12 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )), + _mm256_xor_si256( _mm256_slli_epi64( xl, 4 ), + _mm256_xor_si256( qt[18], qt[11] ) ) ); + dH[12] = _mm256_add_epi64( _mm256_add_epi64( + mm256_rol_64( dH[0], 13 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )), + _mm256_xor_si256( _mm256_srli_epi64( xl, 3 ), + _mm256_xor_si256( qt[19], qt[12] ) ) ); + dH[13] = _mm256_add_epi64( _mm256_add_epi64( + mm256_rol_64( dH[1], 14 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )), + _mm256_xor_si256( _mm256_srli_epi64( xl, 4 ), + _mm256_xor_si256( qt[20], qt[13] ) ) ); + dH[14] = _mm256_add_epi64( _mm256_add_epi64( + mm256_rol_64( dH[2], 15 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )), + _mm256_xor_si256( _mm256_srli_epi64( xl, 7 ), + _mm256_xor_si256( qt[21], qt[14] ) ) ); + dH[15] = _mm256_add_epi64( _mm256_add_epi64( + mm256_rol_64( dH[3], 16 ), + _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )), + _mm256_xor_si256( _mm256_srli_epi64( xl, 2 ), + _mm256_xor_si256( qt[22], qt[15] ) ) ); +} + +static const __m256i final_b[16] = +{ + { 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0, + 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 }, + { 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1, + 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 }, + { 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2, + 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 }, + { 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3, + 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 }, + { 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4, + 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 }, + { 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5, + 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 }, + { 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6, + 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 }, + { 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7, + 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 }, + { 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8, + 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 }, + { 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9, + 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 }, + { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, + 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa }, + { 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab, + 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab }, + { 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac, + 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac }, + { 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad, + 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad }, + { 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae, + 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae }, + { 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf, + 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf } +}; + +static void +bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv ) +{ + for ( int i = 0; i < 16; i++ ) + sc->H[i] = _mm256_set1_epi64x( iv[i] ); + sc->ptr = 0; + sc->bit_count = 0; +} + +static void +bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len ) +{ + __m256i *vdata = (__m256i*)data; + __m256i *buf; + __m256i htmp[16]; + __m256i *h1, *h2; + size_t ptr; + const int buf_size = 128; // bytes of one lane, compatible with len + + sc->bit_count += (sph_u64)len << 3; + buf = sc->buf; + ptr = sc->ptr; + h1 = sc->H; + h2 = htmp; + while ( len > 0 ) + { + size_t clen; + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_256( buf + (ptr>>3), 
vdata, clen >> 3 ); + vdata = vdata + (clen>>3); + len -= clen; + ptr += clen; + if ( ptr == buf_size ) + { + __m256i *ht; + compress_big( buf, h1, h2 ); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + sc->ptr = ptr; + if ( h1 != sc->H ) + memcpy_256( sc->H, h1, 16 ); +} + +static void +bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w64) +{ + __m256i *buf; + __m256i h1[16], h2[16], *h; + size_t ptr, u, v; + unsigned z; + const int buf_size = 128; // bytes of one lane, compatible with len + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ ptr>>3 ] = _mm256_set1_epi64x( z ); + ptr += 8; + h = sc->H; + + if ( ptr > (buf_size - 8) ) + { + memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); + compress_big( buf, h, h1 ); + ptr = 0; + h = h1; + } + memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 ); + buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( sc->bit_count + n ); + compress_big( buf, h, h2 ); + for ( u = 0; u < 16; u ++ ) + buf[u] = h2[u]; + compress_big( buf, final_b, h1 ); + for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) + casti_m256i(dst,u) = h1[v]; +} + +void +bmw512_4way_init(void *cc) +{ + bmw64_4way_init(cc, IV512); +} + +void +bmw512_4way(void *cc, const void *data, size_t len) +{ + bmw64_4way(cc, data, len); +} + +void +bmw512_4way_close(void *cc, void *dst) +{ + bmw512_4way_addbits_and_close(cc, 0, 0, dst); +} + +void +bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw64_4way_close(cc, ub, n, dst, 8); +} + +#endif // __AVX2__ + +#ifdef __cplusplus +} +#endif + diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index 18d4655..c503308 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -90,7 +90,7 @@ void allium_4way_hash( void *state, const void *input ) } int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); @@ -100,40 +100,47 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; const uint32_t Htarg = ptarget[7]; - uint32_t *nonces = work->nonces; - int num_found = 0; - uint32_t *noncep = vdata + 76; // 19*4 + __m128i *noncev = (__m128i*)vdata + 19; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; - swab32_array( edata, pdata, 20 ); + casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); + casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); + casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); + casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); + casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 ); blake256_4way_init( &allium_4way_ctx.blake ); blake256_4way( &allium_4way_ctx.blake, vdata, 64 ); do { - be32enc( noncep, n ); - be32enc( noncep+1, n+1 ); - be32enc( noncep+2, n+2 ); - be32enc( noncep+3, n+3 ); + *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); allium_4way_hash( hash, vdata ); pdata[19] = n; - for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) ) + for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg ) { - 
pdata[19] = n+i; - nonces[ num_found++ ] = n+i; - work_set_target_ratio( work, hash+(i<<3) ); + if ( fulltest( hash+(lane<<3), ptarget ) ) + { + pdata[19] = n + lane; + work_set_target_ratio( work, hash+(lane<<3) ); + if ( submit_work( mythr, work ) ) + applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.", + accepted_share_count + rejected_share_count + 1, + thr_id, lane ); + else + applog( LOG_WARNING, "Failed to submit share." ); + } } n += 4; - } while ( (num_found == 0) && (n < max_nonce-4) - && !work_restart[thr_id].restart); + } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/lyra2/allium.c b/algo/lyra2/allium.c index bf98929..f46a037 100644 --- a/algo/lyra2/allium.c +++ b/algo/lyra2/allium.c @@ -70,7 +70,7 @@ void allium_hash(void *state, const void *input) } int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t _ALIGN(128) hash[8]; uint32_t _ALIGN(128) endiandata[20]; @@ -80,6 +80,7 @@ int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t nonce = first_nonce; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated if ( opt_benchmark ) ptarget[7] = 0x3ffff; diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h index 24957c6..617dc8a 100644 --- a/algo/lyra2/lyra2-gate.h +++ b/algo/lyra2/lyra2-gate.h @@ -43,14 +43,14 @@ bool register_lyra2rev2_algo( algo_gate_t* gate ); void lyra2rev2_4way_hash( void *state, const void *input ); int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); bool init_lyra2rev2_4way_ctx(); #else void lyra2rev2_hash( void *state, const void *input ); int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); bool init_lyra2rev2_ctx(); #endif @@ -61,7 +61,7 @@ bool init_lyra2rev2_ctx(); #define LYRA2Z_4WAY #endif #if defined(__AVX2__) -// #define LYRA2Z_8WAY + #define LYRA2Z_8WAY #endif @@ -71,21 +71,21 @@ bool init_lyra2rev2_ctx(); void lyra2z_8way_hash( void *state, const void *input ); int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); bool lyra2z_8way_thread_init(); #elif defined(LYRA2Z_4WAY) void lyra2z_4way_hash( void *state, const void *input ); int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); bool lyra2z_4way_thread_init(); #else void lyra2z_hash( void *state, const void *input ); int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); bool lyra2z_thread_init(); #endif @@ -102,14 +102,14 @@ bool lyra2z_thread_init(); void lyra2h_4way_hash( void *state, const void *input ); int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); bool lyra2h_4way_thread_init(); #else void lyra2h_hash( void *state, const void *input ); int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t 
*hashes_done, struct thr_info *mythr ); bool lyra2h_thread_init(); #endif @@ -126,14 +126,14 @@ bool register_allium_algo( algo_gate_t* gate ); void allium_4way_hash( void *state, const void *input ); int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); bool init_allium_4way_ctx(); #else void allium_hash( void *state, const void *input ); int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); bool init_allium_ctx(); #endif @@ -146,7 +146,7 @@ bool register_phi2_algo( algo_gate_t* gate ); void phi2_hash( void *state, const void *input ); int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); void init_phi2_ctx(); #endif // LYRA2_GATE_H__ diff --git a/algo/lyra2/lyra2h-4way.c b/algo/lyra2/lyra2h-4way.c index 16bb5aa..e3640d2 100644 --- a/algo/lyra2/lyra2h-4way.c +++ b/algo/lyra2/lyra2h-4way.c @@ -50,7 +50,7 @@ void lyra2h_4way_hash( void *state, const void *input ) } int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); @@ -63,6 +63,7 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce, uint32_t *nonces = work->nonces; int num_found = 0; uint32_t *noncep= vdata + 76; // 19*4 + /* int */ thr_id = mythr->id; // thr_id arg is deprecated if ( opt_benchmark ) ptarget[7] = 0x0000ff; diff --git a/algo/lyra2/lyra2h.c b/algo/lyra2/lyra2h.c index 92423aa..5a054d7 100644 --- a/algo/lyra2/lyra2h.c +++ b/algo/lyra2/lyra2h.c @@ -36,7 +36,7 @@ void lyra2h_hash( void *state, const void *input ) } int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t _ALIGN(64) hash[8]; uint32_t _ALIGN(64) endiandata[20]; @@ -45,6 +45,7 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t nonce = first_nonce; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated if (opt_benchmark) ptarget[7] = 0x0000ff; diff --git a/algo/lyra2/lyra2re.c b/algo/lyra2/lyra2re.c index 5d3a475..9873685 100644 --- a/algo/lyra2/lyra2re.c +++ b/algo/lyra2/lyra2re.c @@ -81,8 +81,8 @@ void lyra2re_hash(void *state, const void *input) memcpy(state, hashA, 32); } -int scanhash_lyra2re(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) +int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -91,6 +91,7 @@ int scanhash_lyra2re(int thr_id, struct work *work, const uint32_t first_nonce = pdata[19]; uint32_t nonce = first_nonce; const uint32_t Htarg = ptarget[7]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated swab32_array( endiandata, pdata, 20 ); diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c index e814c2e..f11b5d6 100644 --- a/algo/lyra2/lyra2rev2-4way.c +++ b/algo/lyra2/lyra2rev2-4way.c @@ -82,7 +82,7 @@ void lyra2rev2_4way_hash( void *state, const void *input ) } int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t 
max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); @@ -95,6 +95,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce, uint32_t *nonces = work->nonces; int num_found = 0; uint32_t *noncep = vdata + 76; // 19*4 + /* int */ thr_id = mythr->id; // thr_id arg is deprecated if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; diff --git a/algo/lyra2/lyra2rev2.c b/algo/lyra2/lyra2rev2.c index e3d4f0b..88578e7 100644 --- a/algo/lyra2/lyra2rev2.c +++ b/algo/lyra2/lyra2rev2.c @@ -73,7 +73,7 @@ void lyra2rev2_hash( void *state, const void *input ) } int scanhash_lyra2rev2(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) + uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -82,6 +82,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work, const uint32_t first_nonce = pdata[19]; uint32_t nonce = first_nonce; const uint32_t Htarg = ptarget[7]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0000ff; diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c index 540439e..94e12d6 100644 --- a/algo/lyra2/lyra2rev3-4way.c +++ b/algo/lyra2/lyra2rev3-4way.c @@ -74,7 +74,6 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; const uint32_t Htarg = ptarget[7]; - int num_found = 0; __m128i *noncev = (__m128i*)vdata + 19; // aligned /* int */ thr_id = mythr->id; // thr_id arg is deprecated diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index db8cdd4..6dba5c9 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -50,7 +50,7 @@ void lyra2z_4way_hash( void *state, const void *input ) } int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); @@ -60,25 +60,23 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - uint32_t *nonces = work->nonces; - int num_found = 0; - uint32_t *noncep = vdata + 76; // 19*4 + __m128i *noncev = (__m128i*)vdata + 19; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated if ( opt_benchmark ) ptarget[7] = 0x0000ff; - for ( int i=0; i < 20; i++ ) - be32enc( &edata[i], pdata[i] ); - + casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); + casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); + casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); + casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); + casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 ); lyra2z_4way_midstate( vdata ); do { - be32enc( noncep, n ); - be32enc( noncep+1, n+1 ); - be32enc( noncep+2, n+2 ); - be32enc( noncep+3, n+3 ); + *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); lyra2z_4way_hash( hash, vdata ); pdata[19] = n; @@ -87,15 +85,19 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce, if ( (hash+(i<<3))[7] 
<= Htarg && fulltest( hash+(i<<3), ptarget ) ) { pdata[19] = n+i; - nonces[ num_found++ ] = n+i; work_set_target_ratio( work, hash+(i<<3) ); + if ( submit_work( mythr, work ) ) + applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.", + accepted_share_count + rejected_share_count + 1, + thr_id, i ); + else + applog( LOG_WARNING, "Failed to submit share." ); } n += 4; - } while ( (num_found == 0) && (n < max_nonce-4) - && !work_restart[thr_id].restart); + } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif @@ -150,14 +152,14 @@ void lyra2z_8way_hash( void *state, const void *input ) memcpy( state+ 32, hash1, 32 ); memcpy( state+ 64, hash2, 32 ); memcpy( state+ 96, hash3, 32 ); - memcpy( state+128, hash1, 32 ); - memcpy( state+160, hash2, 32 ); - memcpy( state+192, hash3, 32 ); - memcpy( state+224, hash1, 32 ); + memcpy( state+128, hash4, 32 ); + memcpy( state+160, hash5, 32 ); + memcpy( state+192, hash6, 32 ); + memcpy( state+224, hash7, 32 ); } int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8*8] __attribute__ ((aligned (64))); uint32_t vdata[20*8] __attribute__ ((aligned (64))); @@ -167,15 +169,15 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - uint32_t *nonces = work->nonces; - int num_found = 0; - uint32_t *noncep = vdata + 152; // 19*8 + __m256i *noncev = (__m256i*)vdata + 19; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated if ( opt_benchmark ) ptarget[7] = 0x0000ff; - for ( int i=0; i < 19; i++ ) - be32enc( &edata[i], pdata[i] ); + casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); + casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); + casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); mm256_interleave_8x32( vdata, edata, edata, edata, edata, edata, edata, edata, edata, 640 ); @@ -183,15 +185,8 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce, lyra2z_8way_midstate( vdata ); do { - be32enc( noncep, n ); - be32enc( noncep+1, n+1 ); - be32enc( noncep+2, n+2 ); - be32enc( noncep+3, n+3 ); - be32enc( noncep+4, n+4 ); - be32enc( noncep+5, n+5 ); - be32enc( noncep+6, n+6 ); - be32enc( noncep+7, n+7 ); - + *noncev = mm256_bswap_32( + _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) ); lyra2z_8way_hash( hash, vdata ); pdata[19] = n; @@ -199,15 +194,19 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce, if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) ) { pdata[19] = n+i; - nonces[ num_found++ ] = n+i; work_set_target_ratio( work, hash+(i<<3) ); + if ( submit_work( mythr, work ) ) + applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.", + accepted_share_count + rejected_share_count + 1, + thr_id, i ); + else + applog( LOG_WARNING, "Failed to submit share." 
); } n += 8; - } while ( (num_found == 0) && (n < max_nonce-4) - && !work_restart[thr_id].restart); + } while ( (n < max_nonce-8) && !work_restart[thr_id].restart); *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } diff --git a/algo/lyra2/lyra2z.c b/algo/lyra2/lyra2z.c index ab3d736..1bc858b 100644 --- a/algo/lyra2/lyra2z.c +++ b/algo/lyra2/lyra2z.c @@ -44,7 +44,7 @@ void lyra2z_hash( void *state, const void *input ) } int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t _ALIGN(64) hash[8]; uint32_t _ALIGN(64) endiandata[20]; @@ -53,6 +53,7 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t nonce = first_nonce; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated if (opt_benchmark) ptarget[7] = 0x0000ff; diff --git a/algo/lyra2/lyra2z330.c b/algo/lyra2/lyra2z330.c index d213bd7..a0f4988 100644 --- a/algo/lyra2/lyra2z330.c +++ b/algo/lyra2/lyra2z330.c @@ -16,7 +16,7 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height) } int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8] __attribute__ ((aligned (64))); uint32_t endiandata[20] __attribute__ ((aligned (64))); @@ -25,6 +25,7 @@ int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t nonce = first_nonce; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated if (opt_benchmark) ptarget[7] = 0x0000ff; diff --git a/algo/lyra2/phi2.c b/algo/lyra2/phi2.c index b573ae0..93efd7b 100644 --- a/algo/lyra2/phi2.c +++ b/algo/lyra2/phi2.c @@ -92,42 +92,50 @@ void phi2_hash(void *state, const void *input) memcpy(state, hash, 32); } -int scanhash_phi2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done) +int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t _ALIGN(128) hash[8]; - uint32_t _ALIGN(128) endiandata[36]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t _ALIGN(128) hash[8]; + uint32_t _ALIGN(128) endiandata[36]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; + if(opt_benchmark){ + ptarget[7] = 0x00ff; + } - if(opt_benchmark){ - ptarget[7] = 0x00ff; - } + phi2_has_roots = false; + for ( int i=0; i < 36; i++ ) + { + be32enc(&endiandata[i], pdata[i]); + if (i >= 20 && pdata[i]) phi2_has_roots = true; + } - phi2_has_roots = false; - for (int i=0; i < 36; i++) { - be32enc(&endiandata[i], pdata[i]); - if (i >= 20 && pdata[i]) phi2_has_roots = true; - } + do { + be32enc( &endiandata[19], n ); + phi2_hash( hash, endiandata ); - do { - be32enc(&endiandata[19], n); - phi2_hash(hash, endiandata); - - if (hash[7] < Htarg && fulltest(hash, ptarget)) { - work_set_target_ratio(work, hash); + if ( hash[7] < Htarg && fulltest( hash, ptarget ) ) + { + pdata[19] = n; + work_set_target_ratio( work, hash ); + if ( submit_work( mythr, work ) ) + applog( LOG_NOTICE, "Share %d 
submitted by thread %d.", + accepted_share_count + rejected_share_count + 1, + thr_id ); + else + applog( LOG_WARNING, "Failed to submit share." ); *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 1; - } - n++; + } + n++; - } while (n < max_nonce && !work_restart[thr_id].restart); + } while ( n < max_nonce && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; } diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 67b0962..bf20341 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -91,7 +91,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ -#elif defined(__SSE4_2__) +#elif defined(__SSE2__) // process 2 columns in parallel // returns void, all args updated @@ -132,7 +132,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ -#endif // AVX2 else SSE4_2 +#endif // AVX2 else SSE2 // Scalar //Blake2b's G function diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c new file mode 100644 index 0000000..6634051 --- /dev/null +++ b/algo/sha/sha256q-4way.c @@ -0,0 +1,219 @@ +#include "sha256t-gate.h" +#include +#include +#include +#include +#include "sha2-hash-4way.h" + +#if defined(SHA256T_8WAY) + +static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); + +void sha256q_8way_hash( void* output, const void* input ) +{ + uint32_t vhash[8*8] __attribute__ ((aligned (64))); + sha256_8way_context ctx; + memcpy( &ctx, &sha256_ctx8, sizeof ctx ); + + sha256_8way( &ctx, input + (64<<3), 16 ); + sha256_8way_close( &ctx, vhash ); + + sha256_8way_init( &ctx ); + sha256_8way( &ctx, vhash, 32 ); + sha256_8way_close( &ctx, vhash ); + + sha256_8way_init( &ctx ); + sha256_8way( &ctx, vhash, 32 ); + sha256_8way_close( &ctx, vhash ); + + sha256_8way_init( &ctx ); + sha256_8way( &ctx, vhash, 32 ); + sha256_8way_close( &ctx, output ); +} + +int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t hash[8*8] __attribute__ ((aligned (32))); + uint32_t edata[20] __attribute__ ((aligned (32)));; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + __m256i *noncev = (__m256i*)vdata + 19; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + + const uint64_t htmax[] = { 0, + 0xF, + 0xFF, + 0xFFF, + 0xFFFF, + 0x10000000 }; + const uint32_t masks[] = { 0xFFFFFFFF, + 0xFFFFFFF0, + 0xFFFFFF00, + 0xFFFFF000, + 0xFFFF0000, + 0 }; + + // Need big endian data + casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); + casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); + casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + + mm256_interleave_8x32( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + sha256_8way_init( &sha256_ctx8 ); + sha256_8way( &sha256_ctx8, vdata, 64 ); + + for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + *noncev = mm256_bswap_32( + _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) ); + + pdata[19] = n; + + sha256q_8way_hash( hash, vdata ); + + uint32_t *hash7 = &(hash[7<<3]); + 
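+         // hash7 points to the interleaved word-7 values of all 8 lanes
+         // (hash[7*8 + lane]); a lane is only deinterleaved and fully
+         // tested after its word 7 passes the cheap mask check below.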
+ for ( int lane = 0; lane < 8; lane++ ) + if ( !( hash7[ lane ] & mask ) ) + { + // deinterleave hash for lane + uint32_t lane_hash[8]; + mm256_extract_lane_8x32( lane_hash, hash, lane, 256 ); + + if ( fulltest( lane_hash, ptarget ) ) + { + pdata[19] = n + lane; + work_set_target_ratio( work, lane_hash ); + if ( submit_work( mythr, work ) ) + applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.", + accepted_share_count + rejected_share_count + 1, + thr_id, lane ); + else + applog( LOG_WARNING, "Failed to submit share." ); + } + } + n += 8; + + } while ( (n < max_nonce-10) && !work_restart[thr_id].restart ); + break; + } + + *hashes_done = n - first_nonce + 1; + return 0; +} + +#endif + +#if defined(SHA256T_4WAY) + +static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64))); + +void sha256q_4way_hash( void* output, const void* input ) +{ + uint32_t vhash[8*4] __attribute__ ((aligned (64))); + sha256_4way_context ctx; + memcpy( &ctx, &sha256_ctx4, sizeof ctx ); + + sha256_4way( &ctx, input + (64<<2), 16 ); + sha256_4way_close( &ctx, vhash ); + + sha256_4way_init( &ctx ); + sha256_4way( &ctx, vhash, 32 ); + sha256_4way_close( &ctx, vhash ); + + sha256_4way_init( &ctx ); + sha256_4way( &ctx, vhash, 32 ); + sha256_4way_close( &ctx, vhash ); + + sha256_4way_init( &ctx ); + sha256_4way( &ctx, vhash, 32 ); + sha256_4way_close( &ctx, output ); +} + +int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t hash[8*4] __attribute__ ((aligned (32))); + uint32_t *hash7 = &(hash[7<<2]); + uint32_t lane_hash[8]; + uint32_t edata[20] __attribute__ ((aligned (32)));; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + __m128i *noncev = (__m128i*)vdata + 19; // aligned + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + + const uint64_t htmax[] = { 0, + 0xF, + 0xFF, + 0xFFF, + 0xFFFF, + 0x10000000 }; + const uint32_t masks[] = { 0xFFFFFFFF, + 0xFFFFFFF0, + 0xFFFFFF00, + 0xFFFFF000, + 0xFFFF0000, + 0 }; + + casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); + casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); + casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); + casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); + casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + + mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 ); + sha256_4way_init( &sha256_ctx4 ); + sha256_4way( &sha256_ctx4, vdata, 64 ); + + for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do { + *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); + pdata[19] = n; + + sha256q_4way_hash( hash, vdata ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( !( hash7[ lane ] & mask ) ) + { + mm128_extract_lane_4x32( lane_hash, hash, lane, 256 ); + + if ( fulltest( lane_hash, ptarget ) ) + { + pdata[19] = n + lane; + work_set_target_ratio( work, lane_hash ); + if ( submit_work( mythr, work ) ) + applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.", + accepted_share_count + rejected_share_count + 1, + thr_id, lane ); + else + applog( LOG_WARNING, "Failed to submit share." 
); + } + } + + n += 4; + + } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart ); + break; + } + + *hashes_done = n - first_nonce + 1; + return 0; +} + +#endif + diff --git a/algo/sha/sha256q.c b/algo/sha/sha256q.c new file mode 100644 index 0000000..8e9007b --- /dev/null +++ b/algo/sha/sha256q.c @@ -0,0 +1,113 @@ +#include "sha256t-gate.h" +#include +#include +#include +#include +#include + +static __thread SHA256_CTX sha256q_ctx __attribute__ ((aligned (64))); + +void sha256q_midstate( const void* input ) +{ + SHA256_Init( &sha256q_ctx ); + SHA256_Update( &sha256q_ctx, input, 64 ); +} + +void sha256q_hash( void* output, const void* input ) +{ + uint32_t _ALIGN(64) hash[16]; + const int midlen = 64; // bytes + const int tail = 80 - midlen; // 16 + + SHA256_CTX ctx __attribute__ ((aligned (64))); + memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx ); + + SHA256_Update( &ctx, input + midlen, tail ); + SHA256_Final( (unsigned char*)hash, &ctx ); + + SHA256_Init( &ctx ); + SHA256_Update( &ctx, hash, 32 ); + SHA256_Final( (unsigned char*)hash, &ctx ); + + SHA256_Init( &ctx ); + SHA256_Update( &ctx, hash, 32 ); + SHA256_Final( (unsigned char*)hash, &ctx ); + + SHA256_Init( &ctx ); + SHA256_Update( &ctx, hash, 32 ); + SHA256_Final( (unsigned char*)hash, &ctx ); + + memcpy( output, hash, 32 ); +} + +int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; +#ifdef _MSC_VER + uint32_t __declspec(align(32)) hash64[8]; +#else + uint32_t hash64[8] __attribute__((aligned(32))); +#endif + uint32_t endiandata[32]; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + + uint64_t htmax[] = { + 0, + 0xF, + 0xFF, + 0xFFF, + 0xFFFF, + 0x10000000 + }; + uint32_t masks[] = { + 0xFFFFFFFF, + 0xFFFFFFF0, + 0xFFFFFF00, + 0xFFFFF000, + 0xFFFF0000, + 0 + }; + + // we need bigendian data... + casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); + casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); + casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); + casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); + casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + + sha256q_midstate( endiandata ); + + for ( int m = 0; m < 6; m++ ) + { + if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do { + pdata[19] = ++n; + be32enc(&endiandata[19], n); + sha256q_hash( hash64, endiandata ); + if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) ) + { + work_set_target_ratio( work, hash64 ); + if ( submit_work( mythr, work ) ) + applog( LOG_NOTICE, "Share %d submitted by thread %d.", + accepted_share_count + rejected_share_count + 1, + thr_id ); + else + applog( LOG_WARNING, "Failed to submit share." 
); + *hashes_done = n - first_nonce + 1; + } + } while ( n < max_nonce && !work_restart[thr_id].restart ); + break; + } + } + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index b42cf0c..cefc7ef 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -83,7 +83,7 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce, if ( !( hash7[ lane ] & mask ) ) { // deinterleave hash for lane - uint32_t lane_hash[8]; + uint32_t lane_hash[8] __attribute__ ((aligned (64))); mm256_extract_lane_8x32( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) ) @@ -138,9 +138,9 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce, { uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t hash[8*4] __attribute__ ((aligned (32))); - uint32_t *hash7 = &(hash[7<<2]); - uint32_t lane_hash[8]; + uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t edata[20] __attribute__ ((aligned (32)));; + uint32_t *hash7 = &(hash[7<<2]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t Htarg = ptarget[7]; diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index 52562d2..33b168d 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -3,15 +3,15 @@ bool register_sha256t_algo( algo_gate_t* gate ) { #if defined(SHA256T_8WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_sha256t_8way; gate->hash = (void*)&sha256t_8way_hash; #elif defined(SHA256T_4WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_sha256t_4way; gate->hash = (void*)&sha256t_4way_hash; #else - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; +gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256t; gate->hash = (void*)&sha256t_hash; #endif @@ -19,3 +19,23 @@ bool register_sha256t_algo( algo_gate_t* gate ) return true; } +bool register_sha256q_algo( algo_gate_t* gate ) +{ +#if defined(SHA256T_8WAY) + gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; + gate->scanhash = (void*)&scanhash_sha256q_8way; + gate->hash = (void*)&sha256q_8way_hash; +#elif defined(SHA256T_4WAY) + gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; + gate->scanhash = (void*)&scanhash_sha256q_4way; + gate->hash = (void*)&sha256q_4way_hash; +#else + gate->optimizations = SHA_OPT; + gate->scanhash = (void*)&scanhash_sha256q; + gate->hash = (void*)&sha256q_hash; +#endif + gate->get_max64 = (void*)&get_max64_0x3ffff; + return true; + +} + diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index 5183374..ae0a8f1 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -15,24 +15,34 @@ #endif #endif -bool register_blake2s_algo( algo_gate_t* gate ); +bool register_sha256t_algo( algo_gate_t* gate ); +bool register_sha256q_algo( algo_gate_t* gate ); #if defined(SHA256T_8WAY) void sha256t_8way_hash( void *output, const void *input ); int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +void sha256q_8way_hash( void *output, const void *input ); +int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); #elif defined(SHA256T_4WAY) void sha256t_4way_hash( void *output, const void *input ); int scanhash_sha256t_4way( 
int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +void sha256q_4way_hash( void *output, const void *input ); +int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); #else void sha256t_hash( void *output, const void *input ); int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +void sha256q_hash( void *output, const void *input ); +int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); #endif diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index ae5f96c..6917ff7 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -70,8 +70,11 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce, }; // we need bigendian data... - for ( int k = 0; k < 19; k++ ) - be32enc( &endiandata[k], pdata[k] ); + casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); + casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); + casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); + casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); + casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); sha256t_midstate( endiandata ); @@ -87,7 +90,13 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce, if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) ) { *hashes_done = n - first_nonce + 1; - return true; + work_set_target_ratio( work, hash64 ); + if ( submit_work( mythr, work ) ) + applog( LOG_NOTICE, "Share %d submitted by thread %d.", + accepted_share_count + rejected_share_count + 1, + thr_id ); + else + applog( LOG_WARNING, "Failed to submit share." ); } } while ( n < max_nonce && !work_restart[thr_id].restart ); break; diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index 15149df..f29260e 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -346,7 +346,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst, memcpy( buf + ptr, data, clen ); data = (const unsigned char *)data + clen; ptr += clen; - len -= clen >> 1; + len -= (clen >> 1); if ( ptr == sizeof ctx->buf ) { if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 ) @@ -365,16 +365,8 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst, } uint32_t vp = ptr>>5; - - // Terminating byte then zero pad - casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ); - - // Zero pad full vectors up to count - for ( ; vp < 6; vp++ ) - casti_m256i( buf, vp ) = m256_zero; - // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200 - // Count is misaligned to 16 bits and straddles a vector. + // Count is misaligned to 16 bits and straddles 2 vectors. // Use u32 overlay to stage then u16 to load buf. union { @@ -387,6 +379,18 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst, count.u32[2] = ctx->count2; count.u32[3] = ctx->count3; + if ( vp == 0 ) // empty buf, xevan. + { + casti_m256i( buf, 0 ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ); + memset_zero_256( (__m256i*)buf + 1, 5 ); + ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; + } + else // half full buf, everyone else. 
+ { + casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ); + memset_zero_256( (__m256i*)buf + vp, 6 - vp ); + } + casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0, count.u16[0], 0,0,0,0,0,0,0 ); casti_m256i( buf, 7 ) = _mm256_set_epi16( diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index 3c45405..ff81623 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -25,7 +25,8 @@ #include "algo/haval/haval-hash-4way.h" #include "algo/sha/sha2-hash-4way.h" -typedef struct { +union _sonoa_4way_context_overlay +{ blake512_4way_context blake; bmw512_4way_context bmw; hashState_groestl groestl; @@ -43,8 +44,10 @@ typedef struct { sph_whirlpool_context whirlpool; sha512_4way_context sha512; haval256_5_4way_context haval; -} sonoa_4way_ctx_holder; +}; +typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay; +/* sonoa_4way_ctx_holder sonoa_4way_ctx __attribute__ ((aligned (64))); void init_sonoa_4way_ctx() @@ -67,6 +70,7 @@ void init_sonoa_4way_ctx() sha512_4way_init( &sonoa_4way_ctx.sha512 ); haval256_5_4way_init( &sonoa_4way_ctx.haval ); }; +*/ void sonoa_4way_hash( void *state, const void *input ) { @@ -77,19 +81,23 @@ void sonoa_4way_hash( void *state, const void *input ) uint64_t vhash[8*4] __attribute__ ((aligned (64))); uint64_t vhashA[8*4] __attribute__ ((aligned (64))); uint64_t vhashB[8*4] __attribute__ ((aligned (64))); - sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) ); + sonoa_4way_context_overlay ctx; +// sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64))); +// memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) ); // 1 + blake512_4way_init( &ctx.blake ); blake512_4way( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); + bmw512_4way_init( &ctx.bmw ); bmw512_4way( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); @@ -100,29 +108,36 @@ void sonoa_4way_hash( void *state, const void *input ) mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + skein512_4way_init( &ctx.skein ); skein512_4way( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); + jh512_4way_init( &ctx.jh ); jh512_4way( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); + keccak512_4way_init( &ctx.keccak ); keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + cube_2way_init( &ctx.cube, 512, 16, 32 ); cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_2way_init( &ctx.cube, 512, 16, 32 ); cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + shavite512_2way_init( &ctx.shavite ); shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); shavite512_2way_init( &ctx.shavite ); shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + simd_2way_init( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_2way_init( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); @@ -130,6 
+145,7 @@ void sonoa_4way_hash( void *state, const void *input ) mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 ); mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 ); + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); init_echo( &ctx.echo, 512 ); @@ -215,10 +231,12 @@ void sonoa_4way_hash( void *state, const void *input ) mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); // 3 + bmw512_4way_init( &ctx.bmw ); bmw512_4way( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); @@ -294,6 +312,7 @@ void sonoa_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); sph_fugue512_init( &ctx.fugue ); @@ -399,10 +418,11 @@ void sonoa_4way_hash( void *state, const void *input ) mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + shabal512_4way_init( &ctx.shabal ); shabal512_4way( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); - mm256_reinterleave_4x64( vhashB, vhash, 512 ); + mm256_reinterleave_4x32_4x64( vhashB, vhash, 512 ); hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way( &ctx.hamsi, vhashB, 64 ); @@ -438,7 +458,7 @@ void sonoa_4way_hash( void *state, const void *input ) bmw512_4way( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); - mm256_reinterleave_4x32( vhashB, vhash, 512 ); + mm256_reinterleave_4x64_4x32( vhashB, vhash, 512 ); shabal512_4way_init( &ctx.shabal ); shabal512_4way( &ctx.shabal, vhashB, 64 ); @@ -536,6 +556,7 @@ void sonoa_4way_hash( void *state, const void *input ) mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash0, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); sph_whirlpool_init( &ctx.whirlpool ); @@ -663,6 +684,7 @@ void sonoa_4way_hash( void *state, const void *input ) mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + sha512_4way_init( &ctx.sha512 ); sha512_4way( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); @@ -800,11 +822,11 @@ void sonoa_4way_hash( void *state, const void *input ) sha512_4way( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); - mm256_reinterleave_4x32( vhashB, vhash, 512 ); + mm256_reinterleave_4x64_4x32( vhashB, vhash, 512 ); + haval256_5_4way_init( &ctx.haval ); haval256_5_4way( &ctx.haval, vhashB, 64 ); haval256_5_4way_close( &ctx.haval, state ); - } int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce, diff --git a/algo/x17/sonoa-gate.c b/algo/x17/sonoa-gate.c index e74073b..b420564 100644 --- a/algo/x17/sonoa-gate.c +++ b/algo/x17/sonoa-gate.c @@ -3,7 +3,7 @@ bool register_sonoa_algo( algo_gate_t* gate ) { #if defined (SONOA_4WAY) - init_sonoa_4way_ctx(); +// init_sonoa_4way_ctx(); gate->scanhash = (void*)&scanhash_sonoa_4way; gate->hash = (void*)&sonoa_4way_hash; #else diff --git a/algo/x17/sonoa-gate.h b/algo/x17/sonoa-gate.h index f00efd8..05f03ff 100644 --- a/algo/x17/sonoa-gate.h +++ b/algo/x17/sonoa-gate.h @@ -17,7 +17,7 @@ void sonoa_4way_hash( void *state, const void *input ); int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void init_sonoa_4way_ctx(); +//void 
init_sonoa_4way_ctx(); #endif diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 4d7e8b8..21af850 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -14,7 +14,6 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cube-hash-2way.h" -#include "algo/shavite/sph_shavite.h" #include "algo/shavite/shavite-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -222,7 +221,7 @@ void x17_4way_hash( void *state, const void *input ) sha512_4way_close( &ctx.sha512, vhash ); // 17 Haval parallel 32 bit - mm256_reinterleave_4x32( vhashB, vhash, 512 ); + mm256_reinterleave_4x64_4x32( vhashB, vhash, 512 ); haval256_5_4way_init( &ctx.haval ); haval256_5_4way( &ctx.haval, vhashB, 64 ); @@ -258,18 +257,18 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *edata = (uint64_t*)endiandata; mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); - for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { - uint32_t mask = masks[m]; + uint32_t mask = masks[ m ]; do { *noncev = mm256_interleave_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); x17_4way_hash( hash, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( ( ( hash7[ lane ] & mask ) == 0 ) ) + if ( ( hash7[ lane ] & mask ) == 0 ) { mm128_extract_lane_4x32( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) ) diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 9b3ec48..22236db 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -12,8 +12,9 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/skein/skein-hash-4way.h" -#include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" +#include "algo/shavite/shavite-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -24,16 +25,17 @@ #include "algo/sha/sha2-hash-4way.h" #include "algo/haval/haval-hash-4way.h" -typedef struct { - blake512_4way_context blake; +union _xevan_4way_context_overlay +{ + blake512_4way_context blake; bmw512_4way_context bmw; hashState_groestl groestl; skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; - cubehashParam cube; - sph_shavite512_context shavite; + cube_2way_context cube; + shavite512_2way_context shavite; simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; @@ -42,39 +44,8 @@ typedef struct { sph_whirlpool_context whirlpool; sha512_4way_context sha512; haval256_5_4way_context haval; -} xevan_4way_ctx_holder; - -xevan_4way_ctx_holder xevan_4way_ctx __attribute__ ((aligned (64))); -static __thread blake512_4way_context xevan_blake_4way_mid - __attribute__ ((aligned (64))); - -void init_xevan_4way_ctx() -{ - blake512_4way_init(&xevan_4way_ctx.blake); - bmw512_4way_init( &xevan_4way_ctx.bmw ); - init_groestl( &xevan_4way_ctx.groestl, 64 ); - skein512_4way_init(&xevan_4way_ctx.skein); - jh512_4way_init(&xevan_4way_ctx.jh); - keccak512_4way_init(&xevan_4way_ctx.keccak); - luffa_2way_init( &xevan_4way_ctx.luffa, 512 ); - cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &xevan_4way_ctx.shavite ); - simd_2way_init( &xevan_4way_ctx.simd, 512 ); - 
init_echo( &xevan_4way_ctx.echo, 512 ); - hamsi512_4way_init( &xevan_4way_ctx.hamsi ); - sph_fugue512_init( &xevan_4way_ctx.fugue ); - shabal512_4way_init( &xevan_4way_ctx.shabal ); - sph_whirlpool_init( &xevan_4way_ctx.whirlpool ); - sha512_4way_init( &xevan_4way_ctx.sha512 ); - haval256_5_4way_init( &xevan_4way_ctx.haval ); }; - -void xevan_4way_blake512_midstate( const void* input ) -{ - memcpy( &xevan_blake_4way_mid, &xevan_4way_ctx.blake, - sizeof(xevan_blake_4way_mid) ); - blake512_4way( &xevan_blake_4way_mid, input, 64 ); -} +typedef union _xevan_4way_context_overlay xevan_4way_context_overlay; void xevan_4way_hash( void *output, const void *input ) { @@ -83,293 +54,283 @@ void xevan_4way_hash( void *output, const void *input ) uint64_t hash2[16] __attribute__ ((aligned (64))); uint64_t hash3[16] __attribute__ ((aligned (64))); uint64_t vhash[16<<2] __attribute__ ((aligned (64))); - uint64_t vhash32[16<<2] __attribute__ ((aligned (64))); + uint64_t vhashA[16<<2] __attribute__ ((aligned (64))); + uint64_t vhashB[16<<2] __attribute__ ((aligned (64))); const int dataLen = 128; - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - xevan_4way_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) ); + xevan_4way_context_overlay ctx __attribute__ ((aligned (64))); - // parallel way - memcpy( &ctx.blake, &xevan_blake_4way_mid, - sizeof(xevan_blake_4way_mid) ); - blake512_4way( &ctx.blake, input + (midlen<<2), tail ); + // parallel 4 way + + blake512_4way_init( &ctx.blake ); + blake512_4way( &ctx.blake, input, 80 ); blake512_4way_close(&ctx.blake, vhash); memset( &vhash[8<<2], 0, 64<<2 ); + bmw512_4way_init( &ctx.bmw ); bmw512_4way( &ctx.bmw, vhash, dataLen ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 ); - memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) ); + init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 ); - memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) ); + init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 ); - memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) ); + init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 ); // Parallel 4way mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + skein512_4way_init( &ctx.skein ); skein512_4way( &ctx.skein, vhash, dataLen ); skein512_4way_close( &ctx.skein, vhash ); + jh512_4way_init( &ctx.jh ); jh512_4way( &ctx.jh, vhash, dataLen ); jh512_4way_close( &ctx.jh, vhash ); + keccak512_4way_init( &ctx.keccak ); keccak512_4way( &ctx.keccak, vhash, dataLen ); keccak512_4way_close( &ctx.keccak, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); - mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); - mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); + mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); - mm256_deinterleave_2x128( 
hash2, hash3, vhash, dataLen<<3 ); + luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, - dataLen ); - memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, - dataLen ); - memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, - dataLen ); - memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, - dataLen ); + cube_2way_init( &ctx.cube, 512, 16, 32 ); + cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen ); + cube_2way_init( &ctx.cube, 512, 16, 32 ); + cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); - sph_shavite512( &ctx.shavite, hash0, dataLen ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, dataLen ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash2, dataLen ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, dataLen ); - sph_shavite512_close( &ctx.shavite, hash3 ); + shavite512_2way_init( &ctx.shavite ); + shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen ); + shavite512_2way_init( &ctx.shavite ); + shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen ); - mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); - mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); - mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); - mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 ); + simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); + mm256_deinterleave_1x128( hash0, hash1, vhashA, dataLen<<3 ); + mm256_deinterleave_1x128( hash2, hash3, vhashB, dataLen<<3 ); + + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, dataLen<<3 ); - memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash1, (const BitSequence *) hash1, dataLen<<3 ); - memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash2, (const BitSequence *) hash2, dataLen<<3 ); - memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, dataLen<<3 ); // Parallel mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + sph_fugue512_init( 
&ctx.fugue ); sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512_close( &ctx.fugue, hash0 ); - memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash1, dataLen ); sph_fugue512_close( &ctx.fugue, hash1 ); - memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash2, dataLen ); sph_fugue512_close( &ctx.fugue, hash2 ); - memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash3, dataLen ); sph_fugue512_close( &ctx.fugue, hash3 ); // Parallel 4way 32 bit mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + + shabal512_4way_init( &ctx.shabal ); shabal512_4way( &ctx.shabal, vhash, dataLen ); shabal512_4way_close( &ctx.shabal, vhash ); + mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); // Serial + sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); - memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool, - sizeof(sph_whirlpool_context) ); + sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); sph_whirlpool_close( &ctx.whirlpool, hash1 ); - memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool, - sizeof(sph_whirlpool_context) ); + sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); sph_whirlpool_close( &ctx.whirlpool, hash2 ); - memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool, - sizeof(sph_whirlpool_context) ); + sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + + sha512_4way_init( &ctx.sha512 ); sha512_4way( &ctx.sha512, vhash, dataLen ); sha512_4way_close( &ctx.sha512, vhash ); - mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 ); - haval256_5_4way( &ctx.haval, vhash32, dataLen ); - haval256_5_4way_close( &ctx.haval, vhash ); - mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + mm256_reinterleave_4x64_4x32( vhashA, vhash, dataLen<<3 ); + + haval256_5_4way_init( &ctx.haval ); + haval256_5_4way( &ctx.haval, vhashA, dataLen ); + haval256_5_4way_close( &ctx.haval, vhashA ); + + mm256_reinterleave_4x32_4x64( vhash, vhashA, dataLen<<3 ); - mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 ); - memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) ); + blake512_4way_init( &ctx.blake ); blake512_4way( &ctx.blake, vhash, dataLen ); blake512_4way_close(&ctx.blake, vhash); + bmw512_4way_init( &ctx.bmw ); bmw512_4way( &ctx.bmw, vhash, dataLen ); bmw512_4way_close( &ctx.bmw, vhash ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 ); - memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) ); + init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 ); - memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) ); + init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 ); - memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, 
sizeof(hashState_groestl) ); + init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + skein512_4way_init( &ctx.skein ); skein512_4way( &ctx.skein, vhash, dataLen ); skein512_4way_close( &ctx.skein, vhash ); + jh512_4way_init( &ctx.jh ); jh512_4way( &ctx.jh, vhash, dataLen ); jh512_4way_close( &ctx.jh, vhash ); + keccak512_4way_init( &ctx.keccak ); keccak512_4way( &ctx.keccak, vhash, dataLen ); keccak512_4way_close( &ctx.keccak, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); - mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); - mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); + mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); - mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 ); + luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, - dataLen ); - memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, - dataLen ); - memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, - dataLen ); - memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, - dataLen ); + cube_2way_init( &ctx.cube, 512, 16, 32 ); + cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen ); + cube_2way_init( &ctx.cube, 512, 16, 32 ); + cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); - sph_shavite512( &ctx.shavite, hash0, dataLen ); - sph_shavite512_close( &ctx.shavite, hash0 ); - memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash1, dataLen ); - sph_shavite512_close( &ctx.shavite, hash1 ); - memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash2, dataLen ); - sph_shavite512_close( &ctx.shavite, hash2 ); - memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, - sizeof(sph_shavite512_context) ); - sph_shavite512( &ctx.shavite, hash3, dataLen ); - sph_shavite512_close( &ctx.shavite, hash3 ); + shavite512_2way_init( &ctx.shavite ); + shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen ); + shavite512_2way_init( &ctx.shavite ); + shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen ); - mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); - mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); - mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); - mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 ); + simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); + mm256_deinterleave_1x128( hash0, hash1, 
vhashA, dataLen<<3 ); + mm256_deinterleave_1x128( hash2, hash3, vhashB, dataLen<<3 ); + + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, dataLen<<3 ); - memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash1, (const BitSequence *) hash1, dataLen<<3 ); - memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash2, (const BitSequence *) hash2, dataLen<<3 ); - memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, dataLen<<3 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512_close( &ctx.fugue, hash0 ); - memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash1, dataLen ); sph_fugue512_close( &ctx.fugue, hash1 ); - memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash2, dataLen ); sph_fugue512_close( &ctx.fugue, hash2 ); - memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash3, dataLen ); sph_fugue512_close( &ctx.fugue, hash3 ); mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + + shabal512_4way_init( &ctx.shabal ); shabal512_4way( &ctx.shabal, vhash, dataLen ); shabal512_4way_close( &ctx.shabal, vhash ); + mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); - memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool, - sizeof(sph_whirlpool_context) ); + sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); sph_whirlpool_close( &ctx.whirlpool, hash1 ); - memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool, - sizeof(sph_whirlpool_context) ); + sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); sph_whirlpool_close( &ctx.whirlpool, hash2 ); - memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool, - sizeof(sph_whirlpool_context) ); + sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + + sha512_4way_init( &ctx.sha512 ); sha512_4way( &ctx.sha512, vhash, dataLen ); sha512_4way_close( &ctx.sha512, vhash ); - mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 ); - haval256_5_4way( &ctx.haval, vhash32, dataLen ); + mm256_reinterleave_4x64_4x32( vhashA, vhash, dataLen<<3 ); + + haval256_5_4way_init( &ctx.haval ); + haval256_5_4way( &ctx.haval, vhashA, dataLen ); haval256_5_4way_close( &ctx.haval, output ); } int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[4*8] __attribute__ 
((aligned (64))); uint32_t *hash7 = &(hash[7<<2]); @@ -378,30 +339,26 @@ int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce, uint32_t _ALIGN(64) endiandata[20]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated + __m256i *noncev = (__m256i*)vdata + 9; // aligned const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - uint32_t *nonces = work->nonces; - int num_found = 0; - uint32_t *noncep = vdata + 73; // 9*8 + 1 if ( opt_benchmark ) ptarget[7] = 0x0cff; - for ( int k=0; k < 19; k++ ) - be32enc( &endiandata[k], pdata[k] ); - uint64_t *edata = (uint64_t*)endiandata; + + casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); + casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); + casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); - xevan_4way_blake512_midstate( vdata ); - do { - be32enc( noncep, n ); - be32enc( noncep+2, n+1 ); - be32enc( noncep+4, n+2 ); - be32enc( noncep+6, n+3 ); + *noncev = mm256_interleave_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), *noncev ); xevan_4way_hash( hash, vdata ); for ( int lane = 0; lane < 4; lane++ ) @@ -411,15 +368,20 @@ int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce, if ( fulltest( lane_hash, ptarget ) ) { pdata[19] = n + lane; - nonces[ num_found++ ] = n + lane; work_set_target_ratio( work, lane_hash ); + if ( submit_work( mythr, work ) ) + applog( LOG_NOTICE, + "Share %d submitted by thread %d, lane %d.", + accepted_share_count + rejected_share_count + 1, + thr_id, lane ); + else + applog( LOG_WARNING, "Failed to submit share." 
); } } n += 4; - } while ( ( num_found == 0 ) && ( n < max_nonce ) - && !work_restart[thr_id].restart ); + } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart ); *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c index d5f4aba..1f9cd8e 100644 --- a/algo/x17/xevan-gate.c +++ b/algo/x17/xevan-gate.c @@ -8,7 +8,7 @@ void xevan_set_target( struct work* work, double job_diff ) bool register_xevan_algo( algo_gate_t* gate ) { #if defined (XEVAN_4WAY) - init_xevan_4way_ctx(); +// init_xevan_4way_ctx(); gate->scanhash = (void*)&scanhash_xevan_4way; gate->hash = (void*)&xevan_4way_hash; #else diff --git a/algo/x17/xevan-gate.h b/algo/x17/xevan-gate.h index 650b4f1..f4bc38f 100644 --- a/algo/x17/xevan-gate.h +++ b/algo/x17/xevan-gate.h @@ -15,16 +15,16 @@ bool register_xevan_algo( algo_gate_t* gate ); void xevan_4way_hash( void *state, const void *input ); int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); -void init_xevan_4way_ctx(); +//void init_xevan_4way_ctx(); #endif void xevan_hash( void *state, const void *input ); int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ); + uint64_t *hashes_done, struct thr_info *mythr ); void init_xevan_ctx(); diff --git a/algo/x17/xevan.c b/algo/x17/xevan.c index b35c657..b45734a 100644 --- a/algo/x17/xevan.c +++ b/algo/x17/xevan.c @@ -230,12 +230,14 @@ void xevan_hash(void *output, const void *input) memcpy(output, hash, 32); } -int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done) +int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t _ALIGN(64) hash[8]; uint32_t _ALIGN(64) endiandata[20]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; diff --git a/algo/yescrypt/sha256_Y.c b/algo/yescrypt/sha256_Y.c index 62265bf..7b778ed 100644 --- a/algo/yescrypt/sha256_Y.c +++ b/algo/yescrypt/sha256_Y.c @@ -290,7 +290,7 @@ SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx) /* Initialize an HMAC-SHA256 operation with the given key. */ void -HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen) +HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen) { unsigned char pad[64]; unsigned char khash[32]; @@ -326,7 +326,7 @@ HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen) /* Add bytes to the HMAC-SHA256 operation. */ void -HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len) +HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len) { /* Feed data to the inner SHA256 operation. */ @@ -335,7 +335,7 @@ HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len) /* Finish an HMAC-SHA256 operation. 
*/ void -HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx) +HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx) { unsigned char ihash[32]; @@ -361,7 +361,7 @@ void PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) { - HMAC_SHA256_CTX PShctx, hctx; + HMAC_SHA256_CTX_Y PShctx, hctx; uint8_t _ALIGN(128) T[32]; uint8_t _ALIGN(128) U[32]; uint8_t ivec[4]; @@ -370,8 +370,8 @@ PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, int k; /* Compute HMAC state after processing P and S. */ - HMAC_SHA256_Init(&PShctx, passwd, passwdlen); - HMAC_SHA256_Update(&PShctx, salt, saltlen); + HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen); + HMAC_SHA256_Update_Y(&PShctx, salt, saltlen); /* Iterate through the blocks. */ for (i = 0; i * 32 < dkLen; i++) { @@ -379,18 +379,18 @@ PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, be32enc(ivec, (uint32_t)(i + 1)); /* Compute U_1 = PRF(P, S || INT(i)). */ - memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX)); - HMAC_SHA256_Update(&hctx, ivec, 4); - HMAC_SHA256_Final(U, &hctx); + memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y)); + HMAC_SHA256_Update_Y(&hctx, ivec, 4); + HMAC_SHA256_Final_Y(U, &hctx); /* T_i = U_1 ... */ memcpy(T, U, 32); for (j = 2; j <= c; j++) { /* Compute U_j. */ - HMAC_SHA256_Init(&hctx, passwd, passwdlen); - HMAC_SHA256_Update(&hctx, U, 32); - HMAC_SHA256_Final(U, &hctx); + HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen); + HMAC_SHA256_Update_Y(&hctx, U, 32); + HMAC_SHA256_Final_Y(U, &hctx); /* ... xor U_j ... */ for (k = 0; k < 32; k++) diff --git a/algo/yescrypt/sha256_Y.h b/algo/yescrypt/sha256_Y.h index 4912e43..703d059 100644 --- a/algo/yescrypt/sha256_Y.h +++ b/algo/yescrypt/sha256_Y.h @@ -49,14 +49,14 @@ typedef struct HMAC_SHA256Context { typedef struct HMAC_SHA256Context { SHA256_CTX ictx; SHA256_CTX octx; -} HMAC_SHA256_CTX; +} HMAC_SHA256_CTX_Y; void SHA256_Init_Y(SHA256_CTX_Y *); void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t); void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *); -void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t); -void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t); -void HMAC_SHA256_Final(unsigned char [32], HMAC_SHA256_CTX *); +void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); +void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); +void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *); /** * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): diff --git a/algo/yescrypt/yescrypt-simd.c b/algo/yescrypt/yescrypt-simd.c index edecb60..e70c37e 100644 --- a/algo/yescrypt/yescrypt-simd.c +++ b/algo/yescrypt/yescrypt-simd.c @@ -1354,14 +1354,14 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, if ((t || flags) && buflen == sizeof(sha256)) { /* Compute ClientKey */ { - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init(&ctx, buf, buflen); + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); if ( yescrypt_client_key ) - HMAC_SHA256_Update( &ctx, (uint8_t*)yescrypt_client_key, + HMAC_SHA256_Update_Y( &ctx, (uint8_t*)yescrypt_client_key, yescrypt_client_key_len ); else - HMAC_SHA256_Update( &ctx, salt, saltlen ); - HMAC_SHA256_Final(sha256, &ctx); + HMAC_SHA256_Update_Y( &ctx, salt, saltlen ); + HMAC_SHA256_Final_Y(sha256, &ctx); } /* Compute StoredKey */ { diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c index 
828c8f3..4ec4536 100644 --- a/algo/yescrypt/yescrypt.c +++ b/algo/yescrypt/yescrypt.c @@ -383,7 +383,7 @@ void yescrypthash(void *output, const void *input) } int scanhash_yescrypt( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t _ALIGN(64) vhash[8]; uint32_t _ALIGN(64) endiandata[20]; @@ -393,6 +393,7 @@ int scanhash_yescrypt( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated for (int k = 0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); diff --git a/algo/yespower/sha256-avx2.c b/algo/yespower/sha256-avx2.c deleted file mode 100644 index ee2d5d2..0000000 --- a/algo/yespower/sha256-avx2.c +++ /dev/null @@ -1,646 +0,0 @@ -/*- - * Copyright 2005-2016 Colin Percival - * Copyright 2016-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -#include -#include - -#include "insecure_memzero.h" -#include "sysendian.h" - -#include "sha256.h" - -#ifdef __ICC -/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */ -#define restrict -#elif __STDC_VERSION__ >= 199901L -/* Have restrict */ -#elif defined(__GNUC__) -#define restrict __restrict -#else -#define restrict -#endif - -/* - * Encode a length len*2 vector of (uint32_t) into a length len*8 vector of - * (uint8_t) in big-endian form. - */ -static void -be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len) -{ - - /* Encode vector, two words at a time. */ - do { - be32enc(&dst[0], src[0]); - be32enc(&dst[4], src[1]); - src += 2; - dst += 8; - } while (--len); -} - -/* - * Decode a big-endian length len*8 vector of (uint8_t) into a length - * len*2 vector of (uint32_t). - */ -static void -be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len) -{ - - /* Decode vector, two words at a time. */ - do { - dst[0] = be32dec(&src[0]); - dst[1] = be32dec(&src[4]); - src += 8; - dst += 2; - } while (--len); -} - -/* SHA256 round constants. 
*/ -static const uint32_t Krnd[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -/* Elementary functions used by SHA256 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define SHR(x, n) (x >> n) -#define ROTR(x, n) ((x >> n) | (x << (32 - n))) -#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) -#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) - -/* SHA256 round function */ -#define RND(a, b, c, d, e, f, g, h, k) \ - h += S1(e) + Ch(e, f, g) + k; \ - d += h; \ - h += S0(a) + Maj(a, b, c); - -/* Adjusted round function for rotating state */ -#define RNDr(S, W, i, ii) \ - RND(S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(66 - i) % 8], S[(67 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i + ii] + Krnd[i + ii]) - -/* Message schedule computation */ -#define MSCH(W, ii, i) \ - W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii] - -/* - * SHA256 block compression function. The 256-bit state is transformed via - * the 512-bit input block to produce a new state. - */ -static void -SHA256_Transform(uint32_t state[static restrict 8], - const uint8_t block[static restrict 64], - uint32_t W[static restrict 64], uint32_t S[static restrict 8]) -{ - int i; - - /* 1. Prepare the first part of the message schedule W. */ - be32dec_vect(W, block, 8); - - /* 2. Initialize working variables. */ - memcpy(S, state, 32); - - /* 3. Mix. */ - for (i = 0; i < 64; i += 16) { - RNDr(S, W, 0, i); - RNDr(S, W, 1, i); - RNDr(S, W, 2, i); - RNDr(S, W, 3, i); - RNDr(S, W, 4, i); - RNDr(S, W, 5, i); - RNDr(S, W, 6, i); - RNDr(S, W, 7, i); - RNDr(S, W, 8, i); - RNDr(S, W, 9, i); - RNDr(S, W, 10, i); - RNDr(S, W, 11, i); - RNDr(S, W, 12, i); - RNDr(S, W, 13, i); - RNDr(S, W, 14, i); - RNDr(S, W, 15, i); - - if (i == 48) - break; - MSCH(W, 0, i); - MSCH(W, 1, i); - MSCH(W, 2, i); - MSCH(W, 3, i); - MSCH(W, 4, i); - MSCH(W, 5, i); - MSCH(W, 6, i); - MSCH(W, 7, i); - MSCH(W, 8, i); - MSCH(W, 9, i); - MSCH(W, 10, i); - MSCH(W, 11, i); - MSCH(W, 12, i); - MSCH(W, 13, i); - MSCH(W, 14, i); - MSCH(W, 15, i); - } - - /* 4. Mix local working variables into global state. */ - state[0] += S[0]; - state[1] += S[1]; - state[2] += S[2]; - state[3] += S[3]; - state[4] += S[4]; - state[5] += S[5]; - state[6] += S[6]; - state[7] += S[7]; -} - -static const uint8_t PAD[64] = { - 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -/* Add padding and terminating bit-count. 
*/ -static void -SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72]) -{ - size_t r; - - /* Figure out how many bytes we have buffered. */ - r = (ctx->count >> 3) & 0x3f; - - /* Pad to 56 mod 64, transforming if we finish a block en route. */ - if (r < 56) { - /* Pad to 56 mod 64. */ - memcpy(&ctx->buf[r], PAD, 56 - r); - } else { - /* Finish the current block and mix. */ - memcpy(&ctx->buf[r], PAD, 64 - r); - SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); - - /* The start of the final block is all zeroes. */ - memset(&ctx->buf[0], 0, 56); - } - - /* Add the terminating bit-count. */ - be64enc(&ctx->buf[56], ctx->count); - - /* Mix in the final block. */ - SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); -} - -/* Magic initialization constants. */ -static const uint32_t initial_state[8] = { - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 -}; - -/** - * SHA256_Init(ctx): - * Initialize the SHA256 context ${ctx}. - */ -void -SHA256_Init(SHA256_CTX * ctx) -{ - - /* Zero bits processed so far. */ - ctx->count = 0; - - /* Initialize state. */ - memcpy(ctx->state, initial_state, sizeof(initial_state)); -} - -/** - * SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. - */ -static void -_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len, - uint32_t tmp32[static restrict 72]) -{ - uint32_t r; - const uint8_t * src = in; - - /* Return immediately if we have nothing to do. */ - if (len == 0) - return; - - /* Number of bytes left in the buffer from previous updates. */ - r = (ctx->count >> 3) & 0x3f; - - /* Update number of bits. */ - ctx->count += (uint64_t)(len) << 3; - - /* Handle the case where we don't need to perform any transforms. */ - if (len < 64 - r) { - memcpy(&ctx->buf[r], src, len); - return; - } - - /* Finish the current block. */ - memcpy(&ctx->buf[r], src, 64 - r); - SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); - src += 64 - r; - len -= 64 - r; - - /* Perform complete blocks. */ - while (len >= 64) { - SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]); - src += 64; - len -= 64; - } - - /* Copy left over data into buffer. */ - memcpy(ctx->buf, src, len); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len) -{ - uint32_t tmp32[72]; - - /* Call the real function. */ - _SHA256_Update(ctx, in, len, tmp32); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); -} - -/** - * SHA256_Final(digest, ctx): - * Output the SHA256 hash of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -static void -_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx, - uint32_t tmp32[static restrict 72]) -{ - - /* Add padding. */ - SHA256_Pad(ctx, tmp32); - - /* Write the hash. */ - be32enc_vect(digest, ctx->state, 4); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx) -{ - uint32_t tmp32[72]; - - /* Call the real function. */ - _SHA256_Final(digest, ctx, tmp32); - - /* Clear the context state. */ - insecure_memzero(ctx, sizeof(SHA256_CTX)); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); -} - -/** - * SHA256_Buf(in, len, digest): - * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. 
- */ -void -SHA256_Buf(const void * in, size_t len, uint8_t digest[32]) -{ - SHA256_CTX ctx; - uint32_t tmp32[72]; - - SHA256_Init(&ctx); - _SHA256_Update(&ctx, in, len, tmp32); - _SHA256_Final(digest, &ctx, tmp32); - - /* Clean the stack. */ - insecure_memzero(&ctx, sizeof(SHA256_CTX)); - insecure_memzero(tmp32, 288); -} - -/** - * HMAC_SHA256_Init(ctx, K, Klen): - * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from - * ${K}. - */ -static void -_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen, - uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64], - uint8_t khash[static restrict 32]) -{ - const uint8_t * K = _K; - size_t i; - - /* If Klen > 64, the key is really SHA256(K). */ - if (Klen > 64) { - SHA256_Init(&ctx->ictx); - _SHA256_Update(&ctx->ictx, K, Klen, tmp32); - _SHA256_Final(khash, &ctx->ictx, tmp32); - K = khash; - Klen = 32; - } - - /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ - SHA256_Init(&ctx->ictx); - memset(pad, 0x36, 64); - for (i = 0; i < Klen; i++) - pad[i] ^= K[i]; - _SHA256_Update(&ctx->ictx, pad, 64, tmp32); - - /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ - SHA256_Init(&ctx->octx); - memset(pad, 0x5c, 64); - for (i = 0; i < Klen; i++) - pad[i] ^= K[i]; - _SHA256_Update(&ctx->octx, pad, 64, tmp32); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen) -{ - uint32_t tmp32[72]; - uint8_t pad[64]; - uint8_t khash[32]; - - /* Call the real function. */ - _HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); - insecure_memzero(khash, 32); - insecure_memzero(pad, 64); -} - -/** - * HMAC_SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. - */ -static void -_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len, - uint32_t tmp32[static restrict 72]) -{ - - /* Feed data to the inner SHA256 operation. */ - _SHA256_Update(&ctx->ictx, in, len, tmp32); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len) -{ - uint32_t tmp32[72]; - - /* Call the real function. */ - _HMAC_SHA256_Update(ctx, in, len, tmp32); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); -} - -/** - * HMAC_SHA256_Final(digest, ctx): - * Output the HMAC-SHA256 of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -static void -_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx, - uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32]) -{ - - /* Finish the inner SHA256 operation. */ - _SHA256_Final(ihash, &ctx->ictx, tmp32); - - /* Feed the inner hash to the outer SHA256 operation. */ - _SHA256_Update(&ctx->octx, ihash, 32, tmp32); - - /* Finish the outer SHA256 operation. */ - _SHA256_Final(digest, &ctx->octx, tmp32); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx) -{ - uint32_t tmp32[72]; - uint8_t ihash[32]; - - /* Call the real function. */ - _HMAC_SHA256_Final(digest, ctx, tmp32, ihash); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); - insecure_memzero(ihash, 32); -} - -/** - * HMAC_SHA256_Buf(K, Klen, in, len, digest): - * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of - * length ${Klen}, and write the result to ${digest}. 
- */ -void -HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len, - uint8_t digest[32]) -{ - HMAC_SHA256_CTX ctx; - uint32_t tmp32[72]; - uint8_t tmp8[96]; - - _HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]); - _HMAC_SHA256_Update(&ctx, in, len, tmp32); - _HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]); - - /* Clean the stack. */ - insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(tmp32, 288); - insecure_memzero(tmp8, 96); -} - -/* Add padding and terminating bit-count, but don't invoke Transform yet. */ -static int -SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8], - uint32_t tmp32[static restrict 72]) -{ - uint32_t r; - - r = (ctx->count >> 3) & 0x3f; - if (r >= 56) - return -1; - - /* - * Convert length to a vector of bytes -- we do this now rather - * than later because the length will change after we pad. - */ - be64enc(len, ctx->count); - - /* Add 1--56 bytes so that the resulting length is 56 mod 64. */ - _SHA256_Update(ctx, PAD, 56 - r, tmp32); - - /* Add the terminating bit-count. */ - ctx->buf[63] = len[7]; - _SHA256_Update(ctx, len, 7, tmp32); - - return 0; -} - -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). - */ -void -PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, - size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) -{ - HMAC_SHA256_CTX Phctx, PShctx, hctx; - uint32_t tmp32[72]; - union { - uint8_t tmp8[96]; - uint32_t state[8]; - } u; - size_t i; - uint8_t ivec[4]; - uint8_t U[32]; - uint8_t T[32]; - uint64_t j; - int k; - size_t clen; - - /* Sanity-check. */ - assert(dkLen <= 32 * (size_t)(UINT32_MAX)); - - if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) { - uint32_t oldcount; - uint8_t * ivecp; - - /* Compute HMAC state after processing P and S. */ - _HMAC_SHA256_Init(&hctx, passwd, passwdlen, - tmp32, &u.tmp8[0], &u.tmp8[64]); - _HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32); - - /* Prepare ictx padding. */ - oldcount = hctx.ictx.count & (0x3f << 3); - _HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32); - if ((hctx.ictx.count & (0x3f << 3)) < oldcount || - SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32)) - goto generic; /* Can't happen due to saltlen check */ - ivecp = hctx.ictx.buf + (oldcount >> 3); - - /* Prepare octx padding. */ - hctx.octx.count += 32 << 3; - SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32); - - /* Iterate through the blocks. */ - for (i = 0; i * 32 < dkLen; i++) { - /* Generate INT(i + 1). */ - be32enc(ivecp, (uint32_t)(i + 1)); - - /* Compute U_1 = PRF(P, S || INT(i)). */ - memcpy(u.state, hctx.ictx.state, sizeof(u.state)); - SHA256_Transform(u.state, hctx.ictx.buf, - &tmp32[0], &tmp32[64]); - be32enc_vect(hctx.octx.buf, u.state, 4); - memcpy(u.state, hctx.octx.state, sizeof(u.state)); - SHA256_Transform(u.state, hctx.octx.buf, - &tmp32[0], &tmp32[64]); - be32enc_vect(&buf[i * 32], u.state, 4); - } - - goto cleanup; - } - -generic: - /* Compute HMAC state after processing P. */ - _HMAC_SHA256_Init(&Phctx, passwd, passwdlen, - tmp32, &u.tmp8[0], &u.tmp8[64]); - - /* Compute HMAC state after processing P and S. */ - memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX)); - _HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32); - - /* Iterate through the blocks. */ - for (i = 0; i * 32 < dkLen; i++) { - /* Generate INT(i + 1). 
*/ - be32enc(ivec, (uint32_t)(i + 1)); - - /* Compute U_1 = PRF(P, S || INT(i)). */ - memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX)); - _HMAC_SHA256_Update(&hctx, ivec, 4, tmp32); - _HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8); - - if (c > 1) { - /* T_i = U_1 ... */ - memcpy(U, T, 32); - - for (j = 2; j <= c; j++) { - /* Compute U_j. */ - memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX)); - _HMAC_SHA256_Update(&hctx, U, 32, tmp32); - _HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8); - - /* ... xor U_j ... */ - for (k = 0; k < 32; k++) - T[k] ^= U[k]; - } - } - - /* Copy as many bytes as necessary into buf. */ - clen = dkLen - i * 32; - if (clen > 32) - clen = 32; - memcpy(&buf[i * 32], T, clen); - } - - /* Clean the stack. */ - insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(U, 32); - insecure_memzero(T, 32); - -cleanup: - insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(tmp32, 288); - insecure_memzero(&u, sizeof(u)); -} diff --git a/algo/yespower/sha256.c b/algo/yespower/sha256.c deleted file mode 100644 index 63ca998..0000000 --- a/algo/yespower/sha256.c +++ /dev/null @@ -1,680 +0,0 @@ -/*- - * Copyright 2005-2016 Colin Percival - * Copyright 2016-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -#include -#include - -#include "insecure_memzero.h" -#include "sysendian.h" - -#include "sha256.h" -#include "avxdefs.h" - -#ifdef __ICC -/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */ -#define restrict -#elif __STDC_VERSION__ >= 199901L -/* Have restrict */ -#elif defined(__GNUC__) -#define restrict __restrict -#else -#define restrict -#endif - -/* - * Encode a length len*2 vector of (uint32_t) into a length len*8 vector of - * (uint8_t) in big-endian form. - */ -static void -be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len) -{ - - /* Encode vector, two words at a time. */ - do { - be32enc(&dst[0], src[0]); - be32enc(&dst[4], src[1]); - src += 2; - dst += 8; - } while (--len); -} - -/* - * Decode a big-endian length len*8 vector of (uint8_t) into a length - * len*2 vector of (uint32_t). 
- */ -static void -be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len) -{ - - /* Decode vector, two words at a time. */ - do { - dst[0] = be32dec(&src[0]); - dst[1] = be32dec(&src[4]); - src += 8; - dst += 2; - } while (--len); -} - -/* SHA256 round constants. */ -static const uint32_t Krnd[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -/* Elementary functions used by SHA256 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define SHR(x, n) (x >> n) -#define ROTR(x, n) ((x >> n) | (x << (32 - n))) -#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) -#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) - -#if 0 //defined(__SHA__) - -// ABEF = _mm_sha256rnds2_epu32( CDGH, ABEF, k ) -//_mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k) -// b = { ABEF } a = { CDGH } -// -//a = _mm_set_epi32( S[(66 - i) % 8], S[(67 - i) % 8], -// S[(70 - i) % 8], S[(71 - i) % 8] ); -//b = _mm_set_epi32( S[(64 - i) % 8], S[(65 - i) % 8], -// S[(68 - i) % 8], S[(69 - i) % 8] ); -//k = _mm_set1_epi32( W[i + ii] + Krnd[i + ii] ) -// _mm_sha256rnds2_epu32(a,b,k) - -#define RNDr( S, W, i, ii ) do \ -{ \ -uint32_t abef[4]; \ - __m128i ABEF = _mm_set_epi32( S[(66 - i) % 8], S[(67 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8] ); \ - __m128i CDGH = _mm_set_epi32( S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8] ); \ - __m128i K = _mm_set1_epi32( W[i + ii] + Krnd[i + ii] ); \ - casti_m128i( abef, 0 ) = _mm_sha256rnds2_epu32( CDGH, ABEF, K ); \ - S[(66 - i) % 8] = abef[3]; \ - S[(67 - i) % 8] = abef[2]; \ - S[(64 - i) % 8] = abef[1]; \ - S[(65 - i) % 8] = abef[0]; \ -} while(0) - -#else - -/* SHA256 round function */ - -#define RND(a, b, c, d, e, f, g, h, k) \ - h += S1(e) + Ch(e, f, g) + k; \ - d += h; \ - h += S0(a) + Maj(a, b, c); - -/* Adjusted round function for rotating state */ -#define RNDr(S, W, i, ii) \ - RND(S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(66 - i) % 8], S[(67 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i + ii] + Krnd[i + ii]) - -#endif - -/* Message schedule computation */ -#define MSCH(W, ii, i) \ - W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii] - -/* - * SHA256 block compression function. The 256-bit state is transformed via - * the 512-bit input block to produce a new state. - */ -static void -SHA256_Transform(uint32_t state[static restrict 8], - const uint8_t block[static restrict 64], - uint32_t W[static restrict 64], uint32_t S[static restrict 8]) -{ - int i; - - /* 1. Prepare the first part of the message schedule W. */ - be32dec_vect(W, block, 8); - - /* 2. 
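/*
 * Editorial note (not part of the patch): RNDr() avoids shuffling the eight
 * working variables every round by renaming them with modular indices.
 * Expanding the macro for the first two rounds of a 16-round group shows the
 * rotation:
 *     i = 0: RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7], W[ii+0]+Krnd[ii+0])
 *     i = 1: RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6], W[ii+1]+Krnd[ii+1])
 * so the slot that held "h" in round 0 becomes "a" in round 1, and so on.
 */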
Initialize working variables. */ - memcpy(S, state, 32); - - /* 3. Mix. */ - for (i = 0; i < 64; i += 16) { - RNDr(S, W, 0, i); - RNDr(S, W, 1, i); - RNDr(S, W, 2, i); - RNDr(S, W, 3, i); - RNDr(S, W, 4, i); - RNDr(S, W, 5, i); - RNDr(S, W, 6, i); - RNDr(S, W, 7, i); - RNDr(S, W, 8, i); - RNDr(S, W, 9, i); - RNDr(S, W, 10, i); - RNDr(S, W, 11, i); - RNDr(S, W, 12, i); - RNDr(S, W, 13, i); - RNDr(S, W, 14, i); - RNDr(S, W, 15, i); - - if (i == 48) - break; - MSCH(W, 0, i); - MSCH(W, 1, i); - MSCH(W, 2, i); - MSCH(W, 3, i); - MSCH(W, 4, i); - MSCH(W, 5, i); - MSCH(W, 6, i); - MSCH(W, 7, i); - MSCH(W, 8, i); - MSCH(W, 9, i); - MSCH(W, 10, i); - MSCH(W, 11, i); - MSCH(W, 12, i); - MSCH(W, 13, i); - MSCH(W, 14, i); - MSCH(W, 15, i); - } - - /* 4. Mix local working variables into global state. */ - state[0] += S[0]; - state[1] += S[1]; - state[2] += S[2]; - state[3] += S[3]; - state[4] += S[4]; - state[5] += S[5]; - state[6] += S[6]; - state[7] += S[7]; -} - -static const uint8_t PAD[64] = { - 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -/* Add padding and terminating bit-count. */ -static void -SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72]) -{ - size_t r; - - /* Figure out how many bytes we have buffered. */ - r = (ctx->count >> 3) & 0x3f; - - /* Pad to 56 mod 64, transforming if we finish a block en route. */ - if (r < 56) { - /* Pad to 56 mod 64. */ - memcpy(&ctx->buf[r], PAD, 56 - r); - } else { - /* Finish the current block and mix. */ - memcpy(&ctx->buf[r], PAD, 64 - r); - SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); - - /* The start of the final block is all zeroes. */ - memset(&ctx->buf[0], 0, 56); - } - - /* Add the terminating bit-count. */ - be64enc(&ctx->buf[56], ctx->count); - - /* Mix in the final block. */ - SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); -} - -/* Magic initialization constants. */ -static const uint32_t initial_state[8] = { - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 -}; - -/** - * SHA256_Init(ctx): - * Initialize the SHA256 context ${ctx}. - */ -void -SHA256_Init(SHA256_CTX * ctx) -{ - - /* Zero bits processed so far. */ - ctx->count = 0; - - /* Initialize state. */ - memcpy(ctx->state, initial_state, sizeof(initial_state)); -} - -/** - * SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. - */ -static void -_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len, - uint32_t tmp32[static restrict 72]) -{ - uint32_t r; - const uint8_t * src = in; - - /* Return immediately if we have nothing to do. */ - if (len == 0) - return; - - /* Number of bytes left in the buffer from previous updates. */ - r = (ctx->count >> 3) & 0x3f; - - /* Update number of bits. */ - ctx->count += (uint64_t)(len) << 3; - - /* Handle the case where we don't need to perform any transforms. */ - if (len < 64 - r) { - memcpy(&ctx->buf[r], src, len); - return; - } - - /* Finish the current block. */ - memcpy(&ctx->buf[r], src, 64 - r); - SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); - src += 64 - r; - len -= 64 - r; - - /* Perform complete blocks. */ - while (len >= 64) { - SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]); - src += 64; - len -= 64; - } - - /* Copy left over data into buffer. 
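/*
 * Editorial example (not part of the patch) of the padding rule implemented
 * by SHA256_Pad() above: hashing the 3-byte message "abc" leaves r = 3
 * buffered bytes and ctx->count = 24 bits.  Padding appends 0x80 and 52 zero
 * bytes (reaching offset 56), then the 64-bit big-endian bit count
 * 0x0000000000000018, and one final SHA256_Transform() over that 64-byte
 * block yields the well-known test-vector digest
 *     ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad
 */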
*/ - memcpy(ctx->buf, src, len); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len) -{ - uint32_t tmp32[72]; - - /* Call the real function. */ - _SHA256_Update(ctx, in, len, tmp32); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); -} - -/** - * SHA256_Final(digest, ctx): - * Output the SHA256 hash of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -static void -_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx, - uint32_t tmp32[static restrict 72]) -{ - - /* Add padding. */ - SHA256_Pad(ctx, tmp32); - - /* Write the hash. */ - be32enc_vect(digest, ctx->state, 4); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx) -{ - uint32_t tmp32[72]; - - /* Call the real function. */ - _SHA256_Final(digest, ctx, tmp32); - - /* Clear the context state. */ - insecure_memzero(ctx, sizeof(SHA256_CTX)); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); -} - -/** - * SHA256_Buf(in, len, digest): - * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. - */ -void -SHA256_Buf(const void * in, size_t len, uint8_t digest[32]) -{ - SHA256_CTX ctx; - uint32_t tmp32[72]; - - SHA256_Init(&ctx); - _SHA256_Update(&ctx, in, len, tmp32); - _SHA256_Final(digest, &ctx, tmp32); - - /* Clean the stack. */ - insecure_memzero(&ctx, sizeof(SHA256_CTX)); - insecure_memzero(tmp32, 288); -} - -/** - * HMAC_SHA256_Init(ctx, K, Klen): - * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from - * ${K}. - */ -static void -_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen, - uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64], - uint8_t khash[static restrict 32]) -{ - const uint8_t * K = _K; - size_t i; - - /* If Klen > 64, the key is really SHA256(K). */ - if (Klen > 64) { - SHA256_Init(&ctx->ictx); - _SHA256_Update(&ctx->ictx, K, Klen, tmp32); - _SHA256_Final(khash, &ctx->ictx, tmp32); - K = khash; - Klen = 32; - } - - /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ - SHA256_Init(&ctx->ictx); - memset(pad, 0x36, 64); - for (i = 0; i < Klen; i++) - pad[i] ^= K[i]; - _SHA256_Update(&ctx->ictx, pad, 64, tmp32); - - /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ - SHA256_Init(&ctx->octx); - memset(pad, 0x5c, 64); - for (i = 0; i < Klen; i++) - pad[i] ^= K[i]; - _SHA256_Update(&ctx->octx, pad, 64, tmp32); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen) -{ - uint32_t tmp32[72]; - uint8_t pad[64]; - uint8_t khash[32]; - - /* Call the real function. */ - _HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); - insecure_memzero(khash, 32); - insecure_memzero(pad, 64); -} - -/** - * HMAC_SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. - */ -static void -_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len, - uint32_t tmp32[static restrict 72]) -{ - - /* Feed data to the inner SHA256 operation. */ - _SHA256_Update(&ctx->ictx, in, len, tmp32); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len) -{ - uint32_t tmp32[72]; - - /* Call the real function. 
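/*
 * Editorial sketch (not part of the patch) of the construction set up by
 * _HMAC_SHA256_Init() above: for a key K of at most 64 bytes,
 *     HMAC-SHA256(K, m) = SHA256((K ^ opad) || SHA256((K ^ ipad) || m))
 * with ipad = 0x36 repeated and opad = 0x5c repeated.  Keeping ictx and octx
 * as running hashes means the padded key blocks are absorbed only once, e.g.:
 *
 *     uint8_t mac[32];
 *     HMAC_SHA256_Buf("key", 3, "message", 7, mac);
 */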
*/ - _HMAC_SHA256_Update(ctx, in, len, tmp32); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); -} - -/** - * HMAC_SHA256_Final(digest, ctx): - * Output the HMAC-SHA256 of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -static void -_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx, - uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32]) -{ - - /* Finish the inner SHA256 operation. */ - _SHA256_Final(ihash, &ctx->ictx, tmp32); - - /* Feed the inner hash to the outer SHA256 operation. */ - _SHA256_Update(&ctx->octx, ihash, 32, tmp32); - - /* Finish the outer SHA256 operation. */ - _SHA256_Final(digest, &ctx->octx, tmp32); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx) -{ - uint32_t tmp32[72]; - uint8_t ihash[32]; - - /* Call the real function. */ - _HMAC_SHA256_Final(digest, ctx, tmp32, ihash); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); - insecure_memzero(ihash, 32); -} - -/** - * HMAC_SHA256_Buf(K, Klen, in, len, digest): - * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of - * length ${Klen}, and write the result to ${digest}. - */ -void -HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len, - uint8_t digest[32]) -{ - HMAC_SHA256_CTX ctx; - uint32_t tmp32[72]; - uint8_t tmp8[96]; - - _HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]); - _HMAC_SHA256_Update(&ctx, in, len, tmp32); - _HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]); - - /* Clean the stack. */ - insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(tmp32, 288); - insecure_memzero(tmp8, 96); -} - -/* Add padding and terminating bit-count, but don't invoke Transform yet. */ -static int -SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8], - uint32_t tmp32[static restrict 72]) -{ - uint32_t r; - - r = (ctx->count >> 3) & 0x3f; - if (r >= 56) - return -1; - - /* - * Convert length to a vector of bytes -- we do this now rather - * than later because the length will change after we pad. - */ - be64enc(len, ctx->count); - - /* Add 1--56 bytes so that the resulting length is 56 mod 64. */ - _SHA256_Update(ctx, PAD, 56 - r, tmp32); - - /* Add the terminating bit-count. */ - ctx->buf[63] = len[7]; - _SHA256_Update(ctx, len, 7, tmp32); - - return 0; -} - -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). - */ -void -PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, - size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) -{ - HMAC_SHA256_CTX Phctx, PShctx, hctx; - uint32_t tmp32[72]; - union { - uint8_t tmp8[96]; - uint32_t state[8]; - } u; - size_t i; - uint8_t ivec[4]; - uint8_t U[32]; - uint8_t T[32]; - uint64_t j; - int k; - size_t clen; - - /* Sanity-check. */ - assert(dkLen <= 32 * (size_t)(UINT32_MAX)); - - if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) { - uint32_t oldcount; - uint8_t * ivecp; - - /* Compute HMAC state after processing P and S. */ - _HMAC_SHA256_Init(&hctx, passwd, passwdlen, - tmp32, &u.tmp8[0], &u.tmp8[64]); - _HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32); - - /* Prepare ictx padding. 
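/*
 * Editorial note (not part of the patch) on the fast path entered above: it
 * requires c == 1, dkLen a multiple of 32, and (saltlen % 64) <= 51.  The
 * last condition guarantees that the buffered salt residue plus the 4-byte
 * block counter (at most 55 bytes) still leaves room for the 0x80 pad byte
 * and the 8-byte length inside a single 64-byte block, so once the padding
 * is precomputed each 32-byte output block costs exactly two
 * SHA256_Transform() calls: one over ictx.buf and one over octx.buf.
 */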
*/ - oldcount = hctx.ictx.count & (0x3f << 3); - _HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32); - if ((hctx.ictx.count & (0x3f << 3)) < oldcount || - SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32)) - goto generic; /* Can't happen due to saltlen check */ - ivecp = hctx.ictx.buf + (oldcount >> 3); - - /* Prepare octx padding. */ - hctx.octx.count += 32 << 3; - SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32); - - /* Iterate through the blocks. */ - for (i = 0; i * 32 < dkLen; i++) { - /* Generate INT(i + 1). */ - be32enc(ivecp, (uint32_t)(i + 1)); - - /* Compute U_1 = PRF(P, S || INT(i)). */ - memcpy(u.state, hctx.ictx.state, sizeof(u.state)); - SHA256_Transform(u.state, hctx.ictx.buf, - &tmp32[0], &tmp32[64]); - be32enc_vect(hctx.octx.buf, u.state, 4); - memcpy(u.state, hctx.octx.state, sizeof(u.state)); - SHA256_Transform(u.state, hctx.octx.buf, - &tmp32[0], &tmp32[64]); - be32enc_vect(&buf[i * 32], u.state, 4); - } - - goto cleanup; - } - -generic: - /* Compute HMAC state after processing P. */ - _HMAC_SHA256_Init(&Phctx, passwd, passwdlen, - tmp32, &u.tmp8[0], &u.tmp8[64]); - - /* Compute HMAC state after processing P and S. */ - memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX)); - _HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32); - - /* Iterate through the blocks. */ - for (i = 0; i * 32 < dkLen; i++) { - /* Generate INT(i + 1). */ - be32enc(ivec, (uint32_t)(i + 1)); - - /* Compute U_1 = PRF(P, S || INT(i)). */ - memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX)); - _HMAC_SHA256_Update(&hctx, ivec, 4, tmp32); - _HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8); - - if (c > 1) { - /* T_i = U_1 ... */ - memcpy(U, T, 32); - - for (j = 2; j <= c; j++) { - /* Compute U_j. */ - memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX)); - _HMAC_SHA256_Update(&hctx, U, 32, tmp32); - _HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8); - - /* ... xor U_j ... */ - for (k = 0; k < 32; k++) - T[k] ^= U[k]; - } - } - - /* Copy as many bytes as necessary into buf. */ - clen = dkLen - i * 32; - if (clen > 32) - clen = 32; - memcpy(&buf[i * 32], T, clen); - } - - /* Clean the stack. */ - insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(U, 32); - insecure_memzero(T, 32); - -cleanup: - insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(tmp32, 288); - insecure_memzero(&u, sizeof(u)); -} diff --git a/algo/yespower/sha256.c.new b/algo/yespower/sha256.c.new deleted file mode 100644 index eab1ca1..0000000 --- a/algo/yespower/sha256.c.new +++ /dev/null @@ -1,672 +0,0 @@ -/*- - * Copyright 2005-2016 Colin Percival - * Copyright 2016-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -#include -#include - -#include "insecure_memzero.h" -#include "sysendian.h" - -#include "sha256.h" - -#ifdef __ICC -/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */ -#define restrict -#elif __STDC_VERSION__ >= 199901L -/* Have restrict */ -#elif defined(__GNUC__) -#define restrict __restrict -#else -#define restrict -#endif - -/* - * Encode a length len*2 vector of (uint32_t) into a length len*8 vector of - * (uint8_t) in big-endian form. - */ -static void -be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len) -{ - - /* Encode vector, two words at a time. */ - do { - be32enc(&dst[0], src[0]); - be32enc(&dst[4], src[1]); - src += 2; - dst += 8; - } while (--len); -} - -/* - * Decode a big-endian length len*8 vector of (uint8_t) into a length - * len*2 vector of (uint32_t). - */ -static void -be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len) -{ - - /* Decode vector, two words at a time. */ - do { - dst[0] = be32dec(&src[0]); - dst[1] = be32dec(&src[4]); - src += 8; - dst += 2; - } while (--len); -} - -#if 0 -/* SHA256 round constants. */ -static const uint32_t Krnd[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -/* Elementary functions used by SHA256 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define SHR(x, n) (x >> n) -#define ROTR(x, n) ((x >> n) | (x << (32 - n))) -#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) -#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) - -/* SHA256 round function */ -#define RND(a, b, c, d, e, f, g, h, k) \ - h += S1(e) + Ch(e, f, g) + k; \ - d += h; \ - h += S0(a) + Maj(a, b, c); - -/* Adjusted round function for rotating state */ -#define RNDr(S, W, i, ii) \ - RND(S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(66 - i) % 8], S[(67 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i + ii] + Krnd[i + ii]) - -/* Message schedule computation */ -#define MSCH(W, ii, i) \ - W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii] - -/* - * SHA256 block compression function. 
The 256-bit state is transformed via - * the 512-bit input block to produce a new state. - */ -static void -SHA256_Transform(uint32_t state[static restrict 8], - const uint8_t block[static restrict 64], - uint32_t W[static restrict 64], uint32_t S[static restrict 8]) -{ - int i; - - /* 1. Prepare the first part of the message schedule W. */ - be32dec_vect(W, block, 8); - - /* 2. Initialize working variables. */ - memcpy(S, state, 32); - - /* 3. Mix. */ - for (i = 0; i < 64; i += 16) { - RNDr(S, W, 0, i); - RNDr(S, W, 1, i); - RNDr(S, W, 2, i); - RNDr(S, W, 3, i); - RNDr(S, W, 4, i); - RNDr(S, W, 5, i); - RNDr(S, W, 6, i); - RNDr(S, W, 7, i); - RNDr(S, W, 8, i); - RNDr(S, W, 9, i); - RNDr(S, W, 10, i); - RNDr(S, W, 11, i); - RNDr(S, W, 12, i); - RNDr(S, W, 13, i); - RNDr(S, W, 14, i); - RNDr(S, W, 15, i); - - if (i == 48) - break; - MSCH(W, 0, i); - MSCH(W, 1, i); - MSCH(W, 2, i); - MSCH(W, 3, i); - MSCH(W, 4, i); - MSCH(W, 5, i); - MSCH(W, 6, i); - MSCH(W, 7, i); - MSCH(W, 8, i); - MSCH(W, 9, i); - MSCH(W, 10, i); - MSCH(W, 11, i); - MSCH(W, 12, i); - MSCH(W, 13, i); - MSCH(W, 14, i); - MSCH(W, 15, i); - } - - /* 4. Mix local working variables into global state. */ - state[0] += S[0]; - state[1] += S[1]; - state[2] += S[2]; - state[3] += S[3]; - state[4] += S[4]; - state[5] += S[5]; - state[6] += S[6]; - state[7] += S[7]; -} -#endif -static const uint8_t PAD[64] = { - 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -/* Add padding and terminating bit-count. */ -static void -SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72]) -{ - size_t r; - - /* Figure out how many bytes we have buffered. */ - r = (ctx->count >> 3) & 0x3f; - - /* Pad to 56 mod 64, transforming if we finish a block en route. */ - if (r < 56) { - /* Pad to 56 mod 64. */ - memcpy(&ctx->buf[r], PAD, 56 - r); - } else { - /* Finish the current block and mix. */ - memcpy(&ctx->buf[r], PAD, 64 - r); - SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); - - /* The start of the final block is all zeroes. */ - memset(&ctx->buf[0], 0, 56); - } - - /* Add the terminating bit-count. */ - be64enc(&ctx->buf[56], ctx->count); - - /* Mix in the final block. */ - SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); -} -#if 0 -/* Magic initialization constants. */ -static const uint32_t initial_state[8] = { - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 -}; - -/** - * SHA256_Init(ctx): - * Initialize the SHA256 context ${ctx}. - */ -void -SHA256_Init(SHA256_CTX * ctx) -{ - - /* Zero bits processed so far. */ - ctx->count = 0; - - /* Initialize state. */ - memcpy(ctx->state, initial_state, sizeof(initial_state)); -} - -/** - * SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. - */ -static void -_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len, - uint32_t tmp32[static restrict 72]) -{ - uint32_t r; - const uint8_t * src = in; - - /* Return immediately if we have nothing to do. */ - if (len == 0) - return; - - /* Number of bytes left in the buffer from previous updates. */ - r = (ctx->count >> 3) & 0x3f; - - /* Update number of bits. */ - ctx->count += (uint64_t)(len) << 3; - - /* Handle the case where we don't need to perform any transforms. */ - if (len < 64 - r) { - memcpy(&ctx->buf[r], src, len); - return; - } - - /* Finish the current block. 
*/ - memcpy(&ctx->buf[r], src, 64 - r); - SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); - src += 64 - r; - len -= 64 - r; - - /* Perform complete blocks. */ - while (len >= 64) { - SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]); - src += 64; - len -= 64; - } - - /* Copy left over data into buffer. */ - memcpy(ctx->buf, src, len); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len) -{ - uint32_t tmp32[72]; - - /* Call the real function. */ - _SHA256_Update(ctx, in, len, tmp32); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); -} - -/** - * SHA256_Final(digest, ctx): - * Output the SHA256 hash of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -static void -_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx, - uint32_t tmp32[static restrict 72]) -{ - - /* Add padding. */ - SHA256_Pad(ctx, tmp32); - - /* Write the hash. */ - be32enc_vect(digest, ctx->state, 4); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx) -{ - uint32_t tmp32[72]; - - /* Call the real function. */ - _SHA256_Final(digest, ctx, tmp32); - - /* Clear the context state. */ - insecure_memzero(ctx, sizeof(SHA256_CTX)); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); -} -#endif -/** - * SHA256_Buf(in, len, digest): - * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. - */ -void -SHA256_Buf(const void * in, size_t len, uint8_t digest[32]) -{ - SHA256_CTX ctx; - uint32_t tmp32[72]; - - SHA256_Init(&ctx); - SHA256_Update(&ctx, in, len); - SHA256_Final(digest, &ctx); -// _SHA256_Update(&ctx, in, len, tmp32); -// _SHA256_Final(digest, &ctx, tmp32); - - /* Clean the stack. */ - insecure_memzero(&ctx, sizeof(SHA256_CTX)); - insecure_memzero(tmp32, 288); -} - -/** - * HMAC_SHA256_Init(ctx, K, Klen): - * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from - * ${K}. - */ -static void -_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen, - uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64], - uint8_t khash[static restrict 32]) -{ - const uint8_t * K = _K; - size_t i; - - /* If Klen > 64, the key is really SHA256(K). */ - if (Klen > 64) { - SHA256_Init(&ctx->ictx); - SHA256_Update(&ctx->ictx, K, Klen); - SHA256_Final(khash, &ctx->ictx); -// _SHA256_Update(&ctx->ictx, K, Klen, tmp32); -// _SHA256_Final(khash, &ctx->ictx, tmp32); - K = khash; - Klen = 32; - } - - /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ - SHA256_Init(&ctx->ictx); - memset(pad, 0x36, 64); - for (i = 0; i < Klen; i++) - pad[i] ^= K[i]; - SHA256_Update(&ctx->ictx, pad, 64); -// _SHA256_Update(&ctx->ictx, pad, 64, tmp32); - - /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ - SHA256_Init(&ctx->octx); - memset(pad, 0x5c, 64); - for (i = 0; i < Klen; i++) - pad[i] ^= K[i]; - SHA256_Update(&ctx->octx, pad, 64); -// _SHA256_Update(&ctx->octx, pad, 64, tmp32); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen) -{ - uint32_t tmp32[72]; - uint8_t pad[64]; - uint8_t khash[32]; - - /* Call the real function. */ - _HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash); - - /* Clean the stack. 
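/*
 * Editorial note (not part of the patch): with the local SHA256_Init/Update/
 * Final compiled out under #if 0, the unprefixed calls in this scratch file
 * would have to resolve to an external implementation -- presumably
 * OpenSSL's, declared in <openssl/sha.h>, which also provides a one-shot
 * helper equivalent to SHA256_Buf().  Hypothetical sketch:
 */
#include <openssl/sha.h>
static void editorial_sha256_demo(void)
{
    unsigned char md[SHA256_DIGEST_LENGTH];        /* 32 bytes */
    SHA256((const unsigned char *)"abc", 3, md);   /* init+update+final in one call */
}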
*/ - insecure_memzero(tmp32, 288); - insecure_memzero(khash, 32); - insecure_memzero(pad, 64); -} - -/** - * HMAC_SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. - */ -static void -_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len, - uint32_t tmp32[static restrict 72]) -{ - - /* Feed data to the inner SHA256 operation. */ - SHA256_Update(&ctx->ictx, in, len); -// _SHA256_Update(&ctx->ictx, in, len, tmp32); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len) -{ - uint32_t tmp32[72]; - - /* Call the real function. */ - _HMAC_SHA256_Update(ctx, in, len, tmp32); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); -} - -/** - * HMAC_SHA256_Final(digest, ctx): - * Output the HMAC-SHA256 of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -static void -_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx, - uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32]) -{ - /* Finish the inner SHA256 operation. */ - _SHA256_Final(ihash, &ctx->ictx, tmp32); - - /* Feed the inner hash to the outer SHA256 operation. */ - _SHA256_Update(&ctx->octx, ihash, 32, tmp32); - - /* Finish the outer SHA256 operation. */ - _SHA256_Final(digest, &ctx->octx, tmp32); - - -// _SHA256_Final(ihash, &ctx->ictx, tmp32); -// _SHA256_Update(&ctx->octx, ihash, 32, tmp32); -// _SHA256_Final(digest, &ctx->octx, tmp32); -} - -/* Wrapper function for intermediate-values sanitization. */ -void -HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx) -{ - uint32_t tmp32[72]; - uint8_t ihash[32]; - - /* Call the real function. */ - _HMAC_SHA256_Final(digest, ctx, tmp32, ihash); - - /* Clean the stack. */ - insecure_memzero(tmp32, 288); - insecure_memzero(ihash, 32); -} - -/** - * HMAC_SHA256_Buf(K, Klen, in, len, digest): - * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of - * length ${Klen}, and write the result to ${digest}. - */ -void -HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len, - uint8_t digest[32]) -{ - HMAC_SHA256_CTX ctx; - uint32_t tmp32[72]; - uint8_t tmp8[96]; - - _HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]); - _HMAC_SHA256_Update(&ctx, in, len, tmp32); - _HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]); - - /* Clean the stack. */ - insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(tmp32, 288); - insecure_memzero(tmp8, 96); -} - -/* Add padding and terminating bit-count, but don't invoke Transform yet. */ -static int -SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8], - uint32_t tmp32[static restrict 72]) -{ - uint32_t r; - - r = (ctx->count >> 3) & 0x3f; - if (r >= 56) - return -1; - - /* - * Convert length to a vector of bytes -- we do this now rather - * than later because the length will change after we pad. - */ - be64enc(len, ctx->count); - - /* Add 1--56 bytes so that the resulting length is 56 mod 64. */ - SHA256_Update(ctx, PAD, 56 - r, tmp); - - /* Add the terminating bit-count. */ - ctx->buf[63] = len[7]; - SHA256_Update(ctx, len, 7, tmp); - - /* Add 1--56 bytes so that the resulting length is 56 mod 64. */ -// _SHA256_Update(ctx, PAD, 56 - r, tmp32); - - /* Add the terminating bit-count. 
*/ -// ctx->buf[63] = len[7]; -// _SHA256_Update(ctx, len, 7, tmp32); - - return 0; -} - -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). - */ -void -PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, - size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) -{ - HMAC_SHA256_CTX Phctx, PShctx, hctx; - uint32_t tmp32[72]; - union { - uint8_t tmp8[96]; - uint32_t state[8]; - } u; - size_t i; - uint8_t ivec[4]; - uint8_t U[32]; - uint8_t T[32]; - uint64_t j; - int k; - size_t clen; - - /* Sanity-check. */ - assert(dkLen <= 32 * (size_t)(UINT32_MAX)); - - if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) { - uint32_t oldcount; - uint8_t * ivecp; - - /* Compute HMAC state after processing P and S. */ - _HMAC_SHA256_Init(&hctx, passwd, passwdlen, - tmp32, &u.tmp8[0], &u.tmp8[64]); - _HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32); - - /* Prepare ictx padding. */ - oldcount = hctx.ictx.count & (0x3f << 3); - _HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32); - if ((hctx.ictx.count & (0x3f << 3)) < oldcount || - SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32)) - goto generic; /* Can't happen due to saltlen check */ - ivecp = hctx.ictx.buf + (oldcount >> 3); - - /* Prepare octx padding. */ - hctx.octx.count += 32 << 3; - SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32); - - /* Iterate through the blocks. */ - for (i = 0; i * 32 < dkLen; i++) { - /* Generate INT(i + 1). */ - be32enc(ivecp, (uint32_t)(i + 1)); - - /* Compute U_1 = PRF(P, S || INT(i)). */ - memcpy(u.state, hctx.ictx.state, sizeof(u.state)); - - SHA256_Transform(u.state, hctx.ictx.buf ); - be32enc_vect(hctx.octx.buf, u.state, 4); - memcpy(u.state, hctx.octx.state, sizeof(u.state)); - SHA256_Transform(u.state, hctx.octx.buf ); - -// SHA256_Transform(u.state, hctx.ictx.buf, -// &tmp32[0], &tmp32[64]); -// be32enc_vect(hctx.octx.buf, u.state, 4); -// memcpy(u.state, hctx.octx.state, sizeof(u.state)); -// SHA256_Transform(u.state, hctx.octx.buf, -// &tmp32[0], &tmp32[64]); - - be32enc_vect(&buf[i * 32], u.state, 4); - } - - goto cleanup; - } - -generic: - /* Compute HMAC state after processing P. */ - _HMAC_SHA256_Init(&Phctx, passwd, passwdlen, - tmp32, &u.tmp8[0], &u.tmp8[64]); - - /* Compute HMAC state after processing P and S. */ - memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX)); - _HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32); - - /* Iterate through the blocks. */ - for (i = 0; i * 32 < dkLen; i++) { - /* Generate INT(i + 1). */ - be32enc(ivec, (uint32_t)(i + 1)); - - /* Compute U_1 = PRF(P, S || INT(i)). */ - memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX)); - _HMAC_SHA256_Update(&hctx, ivec, 4, tmp32); - _HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8); - - if (c > 1) { - /* T_i = U_1 ... */ - memcpy(U, T, 32); - - for (j = 2; j <= c; j++) { - /* Compute U_j. */ - memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX)); - _HMAC_SHA256_Update(&hctx, U, 32, tmp32); - _HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8); - - /* ... xor U_j ... */ - for (k = 0; k < 32; k++) - T[k] ^= U[k]; - } - } - - /* Copy as many bytes as necessary into buf. */ - clen = dkLen - i * 32; - if (clen > 32) - clen = 32; - memcpy(&buf[i * 32], T, clen); - } - - /* Clean the stack. 
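/*
 * Editorial note (not part of the patch): this sha256.c.new scratch file
 * could not have compiled as written -- SHA256_Pad_Almost() above calls
 * SHA256_Update(ctx, PAD, 56 - r, tmp) with four arguments and an undeclared
 * `tmp`, and the fast path calls a two-argument SHA256_Transform() although
 * the (disabled) local definition takes four -- which is presumably why the
 * .new files are deleted by this patch rather than promoted.  The intended
 * calls are visible in the commented-out lines, e.g.:
 *     _SHA256_Update(ctx, PAD, 56 - r, tmp32);
 *     ctx->buf[63] = len[7];
 *     _SHA256_Update(ctx, len, 7, tmp32);
 */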
*/ - insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(U, 32); - insecure_memzero(T, 32); - -cleanup: - insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX)); - insecure_memzero(tmp32, 288); - insecure_memzero(&u, sizeof(u)); -} diff --git a/algo/yespower/sha256.h b/algo/yespower/sha256.h deleted file mode 100644 index 6210502..0000000 --- a/algo/yespower/sha256.h +++ /dev/null @@ -1,129 +0,0 @@ -/*- - * Copyright 2005-2016 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _SHA256_H_ -#define _SHA256_H_ - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Use #defines in order to avoid namespace collisions with anyone else's - * SHA256 code (e.g., the code in OpenSSL). - */ -#define SHA256_Init libcperciva_SHA256_Init -#define SHA256_Update libcperciva_SHA256_Update -#define SHA256_Final libcperciva_SHA256_Final -#define SHA256_Buf libcperciva_SHA256_Buf -#define SHA256_CTX libcperciva_SHA256_CTX -#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init -#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update -#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final -#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf -#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX - -/* Context structure for SHA256 operations. */ -typedef struct { - uint32_t state[8]; - uint64_t count; - uint8_t buf[64]; -} SHA256_CTX; - -/** - * SHA256_Init(ctx): - * Initialize the SHA256 context ${ctx}. - */ -void SHA256_Init(SHA256_CTX *); - -/** - * SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. - */ -void SHA256_Update(SHA256_CTX *, const void *, size_t); - -/** - * SHA256_Final(digest, ctx): - * Output the SHA256 hash of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -void SHA256_Final(uint8_t[32], SHA256_CTX *); - -/** - * SHA256_Buf(in, len, digest): - * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. - */ -void SHA256_Buf(const void *, size_t, uint8_t[32]); - -/* Context structure for HMAC-SHA256 operations. 
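/*
 * Editorial note (not part of the patch) on the #define block above: mapping
 * every public name to a libcperciva_-prefixed symbol lets this private
 * SHA256 implementation be linked into the same binary as OpenSSL without
 * duplicate-symbol clashes, while callers keep the familiar spelling:
 *
 *     SHA256_CTX ctx;        // expands to libcperciva_SHA256_CTX
 *     SHA256_Init(&ctx);     // expands to libcperciva_SHA256_Init
 */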
*/ -typedef struct { - SHA256_CTX ictx; - SHA256_CTX octx; -} HMAC_SHA256_CTX; - -/** - * HMAC_SHA256_Init(ctx, K, Klen): - * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from - * ${K}. - */ -void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t); - -/** - * HMAC_SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. - */ -void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t); - -/** - * HMAC_SHA256_Final(digest, ctx): - * Output the HMAC-SHA256 of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *); - -/** - * HMAC_SHA256_Buf(K, Klen, in, len, digest): - * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of - * length ${Klen}, and write the result to ${digest}. - */ -void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]); - -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). - */ -void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, - uint64_t, uint8_t *, size_t); - -#ifdef __cplusplus -} -#endif - -#endif /* !_SHA256_H_ */ diff --git a/algo/yespower/sha256.h.new b/algo/yespower/sha256.h.new deleted file mode 100644 index 086d879..0000000 --- a/algo/yespower/sha256.h.new +++ /dev/null @@ -1,134 +0,0 @@ -/*- - * Copyright 2005-2016 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _SHA256_H_ -#define _SHA256_H_ - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Use #defines in order to avoid namespace collisions with anyone else's - * SHA256 code (e.g., the code in OpenSSL). 
- */ -/* -#define SHA256_Init libcperciva_SHA256_Init -#define SHA256_Update libcperciva_SHA256_Update -#define SHA256_Final libcperciva_SHA256_Final -#define SHA256_CTX libcperciva_SHA256_CTX -*/ -#define SHA256_Buf libcperciva_SHA256_Buf -#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init -#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update -#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final -#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf -#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX - -#if 0 -/* Context structure for SHA256 operations. */ -typedef struct { - uint32_t state[8]; - uint64_t count; - uint8_t buf[64]; -} SHA256_CTX; - -/** - * SHA256_Init(ctx): - * Initialize the SHA256 context ${ctx}. - */ -void SHA256_Init(SHA256_CTX *); - -/** - * SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. - */ -void SHA256_Update(SHA256_CTX *, const void *, size_t); - -/** - * SHA256_Final(digest, ctx): - * Output the SHA256 hash of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -void SHA256_Final(uint8_t[32], SHA256_CTX *); -#endif - -/** - * SHA256_Buf(in, len, digest): - * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. - */ -void SHA256_Buf(const void *, size_t, uint8_t[32]); - -/* Context structure for HMAC-SHA256 operations. */ -typedef struct { - SHA256_CTX ictx; - SHA256_CTX octx; -} HMAC_SHA256_CTX; - -/** - * HMAC_SHA256_Init(ctx, K, Klen): - * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from - * ${K}. - */ -void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t); - -/** - * HMAC_SHA256_Update(ctx, in, len): - * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. - */ -void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t); - -/** - * HMAC_SHA256_Final(digest, ctx): - * Output the HMAC-SHA256 of the data input to the context ${ctx} into the - * buffer ${digest}. - */ -void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *); - -/** - * HMAC_SHA256_Buf(K, Klen, in, len, digest): - * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of - * length ${Klen}, and write the result to ${digest}. - */ -void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]); - -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). - */ -void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, - uint64_t, uint8_t *, size_t); - -#ifdef __cplusplus -} -#endif - -#endif /* !_SHA256_H_ */ diff --git a/algo/yespower/sha256_p.c b/algo/yespower/sha256_p.c new file mode 100644 index 0000000..7201797 --- /dev/null +++ b/algo/yespower/sha256_p.c @@ -0,0 +1,218 @@ +/*- + * Copyright 2005,2007,2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "sysendian.h" + +#include "sha256_p.h" +#include "compat.h" + + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, k) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + k) + +/* +static unsigned char PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +*/ +/** + * SHA256_Buf(in, len, digest): + * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. + */ +void +SHA256_Buf( const void * in, size_t len, uint8_t digest[32] ) +{ + SHA256_CTX ctx; + SHA256_Init( &ctx ); + SHA256_Update( &ctx, in, len ); + SHA256_Final( digest, &ctx ); +} + +/** + * HMAC_SHA256_Buf(K, Klen, in, len, digest): + * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of + * length ${Klen}, and write the result to ${digest}. + */ +void +HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len, + uint8_t digest[32]) +{ + HMAC_SHA256_CTX ctx; + + HMAC_SHA256_Init( &ctx, K, Klen ); + HMAC_SHA256_Update( &ctx, in, len ); + HMAC_SHA256_Final( digest, &ctx ); +} + +/* Initialize an HMAC-SHA256 operation with the given key. */ +void +HMAC_SHA256_Init( HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen ) +{ + unsigned char pad[64]; + unsigned char khash[32]; + const unsigned char * K = _K; + size_t i; + + /* If Klen > 64, the key is really SHA256(K). */ + if (Klen > 64) { + SHA256_Init( &ctx->ictx ); + SHA256_Update( &ctx->ictx, K, Klen ); + SHA256_Final( khash, &ctx->ictx ); + K = khash; + Klen = 32; + } + + /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ + SHA256_Init( &ctx->ictx ); + memset( pad, 0x36, 64 ); + for ( i = 0; i < Klen; i++ ) + pad[i] ^= K[i]; + SHA256_Update( &ctx->ictx, pad, 64 ); + + /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). 
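/*
 * Editorial note (not part of the patch): per the release notes ("Yespower
 * now uses openssl SHA256"), the unprefixed SHA256_CTX / SHA256_Init /
 * SHA256_Update / SHA256_Final used by SHA256_Buf() and the HMAC layer in
 * this new file come from OpenSSL, so sha256_p.c keeps only the HMAC and
 * PBKDF2 wrappers.  The Ch/Maj/RND/RNDr macros defined at the top are
 * carried over from the removed private transform and do not appear to be
 * referenced by any function in this file.
 */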
*/ + SHA256_Init( &ctx->octx ); + memset(pad, 0x5c, 64); + for ( i = 0; i < Klen; i++ ) + pad[i] ^= K[i]; + SHA256_Update( &ctx->octx, pad, 64 ); + + /* Clean the stack. */ + //memset(khash, 0, 32); +} + +/* Add bytes to the HMAC-SHA256 operation. */ +void +HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len) +{ + + /* Feed data to the inner SHA256 operation. */ + SHA256_Update( &ctx->ictx, in, len ); +} + +/* Finish an HMAC-SHA256 operation. */ +void +HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx ) +{ + unsigned char ihash[32]; + + /* Finish the inner SHA256 operation. */ + SHA256_Final( ihash, &ctx->ictx ); + + /* Feed the inner hash to the outer SHA256 operation. */ + SHA256_Update( &ctx->octx, ihash, 32 ); + + /* Finish the outer SHA256 operation. */ + SHA256_Final( digest, &ctx->octx ); + + /* Clean the stack. */ + //memset(ihash, 0, 32); +} + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +void +PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, + size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) +{ + HMAC_SHA256_CTX PShctx, hctx; + uint8_t _ALIGN(128) T[32]; + uint8_t _ALIGN(128) U[32]; + uint8_t ivec[4]; + size_t i, clen; + uint64_t j; + int k; + + /* Compute HMAC state after processing P and S. */ + HMAC_SHA256_Init(&PShctx, passwd, passwdlen); + HMAC_SHA256_Update(&PShctx, salt, saltlen); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivec, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX)); + HMAC_SHA256_Update(&hctx, ivec, 4); + HMAC_SHA256_Final(U, &hctx); + + /* T_i = U_1 ... */ + memcpy(T, U, 32); + + for (j = 2; j <= c; j++) { + /* Compute U_j. */ + HMAC_SHA256_Init(&hctx, passwd, passwdlen); + HMAC_SHA256_Update(&hctx, U, 32); + HMAC_SHA256_Final(U, &hctx); + + /* ... xor U_j ... */ + for (k = 0; k < 32; k++) + T[k] ^= U[k]; + } + + /* Copy as many bytes as necessary into buf. */ + clen = dkLen - i * 32; + if (clen > 32) + clen = 32; + memcpy(&buf[i * 32], T, clen); + } + + /* Clean PShctx, since we never called _Final on it. */ + //memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y)); +} diff --git a/algo/yespower/sha256_p.c.sha b/algo/yespower/sha256_p.c.sha deleted file mode 100644 index 86032a3..0000000 --- a/algo/yespower/sha256_p.c.sha +++ /dev/null @@ -1,496 +0,0 @@ -/*- - * Copyright 2005,2007,2009 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
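/*
 * Editorial sketch (not part of the patch): unlike the deleted
 * implementation, the new PBKDF2_SHA256() above keeps only the generic loop;
 * the c == 1 fast path is gone.  With c == 1 (the value yespower appears to
 * pass) the inner j-loop never executes, so each 32-byte block is simply
 * T_i = HMAC-SHA256(passwd, salt || INT(i)).  Hypothetical usage deriving a
 * 32-byte key:
 */
static void editorial_pbkdf2_demo(const uint8_t *pw, size_t pwlen,
                                  const uint8_t *salt, size_t saltlen)
{
    uint8_t dk[32];                                     /* derived key */
    PBKDF2_SHA256(pw, pwlen, salt, saltlen, 1, dk, sizeof(dk));
}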
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include - -#include -#include - -#include "sysendian.h" - -#include "sha256_p.h" -#include "compat.h" - -/* - * Encode a length len/4 vector of (uint32_t) into a length len vector of - * (unsigned char) in big-endian form. Assumes len is a multiple of 4. - */ -static void -be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) -{ - size_t i; - - for (i = 0; i < len / 4; i++) - be32enc(dst + i * 4, src[i]); -} - -/* - * Decode a big-endian length len vector of (unsigned char) into a length - * len/4 vector of (uint32_t). Assumes len is a multiple of 4. - */ -static void -be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) -{ - size_t i; - - for (i = 0; i < len / 4; i++) - dst[i] = be32dec(src + i * 4); -} - -/* Elementary functions used by SHA256 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define SHR(x, n) (x >> n) -#define ROTR(x, n) ((x >> n) | (x << (32 - n))) -#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) -#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) - -/* SHA256 round function */ -#define RND(a, b, c, d, e, f, g, h, k) \ - t0 = h + S1(e) + Ch(e, f, g) + k; \ - t1 = S0(a) + Maj(a, b, c); \ - d += t0; \ - h = t0 + t1; - -/* Adjusted round function for rotating state */ -#define RNDr(S, W, i, k) \ - RND(S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(66 - i) % 8], S[(67 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i] + k) - -/* - * SHA256 block compression function. The 256-bit state is transformed via - * the 512-bit input block to produce a new state. - */ -static void -SHA256_Transform_p(uint32_t * state, const unsigned char block[64]) -{ - uint32_t _ALIGN(128) W[64], S[8]; - uint32_t t0, t1; - int i; - - /* 1. Prepare message schedule W. */ - be32dec_vect(W, block, 64); - for (i = 16; i < 64; i++) - W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; - - /* 2. Initialize working variables. */ - memcpy(S, state, 32); - - /* 3. Mix. 
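/*
 * Editorial note (not part of the patch): be32enc_vect()/be32dec_vect() in
 * this file take `len` in bytes (assumed a multiple of 4), unlike the
 * versions in the deleted sha256.c, which counted pairs of 32-bit words.
 * That is why the calls in this file pass 64 for a message block, 32 for the
 * state/digest and 8 for the bit count:
 *     be32dec_vect(W, block, 64);             // 16 words in
 *     be32enc_vect(digest, ctx->state, 32);   //  8 words out
 */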
*/ - RNDr(S, W, 0, 0x428a2f98); - RNDr(S, W, 1, 0x71374491); - RNDr(S, W, 2, 0xb5c0fbcf); - RNDr(S, W, 3, 0xe9b5dba5); - RNDr(S, W, 4, 0x3956c25b); - RNDr(S, W, 5, 0x59f111f1); - RNDr(S, W, 6, 0x923f82a4); - RNDr(S, W, 7, 0xab1c5ed5); - RNDr(S, W, 8, 0xd807aa98); - RNDr(S, W, 9, 0x12835b01); - RNDr(S, W, 10, 0x243185be); - RNDr(S, W, 11, 0x550c7dc3); - RNDr(S, W, 12, 0x72be5d74); - RNDr(S, W, 13, 0x80deb1fe); - RNDr(S, W, 14, 0x9bdc06a7); - RNDr(S, W, 15, 0xc19bf174); - RNDr(S, W, 16, 0xe49b69c1); - RNDr(S, W, 17, 0xefbe4786); - RNDr(S, W, 18, 0x0fc19dc6); - RNDr(S, W, 19, 0x240ca1cc); - RNDr(S, W, 20, 0x2de92c6f); - RNDr(S, W, 21, 0x4a7484aa); - RNDr(S, W, 22, 0x5cb0a9dc); - RNDr(S, W, 23, 0x76f988da); - RNDr(S, W, 24, 0x983e5152); - RNDr(S, W, 25, 0xa831c66d); - RNDr(S, W, 26, 0xb00327c8); - RNDr(S, W, 27, 0xbf597fc7); - RNDr(S, W, 28, 0xc6e00bf3); - RNDr(S, W, 29, 0xd5a79147); - RNDr(S, W, 30, 0x06ca6351); - RNDr(S, W, 31, 0x14292967); - RNDr(S, W, 32, 0x27b70a85); - RNDr(S, W, 33, 0x2e1b2138); - RNDr(S, W, 34, 0x4d2c6dfc); - RNDr(S, W, 35, 0x53380d13); - RNDr(S, W, 36, 0x650a7354); - RNDr(S, W, 37, 0x766a0abb); - RNDr(S, W, 38, 0x81c2c92e); - RNDr(S, W, 39, 0x92722c85); - RNDr(S, W, 40, 0xa2bfe8a1); - RNDr(S, W, 41, 0xa81a664b); - RNDr(S, W, 42, 0xc24b8b70); - RNDr(S, W, 43, 0xc76c51a3); - RNDr(S, W, 44, 0xd192e819); - RNDr(S, W, 45, 0xd6990624); - RNDr(S, W, 46, 0xf40e3585); - RNDr(S, W, 47, 0x106aa070); - RNDr(S, W, 48, 0x19a4c116); - RNDr(S, W, 49, 0x1e376c08); - RNDr(S, W, 50, 0x2748774c); - RNDr(S, W, 51, 0x34b0bcb5); - RNDr(S, W, 52, 0x391c0cb3); - RNDr(S, W, 53, 0x4ed8aa4a); - RNDr(S, W, 54, 0x5b9cca4f); - RNDr(S, W, 55, 0x682e6ff3); - RNDr(S, W, 56, 0x748f82ee); - RNDr(S, W, 57, 0x78a5636f); - RNDr(S, W, 58, 0x84c87814); - RNDr(S, W, 59, 0x8cc70208); - RNDr(S, W, 60, 0x90befffa); - RNDr(S, W, 61, 0xa4506ceb); - RNDr(S, W, 62, 0xbef9a3f7); - RNDr(S, W, 63, 0xc67178f2); - - /* 4. Mix local working variables into global state */ - for (i = 0; i < 8; i++) - state[i] += S[i]; -#if 0 - /* Clean the stack. */ - memset(W, 0, 256); - memset(S, 0, 32); - t0 = t1 = 0; -#endif -} - -static unsigned char PAD[64] = { - 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -// only called by SHA256_Final_p -/* Add padding and terminating bit-count. */ -static void -SHA256_Pad_p(SHA256_CTX_p * ctx) -{ - unsigned char len[8]; - uint32_t r, plen; - - /* - * Convert length to a vector of bytes -- we do this now rather - * than later because the length will change after we pad. - */ - be32enc_vect(len, ctx->count, 8); - - /* Add 1--64 bytes so that the resulting length is 56 mod 64 */ - r = (ctx->count[1] >> 3) & 0x3f; - plen = (r < 56) ? (56 - r) : (120 - r); - SHA256_Update_p(ctx, PAD, (size_t)plen); - /* Add the terminating bit-count */ - SHA256_Update_p(ctx, len, 8); -} - -/* SHA-256 initialization. Begins a SHA-256 operation. 
*/ -void -SHA256_Init_p(SHA256_CTX_p * ctx) -{ - /* Zero bits processed so far */ - ctx->count[0] = ctx->count[1] = 0; - - /* Magic initialization constants */ - ctx->state[0] = 0x6A09E667; - ctx->state[1] = 0xBB67AE85; - ctx->state[2] = 0x3C6EF372; - ctx->state[3] = 0xA54FF53A; - ctx->state[4] = 0x510E527F; - ctx->state[5] = 0x9B05688C; - ctx->state[6] = 0x1F83D9AB; - ctx->state[7] = 0x5BE0CD19; -} - -/* Add bytes into the hash */ -void -SHA256_Update_p(SHA256_CTX_p * ctx, const void *in, size_t len) -{ - uint32_t bitlen[2]; - uint32_t r; - const unsigned char *src = in; - - /* Number of bytes left in the buffer from previous updates */ - r = (ctx->count[1] >> 3) & 0x3f; - - /* Convert the length into a number of bits */ - bitlen[1] = ((uint32_t)len) << 3; - bitlen[0] = (uint32_t)(len >> 29); - - /* Update number of bits */ - if ((ctx->count[1] += bitlen[1]) < bitlen[1]) - ctx->count[0]++; - ctx->count[0] += bitlen[0]; - - /* Handle the case where we don't need to perform any transforms */ - if (len < 64 - r) { - memcpy(&ctx->buf[r], src, len); - return; - } - - /* Finish the current block */ - memcpy(&ctx->buf[r], src, 64 - r); - SHA256_Transform_p(ctx->state, ctx->buf); - src += 64 - r; - len -= 64 - r; - - /* Perform complete blocks */ - while (len >= 64) { - SHA256_Transform_p(ctx->state, src); - src += 64; - len -= 64; - } - - /* Copy left over data into buffer */ - memcpy(ctx->buf, src, len); -} - -/* - * SHA-256 finalization. Pads the input data, exports the hash value, - * and clears the context state. - */ -void -SHA256_Final_p(unsigned char digest[32], SHA256_CTX_p * ctx) -{ - /* Add padding */ - SHA256_Pad_p(ctx); - - /* Write the hash */ - be32enc_vect(digest, ctx->state, 32); - - /* Clear the context state */ - memset((void *)ctx, 0, sizeof(*ctx)); -} - -/** - * SHA256_Buf(in, len, digest): - * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. - */ -void -SHA256_Buf_p(const void * in, size_t len, uint8_t digest[32]) -{ -// SHA256_CTX_p ctx; -// uint32_t tmp32[72]; - -#if defined(__SHA__) - SHA256_CTX ctx; - SHA256_Init(&ctx); - SHA256_Update(&ctx, in, len); - SHA256_Final(digest, &ctx); -#else - SHA256_CTX_p ctx; - SHA256_Init_p(&ctx); - SHA256_Update_p(&ctx, in, len); - SHA256_Final_p(digest, &ctx); -#endif - - /* Clean the stack. */ -// insecure_memzero(&ctx, sizeof(SHA256_CTX)); -// insecure_memzero(tmp32, 288); -} - -/** - * HMAC_SHA256_Buf(K, Klen, in, len, digest): - * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of - * length ${Klen}, and write the result to ${digest}. - */ -void -HMAC_SHA256_Buf_p(const void * K, size_t Klen, const void * in, size_t len, - uint8_t digest[32]) -{ - HMAC_SHA256_CTX_p ctx; -// uint32_t tmp32[72]; -// uint8_t tmp8[96]; - - HMAC_SHA256_Init_p(&ctx, K, Klen); - HMAC_SHA256_Update_p(&ctx, in, len); - HMAC_SHA256_Final_p(digest, &ctx); - - /* Clean the stack. */ -// insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX)); -// insecure_memzero(tmp32, 288); -// insecure_memzero(tmp8, 96); -} - -/* Initialize an HMAC-SHA256 operation with the given key. */ -void -HMAC_SHA256_Init_p(HMAC_SHA256_CTX_p * ctx, const void * _K, size_t Klen) -{ - unsigned char pad[64]; - unsigned char khash[32]; - const unsigned char * K = _K; - size_t i; - - /* If Klen > 64, the key is really SHA256(K). 
*/ - if (Klen > 64) { -#if defined(__SHA__) - SHA256_Init(&ctx->ictx); - SHA256_Update(&ctx->ictx, K, Klen); - SHA256_Final(khash, &ctx->ictx); -#else - SHA256_Init_p(&ctx->ictx); - SHA256_Update_p(&ctx->ictx, K, Klen); - SHA256_Final_p(khash, &ctx->ictx); -#endif - K = khash; - Klen = 32; - } - - /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ -#if defined(__SHA__) - SHA256_Init(&ctx->ictx); -#else - SHA256_Init_p(&ctx->ictx); -#endif - memset(pad, 0x36, 64); - for (i = 0; i < Klen; i++) - pad[i] ^= K[i]; -#if defined(__SHA__) - SHA256_Update(&ctx->ictx, pad, 64); -#else - SHA256_Update_p(&ctx->ictx, pad, 64); -#endif - - /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ -#if defined(__SHA__) - SHA256_Init(&ctx->octx); -#else - SHA256_Init_p(&ctx->octx); -#endif - memset(pad, 0x5c, 64); - for (i = 0; i < Klen; i++) - pad[i] ^= K[i]; -#if defined(__SHA__) - SHA256_Update(&ctx->octx, pad, 64); -#else - SHA256_Update_p(&ctx->octx, pad, 64); -#endif - - /* Clean the stack. */ - //memset(khash, 0, 32); -} - -/* Add bytes to the HMAC-SHA256 operation. */ -void -HMAC_SHA256_Update_p(HMAC_SHA256_CTX_p * ctx, const void *in, size_t len) -{ - - /* Feed data to the inner SHA256 operation. */ -#if defined(__SHA__) - SHA256_Update(&ctx->ictx, in, len); -#else - SHA256_Update_p(&ctx->ictx, in, len); -#endif -} - -/* Finish an HMAC-SHA256 operation. */ -void -HMAC_SHA256_Final_p(unsigned char digest[32], HMAC_SHA256_CTX_p * ctx) -{ - unsigned char ihash[32]; - -#if defined(__SHA__) - /* Finish the inner SHA256 operation. */ - SHA256_Final(ihash, &ctx->ictx); - - /* Feed the inner hash to the outer SHA256 operation. */ - SHA256_Update(&ctx->octx, ihash, 32); - - /* Finish the outer SHA256 operation. */ - SHA256_Final(digest, &ctx->octx); -#else - /* Finish the inner SHA256 operation. */ - SHA256_Final_p(ihash, &ctx->ictx); - - /* Feed the inner hash to the outer SHA256 operation. */ - SHA256_Update_p(&ctx->octx, ihash, 32); - - /* Finish the outer SHA256 operation. */ - SHA256_Final_p(digest, &ctx->octx); -#endif - - /* Clean the stack. */ - //memset(ihash, 0, 32); -} - -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). - */ -void -PBKDF2_SHA256_p(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, - size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) -{ - HMAC_SHA256_CTX_p PShctx, hctx; - uint8_t _ALIGN(128) T[32]; - uint8_t _ALIGN(128) U[32]; - uint8_t ivec[4]; - size_t i, clen; - uint64_t j; - int k; - - /* Compute HMAC state after processing P and S. */ - HMAC_SHA256_Init_p(&PShctx, passwd, passwdlen); - HMAC_SHA256_Update_p(&PShctx, salt, saltlen); - - /* Iterate through the blocks. */ - for (i = 0; i * 32 < dkLen; i++) { - /* Generate INT(i + 1). */ - be32enc(ivec, (uint32_t)(i + 1)); - - /* Compute U_1 = PRF(P, S || INT(i)). */ - memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_p)); - HMAC_SHA256_Update_p(&hctx, ivec, 4); - HMAC_SHA256_Final_p(U, &hctx); - - /* T_i = U_1 ... */ - memcpy(T, U, 32); - - for (j = 2; j <= c; j++) { - /* Compute U_j. */ - HMAC_SHA256_Init_p(&hctx, passwd, passwdlen); - HMAC_SHA256_Update_p(&hctx, U, 32); - HMAC_SHA256_Final_p(U, &hctx); - - /* ... xor U_j ... */ - for (k = 0; k < 32; k++) - T[k] ^= U[k]; - } - - /* Copy as many bytes as necessary into buf. 
*/ - clen = dkLen - i * 32; - if (clen > 32) - clen = 32; - memcpy(&buf[i * 32], T, clen); - } - - /* Clean PShctx, since we never called _Final on it. */ - //memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y)); -} diff --git a/algo/yespower/sha256_p.h.sha b/algo/yespower/sha256_p.h similarity index 66% rename from algo/yespower/sha256_p.h.sha rename to algo/yespower/sha256_p.h index a10011c..2481caf 100644 --- a/algo/yespower/sha256_p.h.sha +++ b/algo/yespower/sha256_p.h @@ -33,45 +33,24 @@ #include #include -typedef struct SHA256Context { - uint32_t state[8]; - uint32_t count[2]; - unsigned char buf[64]; -} SHA256_CTX_p; - -/* typedef struct HMAC_SHA256Context { - SHA256_CTX_Y ictx; - SHA256_CTX_Y octx; -} HMAC_SHA256_CTX_Y; -*/ - -typedef struct HMAC_SHA256Context { -#if defined(__SHA__) SHA256_CTX ictx; SHA256_CTX octx; -#else - SHA256_CTX_p ictx; - SHA256_CTX_p octx; -#endif -} HMAC_SHA256_CTX_p; +} HMAC_SHA256_CTX; -void SHA256_Init_p(SHA256_CTX_p *); -void SHA256_Update_p(SHA256_CTX_p *, const void *, size_t); -void SHA256_Final_p(unsigned char [32], SHA256_CTX_p *); -void SHA256_Buf_p(const void * in, size_t len, uint8_t digest[32]); -void HMAC_SHA256_Init_p(HMAC_SHA256_CTX_p *, const void *, size_t); -void HMAC_SHA256_Update_p(HMAC_SHA256_CTX_p *, const void *, size_t); -void HMAC_SHA256_Final_p(unsigned char [32], HMAC_SHA256_CTX_p *); -void HMAC_SHA256_Buf_p(const void * K, size_t Klen, const void * in, - size_t len, uint8_t digest[32]); +void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] ); +void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t ); +void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t ); +void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * ); +void HMAC_SHA256_Buf( const void * K, size_t Klen, const void * in, + size_t len, uint8_t digest[32] ); /** * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). */ -void PBKDF2_SHA256_p(const uint8_t *, size_t, const uint8_t *, size_t, - uint64_t, uint8_t *, size_t); +void PBKDF2_SHA256( const uint8_t *, size_t, const uint8_t *, size_t, + uint64_t, uint8_t *, size_t); #endif /* !_SHA256_H_ */ diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index aa7c08f..b6f76ec 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -62,6 +62,7 @@ #warning "Note: building generic code for non-x86. That's OK." #endif */ + /* * The SSE4 code version has fewer instructions than the generic SSE2 version, * but all of the instructions are SIMD, thereby wasting the scalar execution @@ -96,7 +97,7 @@ #include #include "insecure_memzero.h" -#include "sha256.h" +#include "sha256_p.h" #include "sysendian.h" #include "yespower.h" @@ -528,7 +529,7 @@ static volatile uint64_t Smask2var = Smask2; /* 64-bit without AVX. This relies on out-of-order execution and register * renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g., * it runs great on Haswell. */ -//#warning "Note: using x86-64 inline assembly for pwxform. That's great." +#warning "Note: using x86-64 inline assembly for pwxform. That's great." 
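Note on the SHA256 plumbing replaced above: the renamed sha256_p.h now keeps two SHA256_CTX values (ictx/octx) directly in the HMAC context, assuming the SHA256_CTX in question is OpenSSL's, so the portable _p routines survive only as a fallback path. The sketch below is a minimal, self-contained restatement of the ipad/opad construction visible in the removed HMAC_SHA256_Init_p / HMAC_SHA256_Final_p code, written against OpenSSL's SHA256_Init/SHA256_Update/SHA256_Final; the name hmac_sha256_buf is illustrative only and is not part of the miner. PBKDF2_SHA256 above simply iterates this PRF once per 32-byte output block.

/* Minimal HMAC-SHA256 sketch (illustrative, not the project's code).
 * Mirrors the ipad (0x36) / opad (0x5c) scheme of the removed portable
 * implementation, but uses OpenSSL's SHA256 context throughout. */
#include <stdint.h>
#include <string.h>
#include <openssl/sha.h>

static void hmac_sha256_buf( const void *K, size_t Klen,
                             const void *in, size_t len, uint8_t digest[32] )
{
   unsigned char khash[32], ihash[32], pad[64];
   const unsigned char *key = K;
   SHA256_CTX ictx, octx;
   size_t i;

   /* Keys longer than one SHA256 block are first hashed down to 32 bytes. */
   if ( Klen > 64 )
   {
      SHA256_CTX t;
      SHA256_Init( &t );
      SHA256_Update( &t, key, Klen );
      SHA256_Final( khash, &t );
      key = khash;
      Klen = 32;
   }

   /* Inner hash: SHA256( (K xor ipad) || message ) */
   SHA256_Init( &ictx );
   memset( pad, 0x36, 64 );
   for ( i = 0; i < Klen; i++ ) pad[i] ^= key[i];
   SHA256_Update( &ictx, pad, 64 );
   SHA256_Update( &ictx, in, len );
   SHA256_Final( ihash, &ictx );

   /* Outer hash: SHA256( (K xor opad) || inner hash ) */
   SHA256_Init( &octx );
   memset( pad, 0x5c, 64 );
   for ( i = 0; i < Klen; i++ ) pad[i] ^= key[i];
   SHA256_Update( &octx, pad, 64 );
   SHA256_Update( &octx, ihash, 32 );
   SHA256_Final( digest, &octx );
}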
#undef MAYBE_MEMORY_BARRIER #define MAYBE_MEMORY_BARRIER \ __asm__("" : : : "memory"); diff --git a/algo/yespower/yespower-opt.c.sha b/algo/yespower/yespower-opt.c.sha deleted file mode 100644 index 8c5c571..0000000 --- a/algo/yespower/yespower-opt.c.sha +++ /dev/null @@ -1,1147 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2012-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - * - * This is a proof-of-work focused fork of yescrypt, including optimized and - * cut-down implementation of the obsolete yescrypt 0.5 (based off its first - * submission to PHC back in 2014) and a new proof-of-work specific variation - * known as yespower 1.0. The former is intended as an upgrade for - * cryptocurrencies that already use yescrypt 0.5 and the latter may be used - * as a further upgrade (hard fork) by those and other cryptocurrencies. The - * version of algorithm to use is requested through parameters, allowing for - * both algorithms to co-exist in client and miner implementations (such as in - * preparation for a hard-fork). - */ - -#ifndef _YESPOWER_OPT_C_PASS_ -#define _YESPOWER_OPT_C_PASS_ 1 -#endif - -#if _YESPOWER_OPT_C_PASS_ == 1 -/* - * AVX and especially XOP speed up Salsa20 a lot, but needlessly result in - * extra instruction prefixes for pwxform (which we make more use of). While - * no slowdown from the prefixes is generally observed on AMD CPUs supporting - * XOP, some slowdown is sometimes observed on Intel CPUs with AVX. - */ -#ifdef __XOP__ -#warning "Note: XOP is enabled. That's great." -#elif defined(__AVX__) -#warning "Note: AVX is enabled. That's OK." -#elif defined(__SSE2__) -#warning "Note: AVX and XOP are not enabled. That's OK." -#elif defined(__x86_64__) || defined(__i386__) -#warning "SSE2 not enabled. Expect poor performance." -#else -#warning "Note: building generic code for non-x86. That's OK." -#endif - -/* - * The SSE4 code version has fewer instructions than the generic SSE2 version, - * but all of the instructions are SIMD, thereby wasting the scalar execution - * units. 
Thus, the generic SSE2 version below actually runs faster on some - * CPUs due to its balanced mix of SIMD and scalar instructions. - */ -#undef USE_SSE4_FOR_32BIT - -#ifdef __SSE2__ -/* - * GCC before 4.9 would by default unnecessarily use store/load (without - * SSE4.1) or (V)PEXTR (with SSE4.1 or AVX) instead of simply (V)MOV. - * This was tracked as GCC bug 54349. - * "-mtune=corei7" works around this, but is only supported for GCC 4.6+. - * We use inline asm for pre-4.6 GCC, further down this file. - */ -#if __GNUC__ == 4 && __GNUC_MINOR__ >= 6 && __GNUC_MINOR__ < 9 && \ - !defined(__clang__) && !defined(__ICC) -#pragma GCC target ("tune=corei7") -#endif -#include -#ifdef __XOP__ -#include -#endif -#elif defined(__SSE__) -#include -#endif - -#include -#include -#include -#include - -#include "insecure_memzero.h" -#include "sha256_p.h" -#include "sysendian.h" - -#include "yespower.h" - -#include "yespower-platform.c" - -#if __STDC_VERSION__ >= 199901L -/* Have restrict */ -#elif defined(__GNUC__) -#define restrict __restrict -#else -#define restrict -#endif - -#ifdef __GNUC__ -#define unlikely(exp) __builtin_expect(exp, 0) -#else -#define unlikely(exp) (exp) -#endif - -#ifdef __SSE__ -#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint)); -#else -#undef PREFETCH -#endif - -typedef union { - uint32_t w[16]; - uint64_t d[8]; -#ifdef __SSE2__ - __m128i q[4]; -#endif -} salsa20_blk_t; - -static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin, - salsa20_blk_t *Bout) -{ -#define COMBINE(out, in1, in2) \ - Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); - COMBINE(0, 0, 2) - COMBINE(1, 5, 7) - COMBINE(2, 2, 4) - COMBINE(3, 7, 1) - COMBINE(4, 4, 6) - COMBINE(5, 1, 3) - COMBINE(6, 6, 0) - COMBINE(7, 3, 5) -#undef COMBINE -} - -static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, - salsa20_blk_t *Bout) -{ -#define UNCOMBINE(out, in1, in2) \ - Bout->w[out * 2] = Bin->d[in1]; \ - Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; - UNCOMBINE(0, 0, 6) - UNCOMBINE(1, 5, 3) - UNCOMBINE(2, 2, 0) - UNCOMBINE(3, 7, 5) - UNCOMBINE(4, 4, 2) - UNCOMBINE(5, 1, 7) - UNCOMBINE(6, 6, 4) - UNCOMBINE(7, 3, 1) -#undef UNCOMBINE -} - -#ifdef __SSE2__ -#define DECL_X \ - __m128i X0, X1, X2, X3; -#define DECL_Y \ - __m128i Y0, Y1, Y2, Y3; -#define READ_X(in) \ - X0 = (in).q[0]; X1 = (in).q[1]; X2 = (in).q[2]; X3 = (in).q[3]; -#define WRITE_X(out) \ - (out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3; - -#ifdef __XOP__ -#define ARX(out, in1, in2, s) \ - out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); -#else -#define ARX(out, in1, in2, s) { \ - __m128i tmp = _mm_add_epi32(in1, in2); \ - out = _mm_xor_si128(out, _mm_slli_epi32(tmp, s)); \ - out = _mm_xor_si128(out, _mm_srli_epi32(tmp, 32 - s)); \ -} -#endif - -#define SALSA20_2ROUNDS \ - /* Operate on "columns" */ \ - ARX(X1, X0, X3, 7) \ - ARX(X2, X1, X0, 9) \ - ARX(X3, X2, X1, 13) \ - ARX(X0, X3, X2, 18) \ - /* Rearrange data */ \ - X1 = _mm_shuffle_epi32(X1, 0x93); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x39); \ - /* Operate on "rows" */ \ - ARX(X3, X0, X1, 7) \ - ARX(X2, X3, X0, 9) \ - ARX(X1, X2, X3, 13) \ - ARX(X0, X1, X2, 18) \ - /* Rearrange data */ \ - X1 = _mm_shuffle_epi32(X1, 0x39); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x93); - -/** - * Apply the Salsa20 core to the block provided in (X0 ... X3). 
- */ -#define SALSA20_wrapper(out, rounds) { \ - __m128i Z0 = X0, Z1 = X1, Z2 = X2, Z3 = X3; \ - rounds \ - (out).q[0] = X0 = _mm_add_epi32(X0, Z0); \ - (out).q[1] = X1 = _mm_add_epi32(X1, Z1); \ - (out).q[2] = X2 = _mm_add_epi32(X2, Z2); \ - (out).q[3] = X3 = _mm_add_epi32(X3, Z3); \ -} - -/** - * Apply the Salsa20/2 core to the block provided in X. - */ -#define SALSA20_2(out) \ - SALSA20_wrapper(out, SALSA20_2ROUNDS) - -#define SALSA20_8ROUNDS \ - SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS - -/** - * Apply the Salsa20/8 core to the block provided in X. - */ -#define SALSA20_8(out) \ - SALSA20_wrapper(out, SALSA20_8ROUNDS) - -#define XOR_X(in) \ - X0 = _mm_xor_si128(X0, (in).q[0]); \ - X1 = _mm_xor_si128(X1, (in).q[1]); \ - X2 = _mm_xor_si128(X2, (in).q[2]); \ - X3 = _mm_xor_si128(X3, (in).q[3]); - -#define XOR_X_2(in1, in2) \ - X0 = _mm_xor_si128((in1).q[0], (in2).q[0]); \ - X1 = _mm_xor_si128((in1).q[1], (in2).q[1]); \ - X2 = _mm_xor_si128((in1).q[2], (in2).q[2]); \ - X3 = _mm_xor_si128((in1).q[3], (in2).q[3]); - -#define XOR_X_WRITE_XOR_Y_2(out, in) \ - (out).q[0] = Y0 = _mm_xor_si128((out).q[0], (in).q[0]); \ - (out).q[1] = Y1 = _mm_xor_si128((out).q[1], (in).q[1]); \ - (out).q[2] = Y2 = _mm_xor_si128((out).q[2], (in).q[2]); \ - (out).q[3] = Y3 = _mm_xor_si128((out).q[3], (in).q[3]); \ - X0 = _mm_xor_si128(X0, Y0); \ - X1 = _mm_xor_si128(X1, Y1); \ - X2 = _mm_xor_si128(X2, Y2); \ - X3 = _mm_xor_si128(X3, Y3); - -#define INTEGERIFY _mm_cvtsi128_si32(X0) - -#else /* !defined(__SSE2__) */ - -#define DECL_X \ - salsa20_blk_t X; -#define DECL_Y \ - salsa20_blk_t Y; - -#define COPY(out, in) \ - (out).d[0] = (in).d[0]; \ - (out).d[1] = (in).d[1]; \ - (out).d[2] = (in).d[2]; \ - (out).d[3] = (in).d[3]; \ - (out).d[4] = (in).d[4]; \ - (out).d[5] = (in).d[5]; \ - (out).d[6] = (in).d[6]; \ - (out).d[7] = (in).d[7]; - -#define READ_X(in) COPY(X, in) -#define WRITE_X(out) COPY(out, X) - -/** - * salsa20(B): - * Apply the Salsa20 core to the provided block. 
- */ -static inline void salsa20(salsa20_blk_t *restrict B, - salsa20_blk_t *restrict Bout, uint32_t doublerounds) -{ - salsa20_blk_t X; -#define x X.w - - salsa20_simd_unshuffle(B, &X); - - do { -#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns */ - x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); - x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); - - x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); - x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); - - x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); - x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); - - x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); - x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); - - /* Operate on rows */ - x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); - x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); - - x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); - x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); - - x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); - x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); - - x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); - x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); -#undef R - } while (--doublerounds); -#undef x - - { - uint32_t i; - salsa20_simd_shuffle(&X, Bout); - for (i = 0; i < 16; i += 4) { - B->w[i] = Bout->w[i] += B->w[i]; - B->w[i + 1] = Bout->w[i + 1] += B->w[i + 1]; - B->w[i + 2] = Bout->w[i + 2] += B->w[i + 2]; - B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3]; - } - } -} - -/** - * Apply the Salsa20/2 core to the block provided in X. - */ -#define SALSA20_2(out) \ - salsa20(&X, &out, 1); - -/** - * Apply the Salsa20/8 core to the block provided in X. - */ -#define SALSA20_8(out) \ - salsa20(&X, &out, 4); - -#define XOR(out, in1, in2) \ - (out).d[0] = (in1).d[0] ^ (in2).d[0]; \ - (out).d[1] = (in1).d[1] ^ (in2).d[1]; \ - (out).d[2] = (in1).d[2] ^ (in2).d[2]; \ - (out).d[3] = (in1).d[3] ^ (in2).d[3]; \ - (out).d[4] = (in1).d[4] ^ (in2).d[4]; \ - (out).d[5] = (in1).d[5] ^ (in2).d[5]; \ - (out).d[6] = (in1).d[6] ^ (in2).d[6]; \ - (out).d[7] = (in1).d[7] ^ (in2).d[7]; - -#define XOR_X(in) XOR(X, X, in) -#define XOR_X_2(in1, in2) XOR(X, in1, in2) -#define XOR_X_WRITE_XOR_Y_2(out, in) \ - XOR(Y, out, in) \ - COPY(out, Y) \ - XOR(X, X, Y) - -#define INTEGERIFY (uint32_t)X.d[0] -#endif - -/** - * Apply the Salsa20 core to the block provided in X ^ in. - */ -#define SALSA20_XOR_MEM(in, out) \ - XOR_X(in) \ - SALSA20(out) - -#define SALSA20 SALSA20_8 -#else /* pass 2 */ -#undef SALSA20 -#define SALSA20 SALSA20_2 -#endif - -/** - * blockmix_salsa(Bin, Bout): - * Compute Bout = BlockMix_{salsa20, 1}(Bin). The input Bin must be 128 - * bytes in length; the output Bout must also be the same size. 
- */ -static inline void blockmix_salsa(const salsa20_blk_t *restrict Bin, - salsa20_blk_t *restrict Bout) -{ - DECL_X - - READ_X(Bin[1]) - SALSA20_XOR_MEM(Bin[0], Bout[0]) - SALSA20_XOR_MEM(Bin[1], Bout[1]) -} - -static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout) -{ - DECL_X - - XOR_X_2(Bin1[1], Bin2[1]) - XOR_X(Bin1[0]) - SALSA20_XOR_MEM(Bin2[0], Bout[0]) - XOR_X(Bin1[1]) - SALSA20_XOR_MEM(Bin2[1], Bout[1]) - - return INTEGERIFY; -} - -#if _YESPOWER_OPT_C_PASS_ == 1 -/* This is tunable, but it is part of what defines a yespower version */ -/* Version 0.5 */ -#define Swidth_0_5 8 -/* Version 1.0 */ -#define Swidth_1_0 11 - -/* Not tunable in this implementation, hard-coded in a few places */ -#define PWXsimple 2 -#define PWXgather 4 - -/* Derived value. Not tunable on its own. */ -#define PWXbytes (PWXgather * PWXsimple * 8) - -/* (Maybe-)runtime derived values. Not tunable on their own. */ -#define Swidth_to_Sbytes1(Swidth) ((1 << (Swidth)) * PWXsimple * 8) -#define Swidth_to_Smask(Swidth) (((1 << (Swidth)) - 1) * PWXsimple * 8) -#define Smask_to_Smask2(Smask) (((uint64_t)(Smask) << 32) | (Smask)) - -/* These should be compile-time derived */ -#define Smask2_0_5 Smask_to_Smask2(Swidth_to_Smask(Swidth_0_5)) -#define Smask2_1_0 Smask_to_Smask2(Swidth_to_Smask(Swidth_1_0)) - -typedef struct { - uint8_t *S0, *S1, *S2; - size_t w; - uint32_t Sbytes; -} pwxform_ctx_t; - -#define DECL_SMASK2REG /* empty */ -#define MAYBE_MEMORY_BARRIER /* empty */ - -#ifdef __SSE2__ -/* - * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs - * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and - * destination registers, whereas the shifts would require an extra move - * instruction for our code when building without AVX. Unfortunately, PSHUFD - * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) - * and somewhat slower on some non-Intel CPUs (luckily not including AMD - * Bulldozer and Piledriver). - */ -#ifdef __AVX__ -#define HI32(X) \ - _mm_srli_si128((X), 4) -#elif 1 /* As an option, check for __SSE4_1__ here not to hurt Conroe */ -#define HI32(X) \ - _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) -#else -#define HI32(X) \ - _mm_srli_epi64((X), 32) -#endif - -#if defined(__x86_64__) && \ - __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC) -#ifdef __AVX__ -#define MOVQ "vmovq" -#else -/* "movq" would be more correct, but "movd" is supported by older binutils - * due to an error in AMD's spec for x86-64. 
*/ -#define MOVQ "movd" -#endif -#define EXTRACT64(X) ({ \ - uint64_t result; \ - __asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \ - result; \ -}) -#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) -/* MSVC and Open64 had bugs */ -#define EXTRACT64(X) _mm_cvtsi128_si64(X) -#elif defined(__x86_64__) && defined(__SSE4_1__) -/* No known bugs for this intrinsic */ -#include -#define EXTRACT64(X) _mm_extract_epi64((X), 0) -#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) -/* 32-bit */ -#include -#if 0 -/* This is currently unused by the code below, which instead uses these two - * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) -#endif -#else -/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) -#endif - -#if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__)) -/* 64-bit with AVX */ -/* Force use of 64-bit AND instead of two 32-bit ANDs */ -#undef DECL_SMASK2REG -#if defined(__GNUC__) && !defined(__ICC) -#define DECL_SMASK2REG uint64_t Smask2reg = Smask2; -/* Force use of lower-numbered registers to reduce number of prefixes, relying - * on out-of-order execution and register renaming. */ -#define FORCE_REGALLOC_1 \ - __asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1)); -#define FORCE_REGALLOC_2 \ - __asm__("" : : "c" (lo)); -#else -static volatile uint64_t Smask2var = Smask2; -#define DECL_SMASK2REG uint64_t Smask2reg = Smask2var; -#define FORCE_REGALLOC_1 /* empty */ -#define FORCE_REGALLOC_2 /* empty */ -#endif -#define PWXFORM_SIMD(X) { \ - uint64_t x; \ - FORCE_REGALLOC_1 \ - uint32_t lo = x = EXTRACT64(X) & Smask2reg; \ - FORCE_REGALLOC_2 \ - uint32_t hi = x >> 32; \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \ - X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \ -} -#elif defined(__x86_64__) -/* 64-bit without AVX. This relies on out-of-order execution and register - * renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g., - * it runs great on Haswell. */ -#warning "Note: using x86-64 inline assembly for pwxform. That's great." 
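For readers mapping the x86-64 inline assembly below back to the algorithm: each pwxform lane masks the low 64 bits of the lane value to form two S-box offsets, multiplies the high and low 32-bit halves of each 64-bit word, then adds the selected S0 entry and XORs the selected S1 entry. A scalar restatement is sketched here; it is consistent with the generic non-SSE2 PWXFORM_SIMD macro further down this file, and the name pwxform_lane is illustrative only.

/* One pwxform lane in plain C (sketch; the macros below are authoritative). */
static inline void pwxform_lane( uint64_t *x0, uint64_t *x1,
                                 const uint8_t *S0, const uint8_t *S1,
                                 uint64_t Smask2 )
{
   /* Low 32 bits of the masked value index S0, high 32 bits index S1. */
   uint64_t x = *x0 & Smask2;
   const uint64_t *p0 = (const uint64_t *)( S0 + (uint32_t)x );
   const uint64_t *p1 = (const uint64_t *)( S1 + (x >> 32) );

   /* 32x32->64 multiply, add the S0 word, XOR the S1 word, per 64-bit word. */
   *x0 = ( (*x0 >> 32) * (uint32_t)(*x0) + p0[0] ) ^ p1[0];
   *x1 = ( (*x1 >> 32) * (uint32_t)(*x1) + p0[1] ) ^ p1[1];
}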
-#undef MAYBE_MEMORY_BARRIER -#define MAYBE_MEMORY_BARRIER \ - __asm__("" : : : "memory"); -#define PWXFORM_SIMD(X) { \ - __m128i H; \ - __asm__( \ - "movd %0, %%rax\n\t" \ - "pshufd $0xb1, %0, %1\n\t" \ - "andq %2, %%rax\n\t" \ - "pmuludq %1, %0\n\t" \ - "movl %%eax, %%ecx\n\t" \ - "shrq $0x20, %%rax\n\t" \ - "paddq (%3,%%rcx), %0\n\t" \ - "pxor (%4,%%rax), %0\n\t" \ - : "+x" (X), "=x" (H) \ - : "d" (Smask2), "S" (S0), "D" (S1) \ - : "cc", "ax", "cx"); \ -} -#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) -/* 32-bit with SSE4.1 */ -#define PWXFORM_SIMD(X) { \ - __m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \ - __m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ - __m128i s1 = *(__m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); \ -} -#else -/* 32-bit without SSE4.1 */ -#define PWXFORM_SIMD(X) { \ - uint64_t x = EXTRACT64(X) & Smask2; \ - __m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \ - __m128i s1 = *(__m128i *)(S1 + (x >> 32)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); \ -} -#endif - -#define PWXFORM_SIMD_WRITE(X, Sw) \ - PWXFORM_SIMD(X) \ - MAYBE_MEMORY_BARRIER \ - *(__m128i *)(Sw + w) = X; \ - MAYBE_MEMORY_BARRIER - -#define PWXFORM_ROUND \ - PWXFORM_SIMD(X0) \ - PWXFORM_SIMD(X1) \ - PWXFORM_SIMD(X2) \ - PWXFORM_SIMD(X3) - -#define PWXFORM_ROUND_WRITE4 \ - PWXFORM_SIMD_WRITE(X0, S0) \ - PWXFORM_SIMD_WRITE(X1, S1) \ - w += 16; \ - PWXFORM_SIMD_WRITE(X2, S0) \ - PWXFORM_SIMD_WRITE(X3, S1) \ - w += 16; - -#define PWXFORM_ROUND_WRITE2 \ - PWXFORM_SIMD_WRITE(X0, S0) \ - PWXFORM_SIMD_WRITE(X1, S1) \ - w += 16; \ - PWXFORM_SIMD(X2) \ - PWXFORM_SIMD(X3) - -#else /* !defined(__SSE2__) */ - -#define PWXFORM_SIMD(x0, x1) { \ - uint64_t x = x0 & Smask2; \ - uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \ - uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \ - x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \ - x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \ -} - -#define PWXFORM_SIMD_WRITE(x0, x1, Sw) \ - PWXFORM_SIMD(x0, x1) \ - ((uint64_t *)(Sw + w))[0] = x0; \ - ((uint64_t *)(Sw + w))[1] = x1; - -#define PWXFORM_ROUND \ - PWXFORM_SIMD(X.d[0], X.d[1]) \ - PWXFORM_SIMD(X.d[2], X.d[3]) \ - PWXFORM_SIMD(X.d[4], X.d[5]) \ - PWXFORM_SIMD(X.d[6], X.d[7]) - -#define PWXFORM_ROUND_WRITE4 \ - PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \ - PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \ - w += 16; \ - PWXFORM_SIMD_WRITE(X.d[4], X.d[5], S0) \ - PWXFORM_SIMD_WRITE(X.d[6], X.d[7], S1) \ - w += 16; - -#define PWXFORM_ROUND_WRITE2 \ - PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \ - PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \ - w += 16; \ - PWXFORM_SIMD(X.d[4], X.d[5]) \ - PWXFORM_SIMD(X.d[6], X.d[7]) -#endif - -#define PWXFORM \ - PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND - -#define Smask2 Smask2_0_5 - -#else /* pass 2 */ - -#undef PWXFORM -#define PWXFORM \ - PWXFORM_ROUND_WRITE4 PWXFORM_ROUND_WRITE2 PWXFORM_ROUND_WRITE2 \ - w &= Smask2; \ - { \ - uint8_t *Stmp = S2; \ - S2 = S1; \ - S1 = S0; \ - S0 = Stmp; \ - } - -#undef Smask2 -#define Smask2 Smask2_1_0 - -#endif - -/** - * blockmix_pwxform(Bin, Bout, r, S): - * Compute Bout = BlockMix_pwxform{salsa20, r, S}(Bin). The input Bin must - * be 128r bytes in length; the output Bout must also be the same size. 
- */ -static void blockmix(const salsa20_blk_t *restrict Bin, - salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx) -{ - if (unlikely(!ctx)) { - blockmix_salsa(Bin, Bout); - return; - } - - uint8_t *S0 = ctx->S0, *S1 = ctx->S1; -#if _YESPOWER_OPT_C_PASS_ > 1 - uint8_t *S2 = ctx->S2; - size_t w = ctx->w; -#endif - size_t i; - DECL_X - - /* Convert count of 128-byte blocks to max index of 64-byte block */ - r = r * 2 - 1; - - READ_X(Bin[r]) - - DECL_SMASK2REG - - i = 0; - do { - XOR_X(Bin[i]) - PWXFORM - if (unlikely(i >= r)) - break; - WRITE_X(Bout[i]) - i++; - } while (1); - -#if _YESPOWER_OPT_C_PASS_ > 1 - ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; - ctx->w = w; -#endif - - SALSA20(Bout[i]) -} - -static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, pwxform_ctx_t *restrict ctx) -{ - if (unlikely(!ctx)) - return blockmix_salsa_xor(Bin1, Bin2, Bout); - - uint8_t *S0 = ctx->S0, *S1 = ctx->S1; -#if _YESPOWER_OPT_C_PASS_ > 1 - uint8_t *S2 = ctx->S2; - size_t w = ctx->w; -#endif - size_t i; - DECL_X - - /* Convert count of 128-byte blocks to max index of 64-byte block */ - r = r * 2 - 1; - -#ifdef PREFETCH - PREFETCH(&Bin2[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - } -#endif - - XOR_X_2(Bin1[r], Bin2[r]) - - DECL_SMASK2REG - - i = 0; - r--; - do { - XOR_X(Bin1[i]) - XOR_X(Bin2[i]) - PWXFORM - WRITE_X(Bout[i]) - - XOR_X(Bin1[i + 1]) - XOR_X(Bin2[i + 1]) - PWXFORM - - if (unlikely(i >= r)) - break; - - WRITE_X(Bout[i + 1]) - - i += 2; - } while (1); - i++; - -#if _YESPOWER_OPT_C_PASS_ > 1 - ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; - ctx->w = w; -#endif - - SALSA20(Bout[i]) - - return INTEGERIFY; -} - -static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out, - salsa20_blk_t *restrict Bin2, - size_t r, pwxform_ctx_t *restrict ctx) -{ - uint8_t *S0 = ctx->S0, *S1 = ctx->S1; -#if _YESPOWER_OPT_C_PASS_ > 1 - uint8_t *S2 = ctx->S2; - size_t w = ctx->w; -#endif - size_t i; - DECL_X - DECL_Y - - /* Convert count of 128-byte blocks to max index of 64-byte block */ - r = r * 2 - 1; - -#ifdef PREFETCH - PREFETCH(&Bin2[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - } -#endif - - XOR_X_2(Bin1out[r], Bin2[r]) - - DECL_SMASK2REG - - i = 0; - r--; - do { - XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]) - PWXFORM - WRITE_X(Bin1out[i]) - - XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1]) - PWXFORM - - if (unlikely(i >= r)) - break; - - WRITE_X(Bin1out[i + 1]) - - i += 2; - } while (1); - i++; - -#if _YESPOWER_OPT_C_PASS_ > 1 - ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; - ctx->w = w; -#endif - - SALSA20(Bin1out[i]) - - return INTEGERIFY; -} - -#if _YESPOWER_OPT_C_PASS_ == 1 -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static inline uint32_t integerify(const salsa20_blk_t *B, size_t r) -{ -/* - * Our 64-bit words are in host byte order, which is why we don't just read - * w[0] here (would be wrong on big-endian). Also, our 32-bit words are - * SIMD-shuffled, but we only care about the least significant 32 bits anyway. - */ - return (uint32_t)B[2 * r - 1].d[0]; -} -#endif - -/** - * smix1(B, r, N, V, XY, S): - * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 128r+64 bytes in length. N must be even and at least 4. 
- * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY - * to a multiple of at least 16 bytes. - */ -static void smix1(uint8_t *B, size_t r, uint32_t N, - salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) -{ - size_t s = 2 * r; - salsa20_blk_t *X = V, *Y = &V[s], *V_j; - uint32_t i, j, n; - -#if _YESPOWER_OPT_C_PASS_ == 1 - for (i = 0; i < 2 * r; i++) { -#else - for (i = 0; i < 2; i++) { -#endif - const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; - salsa20_blk_t *tmp = Y; - salsa20_blk_t *dst = &X[i]; - size_t k; - for (k = 0; k < 16; k++) - tmp->w[k] = le32dec(&src->w[k]); - salsa20_simd_shuffle(tmp, dst); - } - -#if _YESPOWER_OPT_C_PASS_ > 1 - for (i = 1; i < r; i++) - blockmix(&X[(i - 1) * 2], &X[i * 2], 1, ctx); -#endif - - blockmix(X, Y, r, ctx); - X = Y + s; - blockmix(Y, X, r, ctx); - j = integerify(X, r); - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? n : (N - 1 - n); - for (i = 1; i < m; i += 2) { - Y = X + s; - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - j = blockmix_xor(X, V_j, Y, r, ctx); - j &= n - 1; - j += i; - V_j = &V[j * s]; - X = Y + s; - j = blockmix_xor(Y, V_j, X, r, ctx); - } - } - n >>= 1; - - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - Y = X + s; - j = blockmix_xor(X, V_j, Y, r, ctx); - j &= n - 1; - j += N - 1 - n; - V_j = &V[j * s]; - blockmix_xor(Y, V_j, XY, r, ctx); - - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = &XY[i]; - salsa20_blk_t *tmp = &XY[s]; - salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; - size_t k; - for (k = 0; k < 16; k++) - le32enc(&tmp->w[k], src->w[k]); - salsa20_simd_unshuffle(tmp, dst); - } -} - -/** - * smix2(B, r, N, Nloop, V, XY, S): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r bytes in length. N must be a power of 2 and at - * least 2. Nloop must be even. The array V must be aligned to a multiple of - * 64 bytes, and arrays B and XY to a multiple of at least 16 bytes. - */ -static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop, - salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) -{ - size_t s = 2 * r; - salsa20_blk_t *X = XY, *Y = &XY[s]; - uint32_t i, j; - - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; - salsa20_blk_t *tmp = Y; - salsa20_blk_t *dst = &X[i]; - size_t k; - for (k = 0; k < 16; k++) - tmp->w[k] = le32dec(&src->w[k]); - salsa20_simd_shuffle(tmp, dst); - } - - j = integerify(X, r) & (N - 1); - -#if _YESPOWER_OPT_C_PASS_ == 1 - if (Nloop > 2) { -#endif - do { - salsa20_blk_t *V_j = &V[j * s]; - j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); - V_j = &V[j * s]; - j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); - } while (Nloop -= 2); -#if _YESPOWER_OPT_C_PASS_ == 1 - } else { - do { - const salsa20_blk_t * V_j = &V[j * s]; - j = blockmix_xor(X, V_j, Y, r, ctx) & (N - 1); - V_j = &V[j * s]; - j = blockmix_xor(Y, V_j, X, r, ctx) & (N - 1); - } while (Nloop -= 2); - } -#endif - - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = &X[i]; - salsa20_blk_t *tmp = Y; - salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; - size_t k; - for (k = 0; k < 16; k++) - le32enc(&tmp->w[k], src->w[k]); - salsa20_simd_unshuffle(tmp, dst); - } -} - -/** - * smix(B, r, N, V, XY, S): - * Compute B = SMix_r(B, N). 
The input B must be 128rp bytes in length; the - * temporary storage V must be 128rN bytes in length; the temporary storage - * XY must be 256r bytes in length. N must be a power of 2 and at least 16. - * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY - * to a multiple of at least 16 bytes (aligning them to 64 bytes as well saves - * cache lines, but it might also result in cache bank conflicts). - */ -static void smix(uint8_t *B, size_t r, uint32_t N, - salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) -{ -#if _YESPOWER_OPT_C_PASS_ == 1 - uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */ - uint32_t Nloop_rw = Nloop_all; - - Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */ - Nloop_rw &= ~(uint32_t)1; /* round down to even */ -#else - uint32_t Nloop_rw = (N + 2) / 3; /* 1/3, round up */ - Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */ -#endif - - smix1(B, 1, ctx->Sbytes / 128, (salsa20_blk_t *)ctx->S0, XY, NULL); - smix1(B, r, N, V, XY, ctx); - smix2(B, r, N, Nloop_rw /* must be > 2 */, V, XY, ctx); -#if _YESPOWER_OPT_C_PASS_ == 1 - if (Nloop_all > Nloop_rw) - smix2(B, r, N, 2, V, XY, ctx); -#endif -} - -#if _YESPOWER_OPT_C_PASS_ == 1 -#undef _YESPOWER_OPT_C_PASS_ -#define _YESPOWER_OPT_C_PASS_ 2 -#define blockmix_salsa blockmix_salsa_1_0 -#define blockmix_salsa_xor blockmix_salsa_xor_1_0 -#define blockmix blockmix_1_0 -#define blockmix_xor blockmix_xor_1_0 -#define blockmix_xor_save blockmix_xor_save_1_0 -#define smix1 smix1_1_0 -#define smix2 smix2_1_0 -#define smix smix_1_0 -#include "yespower-opt.c" -#undef smix - -/** - * yespower(local, src, srclen, params, dst): - * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". - * local is the thread-local data structure, allowing to preserve and reuse a - * memory allocation across calls, thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. 
- */ -int yespower(yespower_local_t *local, - const uint8_t *src, size_t srclen, - const yespower_params_t *params, - yespower_binary_t *dst) -{ - yespower_version_t version = params->version; - uint32_t N = params->N; - uint32_t r = params->r; - const uint8_t *pers = params->pers; - size_t perslen = params->perslen; - uint32_t Swidth; - size_t B_size, V_size, XY_size, need; - uint8_t *B, *S; - salsa20_blk_t *V, *XY; - pwxform_ctx_t ctx; - uint8_t sha256[32]; - - /* Sanity-check parameters */ - if ((version != YESPOWER_0_5 && version != YESPOWER_1_0) || - N < 1024 || N > 512 * 1024 || r < 8 || r > 32 || - (N & (N - 1)) != 0 || - (!pers && perslen)) { - errno = EINVAL; - return -1; - } - - /* Allocate memory */ - B_size = (size_t)128 * r; - V_size = B_size * N; - if (version == YESPOWER_0_5) { - XY_size = B_size * 2; - Swidth = Swidth_0_5; - ctx.Sbytes = 2 * Swidth_to_Sbytes1(Swidth); - } else { - XY_size = B_size + 64; - Swidth = Swidth_1_0; - ctx.Sbytes = 3 * Swidth_to_Sbytes1(Swidth); - } - need = B_size + V_size + XY_size + ctx.Sbytes; - if (local->aligned_size < need) { - if (free_region(local)) - return -1; - if (!alloc_region(local, need)) - return -1; - } - B = (uint8_t *)local->aligned; - V = (salsa20_blk_t *)((uint8_t *)B + B_size); - XY = (salsa20_blk_t *)((uint8_t *)V + V_size); - S = (uint8_t *)XY + XY_size; - ctx.S0 = S; - ctx.S1 = S + Swidth_to_Sbytes1(Swidth); - - SHA256_Buf_p(src, srclen, sha256); - - if (version == YESPOWER_0_5) { - PBKDF2_SHA256_p(sha256, sizeof(sha256), src, srclen, 1, - B, B_size); - memcpy(sha256, B, sizeof(sha256)); - smix(B, r, N, V, XY, &ctx); - PBKDF2_SHA256_p(sha256, sizeof(sha256), B, B_size, 1, - (uint8_t *)dst, sizeof(*dst)); - - if (pers) { - HMAC_SHA256_Buf_p(dst, sizeof(*dst), pers, perslen, - sha256); - SHA256_Buf_p(sha256, sizeof(sha256), (uint8_t *)dst); - } - } else { - ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth); - ctx.w = 0; - - if (pers) { - src = pers; - srclen = perslen; - } else { - srclen = 0; - } - - PBKDF2_SHA256_p(sha256, sizeof(sha256), src, srclen, 1, B, 128); - memcpy(sha256, B, sizeof(sha256)); - smix_1_0(B, r, N, V, XY, &ctx); - HMAC_SHA256_Buf_p(B + B_size - 64, 64, - sha256, sizeof(sha256), (uint8_t *)dst); - } - - /* Success! */ - return 0; -} - -/** - * yespower_tls(src, srclen, params, dst): - * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". - * The memory allocation is maintained internally using thread-local storage. - * - * Return 0 on success; or -1 on error. 
- */ -int yespower_tls(const uint8_t *src, size_t srclen, - const yespower_params_t *params, yespower_binary_t *dst) -{ - static __thread int initialized = 0; - static __thread yespower_local_t local; - - if (!initialized) { - if (yespower_init_local(&local)) - return -1; - initialized = 1; - } - - return yespower(&local, src, srclen, params, dst); -} - -int yespower_init_local(yespower_local_t *local) -{ - init_region(local); - return 0; -} - -int yespower_free_local(yespower_local_t *local) -{ - return free_region(local); -} -#endif diff --git a/algo/yespower/yespower-ref.c b/algo/yespower/yespower-ref.c index 29c03d2..bec75c5 100644 --- a/algo/yespower/yespower-ref.c +++ b/algo/yespower/yespower-ref.c @@ -51,7 +51,7 @@ #include #include -#include "sha256.h" +#include "sha256_p.h" #include "sysendian.h" #include "yespower.h" @@ -534,11 +534,12 @@ int yespower(yespower_local_t *local, if (pers) { HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen, + return true; (uint8_t *)sha256); SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst); } } else { - HMAC_SHA256_Buf((uint8_t *)B + B_size - 64, 64, + HMAC_SHA256_Buf_P((uint8_t *)B + B_size - 64, 64, sha256, sizeof(sha256), (uint8_t *)dst); } diff --git a/algo/yespower/yespower.c b/algo/yespower/yespower.c index e676d81..8b700a4 100644 --- a/algo/yespower/yespower.c +++ b/algo/yespower/yespower.c @@ -38,7 +38,7 @@ void yespower_hash( const char *input, char *output, uint32_t len ) } int scanhash_yespower( int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t _ALIGN(64) vhash[8]; uint32_t _ALIGN(64) endiandata[20]; @@ -48,6 +48,7 @@ int scanhash_yespower( int thr_id, struct work *work, uint32_t max_nonce, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; + /* int */ thr_id = mythr->id; // thr_id arg is deprecated for (int k = 0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); diff --git a/avxdefs.h b/avxdefs.h index 953c649..fb618f5 100644 --- a/avxdefs.h +++ b/avxdefs.h @@ -662,57 +662,57 @@ do { \ #define mm128_ror1x64_256( v1, v2 ) \ do { \ - __m128i t = _mm_srli_si128( v1, 8 ) | _mm_slli_si128( v2, 24 ); \ - v2 = _mm_srli_si128( v2, 8 ) | _mm_slli_si128( v1, 24 ); \ + __m128i t = _mm_srli_si128( v1, 8 ) | _mm_slli_si128( v2, 8 ); \ + v2 = _mm_srli_si128( v2, 8 ) | _mm_slli_si128( v1, 8 ); \ v1 = t; \ } while(0) #define mm128_rol1x64_256( v1, v2 ) \ do { \ - __m128i t = _mm_slli_si128( v1, 8 ) | _mm_srli_si128( v2, 24 ); \ - v2 = _mm_slli_si128( v2, 8 ) | _mm_srli_si128( v1, 24 ); \ + __m128i t = _mm_slli_si128( v1, 8 ) | _mm_srli_si128( v2, 8 ); \ + v2 = _mm_slli_si128( v2, 8 ) | _mm_srli_si128( v1, 8 ); \ v1 = t; \ } while(0) #define mm128_ror1x32_256( v1, v2 ) \ do { \ - __m128i t = _mm_srli_si128( v1, 4 ) | _mm_slli_si128( v2, 28 ); \ - v2 = _mm_srli_si128( v2, 4 ) | _mm_slli_si128( v1, 28 ); \ + __m128i t = _mm_srli_si128( v1, 4 ) | _mm_slli_si128( v2, 12 ); \ + v2 = _mm_srli_si128( v2, 4 ) | _mm_slli_si128( v1, 12 ); \ v1 = t; \ } while(0) #define mm128_rol1x32_256( v1, v2 ) \ do { \ - __m128i t = _mm_slli_si128( v1, 4 ) | _mm_srli_si128( v2, 28 ); \ - v2 = _mm_slli_si128( v2, 4 ) | _mm_srli_si128( v1, 28 ); \ + __m128i t = _mm_slli_si128( v1, 4 ) | _mm_srli_si128( v2, 12 ); \ + v2 = _mm_slli_si128( v2, 4 ) | _mm_srli_si128( v1, 12 ); \ v1 = t; \ } while(0) #define mm128_ror1x16_256( v1, v2 ) \ do { \ - __m128i t = _mm_srli_si128( v1, 2 ) | _mm_slli_si128( v2, 30 ); \ - v2 = _mm_srli_si128( v2, 2 ) | 
_mm_slli_si128( v1, 30 ); \ + __m128i t = _mm_srli_si128( v1, 2 ) | _mm_slli_si128( v2, 14 ); \ + v2 = _mm_srli_si128( v2, 2 ) | _mm_slli_si128( v1, 14 ); \ v1 = t; \ } while(0) #define mm128_rol1x16_256( v1, v2 ) \ do { \ - __m128i t = _mm_slli_si128( v1, 2 ) | _mm_srli_si128( v2, 30 ); \ - v2 = _mm_slli_si128( v2, 2 ) | _mm_srli_si128( v1, 30 ); \ + __m128i t = _mm_slli_si128( v1, 2 ) | _mm_srli_si128( v2, 14 ); \ + v2 = _mm_slli_si128( v2, 2 ) | _mm_srli_si128( v1, 14 ); \ v1 = t; \ } while(0) #define mm128_ror1x8_256( v1, v2 ) \ do { \ - __m128i t = _mm_srli_si128( v1, 1 ) | _mm_slli_si128( v2, 31 ); \ - v2 = _mm_srli_si128( v2, 1 ) | _mm_slli_si128( v1, 31 ); \ + __m128i t = _mm_srli_si128( v1, 1 ) | _mm_slli_si128( v2, 15 ); \ + v2 = _mm_srli_si128( v2, 1 ) | _mm_slli_si128( v1, 15 ); \ v1 = t; \ } while(0) #define mm128_rol1x8_256( v1, v2 ) \ do { \ - __m128i t = _mm_slli_si128( v1, 1 ) | _mm_srli_si128( v2, 31 ); \ - v2 = _mm_slli_si128( v2, 1 ) | _mm_srli_si128( v1, 31 ); \ + __m128i t = _mm_slli_si128( v1, 1 ) | _mm_srli_si128( v2, 15 ); \ + v2 = _mm_slli_si128( v2, 1 ) | _mm_srli_si128( v1, 15 ); \ v1 = t; \ } while(0) diff --git a/configure b/configure index f5e1a68..2b7849d 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.1.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.9.1.1' -PACKAGE_STRING='cpuminer-opt 3.9.1.1' +PACKAGE_VERSION='3.9.2' +PACKAGE_STRING='cpuminer-opt 3.9.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.9.1.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.9.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.9.1.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.9.2:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.9.1.1 +cpuminer-opt configure 3.9.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.9.1.1, which was +It was created by cpuminer-opt $as_me 3.9.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.9.1.1' + VERSION='3.9.2' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.9.1.1, which was +This file was extended by cpuminer-opt $as_me 3.9.2, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.9.1.1 +cpuminer-opt config.status 3.9.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index bf1ccf0..05a47d0 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.9.1.1]) +AC_INIT([cpuminer-opt], [3.9.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index edbd654..a893648 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -847,7 +847,8 @@ static int share_result( int result, struct work *work, const char *reason ) float rate; char rate_s[8] = {0}; double sharediff = work ? work->sharediff : stratum.sharediff; - bool solved = result && (net_diff > 0.0 ) && ( sharediff >= net_diff ); + bool solved = result && accepted_share_count && (net_diff > 0.0 ) + && ( sharediff >= net_diff ); char sol[32] = {0}; int i; @@ -857,15 +858,17 @@ static int share_result( int result, struct work *work, const char *reason ) hashcount += thr_hashcount[i]; hashrate += thr_hashrates[i]; } + solved = result && ( (uint64_t)hashcount > 0 ) && (net_diff > 0.0 ) + && ( sharediff >= net_diff ); result ? accepted_share_count++ : rejected_share_count++; if ( solved ) { solved_block_count++; if ( use_colors ) - sprintf( sol, CL_GRN " Solved" CL_WHT " %d", solved_block_count ); + sprintf( sol, CL_GRN " Solved: %d" CL_WHT, solved_block_count ); else - sprintf( sol, " Solved %d", solved_block_count ); + sprintf( sol, ", Solved: %d", solved_block_count ); } pthread_mutex_unlock(&stats_lock); diff --git a/interleave.h b/interleave.h index 9a65c12..146fef2 100644 --- a/interleave.h +++ b/interleave.h @@ -356,6 +356,36 @@ static inline void mm256_interleave_8x32x256( void *d, const void *s00, s04+28, s05+28, s06+28, s07+28 ); } +static inline void mm256_be_interleave_8x32x256( void *d, const void *s00, + const void *s01, const void *s02, const void *s03, const void *s04, + const void *s05, const void *s06, const void *s07 ) +{ + casti_m256i( d, 0 ) = mm256_bswap_32( + mm256_put_32( s00, s01, s02, s03, + s04, s05, s06, s07 ) ); + casti_m256i( d, 1 ) = mm256_bswap_32( + mm256_put_32( s00+ 4, s01+ 4, s02+ 4, s03+ 4, + s04+ 4, s05+ 4, s06+ 4, s07+ 4 ) ); + casti_m256i( d, 2 ) = mm256_bswap_32( + mm256_put_32( s00+ 8, s01+ 8, s02+ 8, s03+ 8, + s04+ 8, s05+ 8, s06+ 8, s07+ 8 ) ); + casti_m256i( d, 3 ) = mm256_bswap_32( + mm256_put_32( s00+12, s01+12, s02+12, s03+12, + s04+12, s05+12, s06+12, s07+12 ) ); + casti_m256i( d, 4 ) = mm256_bswap_32( + mm256_put_32( s00+16, s01+16, s02+16, s03+16, + s04+16, s05+16, s06+16, s07+16 ) ); + casti_m256i( d, 5 ) = mm256_bswap_32( + mm256_put_32( s00+20, s01+20, s02+20, s03+20, + s04+20, s05+20, s06+20, s07+20 ) ); + casti_m256i( d, 6 ) = mm256_bswap_32( + mm256_put_32( s00+24, s01+24, s02+24, s03+24, + s04+24, s05+24, s06+24, s07+24 ) ); + casti_m256i( d, 7 ) = mm256_bswap_32( + mm256_put_32( s00+28, s01+28, s02+28, s03+28, + s04+28, s05+28, s06+28, s07+28 ) ); +} + static inline void mm256_interleave_8x32x128( void *d, const void *s00, const void *s01, const void *s02, const void *s03, const void *s04, const void *s05, const void *s06, const void *s07 ) @@ -370,6 +400,24 @@ static inline void mm256_interleave_8x32x128( void *d, const void *s00, s04+12, s05+12, s06+12, s07+12 ); } +static inline 
void mm256_be_interleave_8x32x128( void *d, const void *s00, + const void *s01, const void *s02, const void *s03, const void *s04, + const void *s05, const void *s06, const void *s07 ) +{ + casti_m256i( d, 0 ) = mm256_bswap_32( + mm256_put_32( s00, s01, s02, s03, + s04, s05, s06, s07 ) ); + casti_m256i( d, 1 ) = mm256_bswap_32( + mm256_put_32( s00+ 4, s01+ 4, s02+ 4, s03+ 4, + s04+ 4, s05+ 4, s06+ 4, s07+ 4 ) ); + casti_m256i( d, 2 ) = mm256_bswap_32( + mm256_put_32( s00+ 8, s01+ 8, s02+ 8, s03+ 8, + s04+ 8, s05+ 8, s06+ 8, s07+ 8 ) ); + casti_m256i( d, 3 ) = mm256_bswap_32( + mm256_put_32( s00+12, s01+12, s02+12, s03+12, + s04+12, s05+12, s06+12, s07+12 ) ); +} + // can be called directly for 32 byte hash using AVX2 static inline void mm256_deinterleave_8x32x256( void *d00, void *d01, void *d02, void *d03, void *d04, void *d05, void *d06, @@ -394,6 +442,21 @@ static inline void mm256_interleave_4x64x256( void *d, const void *s0, casti_m256i( d,3 ) = mm256_put_64( s0+24, s1+24, s2+24, s3+24 ); } +// bswap the data as it's interleaved. +// A bit of a missnomer, but be is nice and short. +static inline void mm256_be_interleave_4x64x256( void *d, const void *s0, + const void *s1, const void *s2, const void *s3 ) +{ + casti_m256i( d,0 ) = mm256_bswap_32( + mm256_put_64( s0, s1, s2, s3 ) ); + casti_m256i( d,1 ) = mm256_bswap_32( + mm256_put_64( s0+ 8, s1+ 8, s2+ 8, s3+ 8 ) ); + casti_m256i( d,2 ) = mm256_bswap_32( + mm256_put_64( s0+16, s1+16, s2+16, s3+16 ) ); + casti_m256i( d,3 ) = mm256_bswap_32( + mm256_put_64( s0+24, s1+24, s2+24, s3+24 ) ); +} + static inline void mm256_interleave_4x64x128( void *d, const void *s0, const void *s1, const void *s2, const void *s3 ) { @@ -401,6 +464,14 @@ static inline void mm256_interleave_4x64x128( void *d, const void *s0, casti_m256i( d,1 ) = mm256_put_64( s0+ 8, s1+ 8, s2+ 8, s3+ 8 ); } +static inline void mm256_be_interleave_4x64x128( void *d, const void *s0, + const void *s1, const void *s2, const void *s3 ) +{ + casti_m256i( d,0 ) = mm256_bswap_32( + mm256_put_64( s0, s1, s2, s3 ) ); + casti_m256i( d,1 ) = mm256_bswap_32( + mm256_put_64( s0+ 8, s1+ 8, s2+ 8, s3+ 8 ) ); +} // 4 lanes of 256 bits using 64 bit interleaving (standard final hash size) static inline void mm256_deinterleave_4x64x256( void *d0, void *d1, void *d2, @@ -496,6 +567,28 @@ static inline void mm256_interleave_8x32( void *d, const void *s0, // bit_len == 1024 } +static inline void mm256_be_interleave_8x32( void *d, const void *s0, + const void *s1, const void *s2, const void *s3, const void *s4, + const void *s5, const void *s6, const void *s7, int bit_len ) +{ + mm256_be_interleave_8x32x256( d, s0, s1, s2, s3, s4, s5, s6, s7 ); + if ( bit_len <= 256 ) return; + mm256_be_interleave_8x32x256( d+256, s0+32, s1+32, s2+32, s3+32, + s4+32, s5+32, s6+32, s7+32 ); + if ( bit_len <= 512 ) return; + if ( bit_len <= 640 ) + { + mm256_be_interleave_8x32x128( d+512, s0+64, s1+64, s2+64, s3+64, + s4+64, s5+64, s6+64, s7+64 ); + return; + } + mm256_be_interleave_8x32x256( d+512, s0+64, s1+64, s2+64, s3+64, + s4+64, s5+64, s6+64, s7+64 ); + mm256_be_interleave_8x32x256( d+768, s0+96, s1+96, s2+96, s3+96, + s4+96, s5+96, s6+96, s7+96 ); + // bit_len == 1024 +} + /* // Slower but it works with 32 bit data // bit_len must be multiple of 32 @@ -595,6 +688,23 @@ static inline void mm256_interleave_4x64( void *d, const void *s0, mm256_interleave_4x64x256( d+384, s0+96, s1+96, s2+96, s3+96 ); } +static inline void mm256_be_interleave_4x64( void *d, const void *s0, + const void *s1, const void *s2, const 
void *s3, int bit_len ) +{ + mm256_be_interleave_4x64x256( d, s0, s1, s2, s3 ); + if ( bit_len <= 256 ) return; + mm256_be_interleave_4x64x256( d+128, s0+32, s1+32, s2+32, s3+32 ); + if ( bit_len <= 512 ) return; + if ( bit_len <= 640 ) + { + mm256_be_interleave_4x64x128( d+256, s0+64, s1+64, s2+64, s3+64 ); + return; + } + // bit_len == 1024 + mm256_be_interleave_4x64x256( d+256, s0+64, s1+64, s2+64, s3+64 ); + mm256_be_interleave_4x64x256( d+384, s0+96, s1+96, s2+96, s3+96 ); +} + /* // Slower version // bit_len must be multiple of 64 @@ -676,7 +786,9 @@ static inline void mm256_extract_lane_4x64( void *d, const void *s, // Convert from 4x32 SSE2 interleaving to 4x64 AVX2. // Can't do it in place -static inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len ) +#define mm256_reinterleave_4x64 mm256_reinterleave_4x32_4x64 +static inline void mm256_reinterleave_4x32_4x64( void *dst, void *src, + int bit_len ) { __m256i* d = (__m256i*)dst; uint32_t *s = (uint32_t*)src; @@ -736,7 +848,9 @@ static inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src, // Convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX // bit_len must be multiple of 64 -static inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len ) +#define mm256_reinterleave_4x32 mm256_reinterleave_4x64_4x32 +static inline void mm256_reinterleave_4x64_4x32( void *dst, void *src, + int bit_len ) { __m256i *d = (__m256i*)dst; uint32_t *s = (uint32_t*)src; @@ -862,7 +976,8 @@ static inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len ) } */ -static inline void mm256_interleave_2x128( const void *d, const void *s0, +#define mm256_interleave_2x128 mm256_interleave_1x128 +static inline void mm256_interleave_1x128( const void *d, const void *s0, void *s1, const int bit_len ) { casti_m256i( d, 0 ) = mm256_put_64( s0 , s0+ 8, s1 , s1+ 8 ); @@ -879,7 +994,8 @@ static inline void mm256_interleave_2x128( const void *d, const void *s0, // bit_len == 1024 } -static inline void mm256_deinterleave_2x128( void *d0, void *d1, void *s, +#define mm256_deinterleave_2x128 mm256_deinterleave_1x128 +static inline void mm256_deinterleave_1x128( void *d0, void *d1, void *s, int bit_len ) { mm256_deinterleave_2x128x256( d0, d1, 0, s ); @@ -1078,38 +1194,38 @@ static inline void mm512_deinterleave_16x32x512( void *d00, void *d01, void *d12, void *d13, void *d14, void *d15, const int n, const void *s ) { - casti_m512i(d00,n) = mm512_get_32( s, 0, 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 240 ); - casti_m512i(d01,n) = mm512_get_32( s, 1, 17, 33, 49, 65, 81, 97, 113, - 129, 145, 161, 177, 193, 209, 225, 241 ); - casti_m512i(d02,n) = mm512_get_32( s, 2, 18, 34, 50, 66, 82, 98, 114, - 130, 146, 162, 178, 194, 210, 226, 242 ); - casti_m512i(d03,n) = mm512_get_32( s, 3, 19, 35, 51, 67, 83, 99, 115, - 131, 147, 163, 179, 195, 211, 227, 243 ); - casti_m512i(d04,n) = mm512_get_32( s, 4, 20, 36, 52, 68, 84, 100, 116, - 132, 148, 164, 180, 196, 212, 228, 244 ); - casti_m512i(d05,n) = mm512_get_32( s, 5, 21, 37, 53, 69, 85, 101, 117, - 133, 149, 165, 181, 197, 213, 229, 245 ); - casti_m512i(d06,n) = mm512_get_32( s, 6, 22, 38, 54, 70, 86, 102, 118, - 134, 150, 166, 182, 198, 214, 230, 246 ); - casti_m512i(d07,n) = mm512_get_32( s, 7, 23, 39, 55, 71, 87, 103, 119, - 135, 151, 167, 183, 199, 215, 231, 247 ); - casti_m512i(d08,n) = mm512_get_32( s, 8, 24, 40, 56, 72, 88, 104, 120, - 136, 152, 168, 184, 200, 216, 232, 248 ); - casti_m512i(d09,n) = mm512_get_32( s, 9, 25, 41, 57, 
-                                     137, 153, 169, 185, 201, 217, 233, 249 );
-   casti_m512i(d10,n) = mm512_get_32( s, 10, 26, 42, 58, 74, 90, 106, 122,
-                                     138, 154, 170, 186, 202, 218, 234, 250 );
-   casti_m512i(d11,n) = mm512_get_32( s, 11, 27, 43, 59, 75, 91, 107, 123,
-                                     139, 155, 171, 187, 203, 219, 235, 251 );
-   casti_m512i(d12,n) = mm512_get_32( s, 12, 28, 44, 60, 76, 92, 108, 124,
-                                     140, 156, 172, 188, 204, 220, 236, 252 );
-   casti_m512i(d13,n) = mm512_get_32( s, 13, 29, 45, 61, 77, 93, 109, 125,
-                                     141, 157, 173, 189, 205, 221, 237, 253 );
-   casti_m512i(d14,n) = mm512_get_32( s, 14, 30, 46, 62, 78, 94, 110, 126,
-                                     142, 158, 174, 190, 206, 222, 238, 254 );
-   casti_m512i(d15,n) = mm512_get_32( s, 15, 31, 47, 63, 79, 95, 111, 127,
-                                     143, 159, 175, 191, 207, 223, 239, 255 );
+   casti_m512i(d00,n) = mm512_get_32( s,  0, 16, 32, 48, 64, 80, 96,112,
+                                    128,144,160,176,192,208,224,240 );
+   casti_m512i(d01,n) = mm512_get_32( s,  1, 17, 33, 49, 65, 81, 97,113,
+                                    129,145,161,177,193,209,225,241 );
+   casti_m512i(d02,n) = mm512_get_32( s,  2, 18, 34, 50, 66, 82, 98,114,
+                                    130,146,162,178,194,210,226,242 );
+   casti_m512i(d03,n) = mm512_get_32( s,  3, 19, 35, 51, 67, 83, 99,115,
+                                    131,147,163,179,195,211,227,243 );
+   casti_m512i(d04,n) = mm512_get_32( s,  4, 20, 36, 52, 68, 84,100,116,
+                                    132,148,164,180,196,212,228,244 );
+   casti_m512i(d05,n) = mm512_get_32( s,  5, 21, 37, 53, 69, 85,101,117,
+                                    133,149,165,181,197,213,229,245 );
+   casti_m512i(d06,n) = mm512_get_32( s,  6, 22, 38, 54, 70, 86,102,118,
+                                    134,150,166,182,198,214,230,246 );
+   casti_m512i(d07,n) = mm512_get_32( s,  7, 23, 39, 55, 71, 87,103,119,
+                                    135,151,167,183,199,215,231,247 );
+   casti_m512i(d08,n) = mm512_get_32( s,  8, 24, 40, 56, 72, 88,104,120,
+                                    136,152,168,184,200,216,232,248 );
+   casti_m512i(d09,n) = mm512_get_32( s,  9, 25, 41, 57, 73, 89,105,121,
+                                    137,153,169,185,201,217,233,249 );
+   casti_m512i(d10,n) = mm512_get_32( s, 10, 26, 42, 58, 74, 90,106,122,
+                                    138,154,170,186,202,218,234,250 );
+   casti_m512i(d11,n) = mm512_get_32( s, 11, 27, 43, 59, 75, 91,107,123,
+                                    139,155,171,187,203,219,235,251 );
+   casti_m512i(d12,n) = mm512_get_32( s, 12, 28, 44, 60, 76, 92,108,124,
+                                    140,156,172,188,204,220,236,252 );
+   casti_m512i(d13,n) = mm512_get_32( s, 13, 29, 45, 61, 77, 93,109,125,
+                                    141,157,173,189,205,221,237,253 );
+   casti_m512i(d14,n) = mm512_get_32( s, 14, 30, 46, 62, 78, 94,110,126,
+                                    142,158,174,190,206,222,238,254 );
+   casti_m512i(d15,n) = mm512_get_32( s, 15, 31, 47, 63, 79, 95,111,127,
+                                    143,159,175,191,207,223,239,255 );
 }
 
 static inline void mm512_interleave_8x64x512( void *d, const void *s0,
@@ -1363,6 +1479,99 @@ static inline void mm512_deinterleave_4x128( void *d0, void *d1, void *d2,
    mm512_deinterleave_4x128x512( d0, d1, d2, d3, 1, s+256 );
 }
 
+// input one 8x64 buffer and return 2*4*128
+static inline void mm512_reinterleave_8x64_4x128( void *dst0, void *dst1,
+                                           const void *src, int bit_len )
+{
+   __m512i* d0 = (__m512i*)dst0;
+   __m512i* d1 = (__m512i*)dst1;
+   uint64_t *s = (uint64_t*)src;
+
+   d0[0] = _mm512_set_epi64( s[ 11], s[ 3], s[ 10], s[ 2],
+                             s[ 9], s[ 1], s[ 8], s[ 0] );
+   d0[1] = _mm512_set_epi64( s[ 27], s[ 19], s[ 26], s[ 18],
+                             s[ 25], s[ 17], s[ 24], s[ 16] );
+   d0[2] = _mm512_set_epi64( s[ 15], s[ 7], s[ 14], s[ 6],
+                             s[ 13], s[ 5], s[ 12], s[ 4] );
+   d0[3] = _mm512_set_epi64( s[ 31], s[ 23], s[ 30], s[ 22],
+                             s[ 29], s[ 21], s[ 28], s[ 20] );
+   d1[0] = _mm512_set_epi64( s[ 43], s[ 35], s[ 42], s[ 34],
+                             s[ 41], s[ 33], s[ 40], s[ 32] );
+   d1[1] = _mm512_set_epi64( s[ 59], s[ 51], s[ 58], s[ 50],
+                             s[ 57], s[ 49], s[ 56], s[ 48] );
+   d1[2] = _mm512_set_epi64( s[ 47], s[ 39], s[ 46], s[ 38],
+                             s[ 45], s[ 37], s[ 44], s[ 36] );
+   d1[3] = _mm512_set_epi64( s[ 63], s[ 55], s[ 62], s[ 54],
+                             s[ 61], s[ 53], s[ 60], s[ 52] );
+
+   if ( bit_len <= 512 ) return;
+
+   d0[4] = _mm512_set_epi64( s[ 75], s[ 67], s[ 74], s[ 66],
+                             s[ 73], s[ 65], s[ 72], s[ 64] );
+   d0[5] = _mm512_set_epi64( s[ 91], s[ 83], s[ 90], s[ 82],
+                             s[ 89], s[ 81], s[ 88], s[ 80] );
+   d0[6] = _mm512_set_epi64( s[ 79], s[ 71], s[ 78], s[ 70],
+                             s[ 77], s[ 69], s[ 76], s[ 68] );
+   d0[7] = _mm512_set_epi64( s[ 95], s[ 87], s[ 94], s[ 86],
+                             s[ 93], s[ 85], s[ 92], s[ 84] );
+   d1[4] = _mm512_set_epi64( s[107], s[ 99], s[106], s[ 98],
+                             s[105], s[ 97], s[104], s[ 96] );
+   d1[5] = _mm512_set_epi64( s[123], s[115], s[122], s[114],
+                             s[121], s[113], s[120], s[112] );
+   d1[6] = _mm512_set_epi64( s[111], s[103], s[110], s[102],
+                             s[109], s[101], s[108], s[100] );
+   d1[7] = _mm512_set_epi64( s[127], s[119], s[126], s[118],
+                             s[125], s[117], s[124], s[116] );
+
+}
+
+// input 2 4x128 return 8x64
+static inline void mm512_reinterleave_4x128_8x64( void *dst, const void *src0,
+                                           const void *src1, int bit_len )
+{
+   __m512i* d = (__m512i*)dst;
+   uint64_t *s0 = (uint64_t*)src0;
+   uint64_t *s1 = (uint64_t*)src1;
+
+   d[0] = _mm512_set_epi64( s1[ 6], s1[ 4], s1[ 2], s1[ 0],
+                            s0[ 6], s0[ 4], s0[ 2], s0[ 0] );
+   d[1] = _mm512_set_epi64( s1[ 7], s1[ 5], s1[ 3], s1[ 1],
+                            s0[ 7], s0[ 5], s0[ 3], s0[ 1] );
+   d[2] = _mm512_set_epi64( s1[14], s1[12], s1[10], s1[ 8],
+                            s0[14], s0[12], s0[10], s0[ 8] );
+   d[3] = _mm512_set_epi64( s1[15], s1[13], s1[11], s1[ 9],
+                            s0[15], s0[13], s0[11], s0[ 9] );
+   d[4] = _mm512_set_epi64( s1[22], s1[20], s1[18], s1[16],
+                            s0[22], s0[20], s0[18], s0[16] );
+   d[5] = _mm512_set_epi64( s1[23], s1[21], s1[19], s1[17],
+                            s0[23], s0[21], s0[19], s0[17] );
+   d[6] = _mm512_set_epi64( s1[30], s1[28], s1[26], s1[24],
+                            s0[30], s0[28], s0[26], s0[24] );
+   d[7] = _mm512_set_epi64( s1[31], s1[29], s1[27], s1[25],
+                            s0[31], s0[29], s0[27], s0[25] );
+
+   if ( bit_len <= 512 ) return;
+
+   d[ 8] = _mm512_set_epi64( s1[38], s1[36], s1[34], s1[32],
+                             s0[38], s0[36], s0[34], s0[32] );
+   d[ 9] = _mm512_set_epi64( s1[39], s1[37], s1[35], s1[33],
+                             s0[39], s0[37], s0[35], s0[33] );
+   d[10] = _mm512_set_epi64( s1[46], s1[44], s1[42], s1[40],
+                             s0[46], s0[44], s0[42], s0[40] );
+   d[11] = _mm512_set_epi64( s1[47], s1[45], s1[43], s1[41],
+                             s0[47], s0[45], s0[43], s0[41] );
+   d[12] = _mm512_set_epi64( s1[54], s1[52], s1[50], s1[48],
+                             s0[54], s0[52], s0[50], s0[48] );
+   d[13] = _mm512_set_epi64( s1[55], s1[53], s1[51], s1[49],
+                             s0[55], s0[53], s0[51], s0[49] );
+
+   d[14] = _mm512_set_epi64( s1[62], s1[60], s1[58], s1[56],
+                             s0[62], s0[60], s0[58], s0[56] );
+   d[15] = _mm512_set_epi64( s1[63], s1[61], s1[59], s1[57],
+                             s0[63], s0[61], s0[59], s0[57] );
+
+}
+
 static inline void mm512_extract_lane_4x128( void *d, const void *s,
                                     const int lane, const int bit_len )
 {
diff --git a/miner.h b/miner.h
index 98fc082..58d2107 100644
--- a/miner.h
+++ b/miner.h
@@ -538,6 +538,7 @@ enum algos {
   ALGO_SCRYPTJANE,
   ALGO_SHA256D,
   ALGO_SHA256T,
+  ALGO_SHA256Q,
   ALGO_SHAVITE3,
   ALGO_SKEIN,
   ALGO_SKEIN2,
@@ -625,6 +626,7 @@ static const char* const algo_names[] = {
   "scryptjane",
   "sha256d",
   "sha256t",
+  "sha256q",
   "shavite3",
   "skein",
   "skein2",
@@ -774,7 +776,8 @@ Options:\n\
 scryptjane:nf\n\
 sha256d Double SHA-256\n\
 sha256t Triple SHA-256, Onecoin (OC)\n\
- shavite3 Shavite3\n\
+ sha256q Quad SHA-256, Pyrite (PYE)\n\
+ shavite3 Shavite3\n\
 skein Skein+Sha (Skeincoin)\n\
 skein2 Double Skein (Woodcoin)\n\
skunk Signatum (SIGT)\n\