Compare commits

...

5 Commits

Author      SHA1         Message   Date
Jay D Dee   9b905fccc8   v3.17.1   2021-07-26 15:01:37 -04:00
Jay D Dee   92b3733925   v3.17.0   2021-07-15 20:30:44 -04:00
Jay D Dee   19cc88d102   v3.16.5   2021-06-26 12:27:44 -04:00
Jay D Dee   a053690170   v3.16.4   2021-06-23 21:52:42 -04:00
Jay D Dee   3c5e8921b7   v3.16.3   2021-05-06 14:55:03 -04:00
69 changed files with 3077 additions and 1175 deletions

View File

@@ -163,6 +163,8 @@ cpuminer_SOURCES = \
   algo/sha/sph_sha2big.c \
   algo/sha/sha256-hash-4way.c \
   algo/sha/sha512-hash-4way.c \
+  algo/sha/sha256-hash-opt.c \
+  algo/sha/sha256-hash-2way-ni.c \
   algo/sha/hmac-sha256-hash.c \
   algo/sha/hmac-sha256-hash-4way.c \
   algo/sha/sha2.c \

View File

@@ -64,6 +64,11 @@ source code obtained from the author's official repository. The exact
 procedure is documented in the build instructions for Windows:
 https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
 
+Some DLL files may already be installed on the system by Windows or
+third-party packages. They will often work and may be used instead of the
+included files. Without a compelling reason to do so, it's recommended to
+use the included files as they are packaged.
+
 If you like this software feel free to donate:
 
 BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT

View File

@@ -65,11 +65,39 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.17.1
+
+Fixed Windows build for AES+SSE4.2 (Westmere), which was missing AES.
+More ternary logic optimizations for AVX512, AVX512+VAES, and AVX512+AES.
+Fixed my-gr algo for VAES.
+
+v3.17.0
+
+AVX512 optimized using ternary logic instructions.
+Faster sha256t on all CPU architectures: AVX512 +30%, SHA +30%, AVX2 +9%.
+Use SHA on supported CPUs to produce merkle hash.
+Fixed byte order in Extranonce2 log & replaced Block height with Job ID.
+
+v3.16.5
+
+#329: Fixed GBT incorrect target diff in stats, second attempt.
+Fixed formatting error in share result log when --no-color option is used.
+
+v3.16.4
+
+Faster sha512 and sha256 when not using SHA CPU extension.
+#329: Fixed GBT incorrect target diff in stats.
+
+v3.16.3
+
+#313: Fixed compile error with GCC 11.
+Incremental improvements to verthash.
+
 v3.16.2
 
 Verthash: midstate prehash optimization for all architectures.
 Verthash: AVX2 optimization.
-GBT: added support for Bech32 addresses, untested.
+GBT: added support for Bech32 addresses.
 Linux: added CPU frequency to benchmark log.
 Fixed integer overflow in time calculations.

@@ -112,7 +140,6 @@ v3.15.5
 
 Fix stratum jobs lost if 2 jobs received in less than one second.
 
 v3.15.4
 
 Fixed yescryptr16 broken in v3.15.3.
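
The "ternary logic" entries above refer to AVX-512's vpternlog instructions,
which evaluate any three-input Boolean function in a single operation. A
minimal sketch of how a 3-way XOR helper like the mm512_xor3 used throughout
the diffs below can be built on it — the exact definitions in the project's
simd-utils headers may differ:

   #include <immintrin.h>

   // AVX-512: one vpternlogq computes a ^ b ^ c; 0x96 is the truth table
   // of 3-input XOR (output is 1 when an odd number of inputs are 1).
   static inline __m512i xor3_512( __m512i a, __m512i b, __m512i c )
   {
      return _mm512_ternarylogic_epi64( a, b, c, 0x96 );
   }

   // AVX2 has no ternary-logic instruction, so a 256-bit xor3 stays two
   // dependent XORs; the saving on AVX-512 is one instruction per use.
   static inline __m256i xor3_256( __m256i a, __m256i b, __m256i c )
   {
      return _mm256_xor_si256( _mm256_xor_si256( a, b ), c );
   }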

View File

@@ -180,6 +180,7 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
 void blake512_8way_close( void *cc, void *dst );
 void blake512_8way_full( blake_8way_big_context *sc, void * dst,
                          const void *data, size_t len );
+void blake512_8way_hash_le80( void *hash, const void *data );
 
 #endif // AVX512
 #endif // AVX2

View File

@@ -669,14 +669,14 @@ do { \
       ROUND_S_8WAY(2); \
       ROUND_S_8WAY(3); \
    } \
-   H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
-   H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
-   H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
-   H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
-   H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
-   H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
-   H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
-   H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
+   H0 = mm256_xor3( V8, V0, H0 ); \
+   H1 = mm256_xor3( V9, V1, H1 ); \
+   H2 = mm256_xor3( VA, V2, H2 ); \
+   H3 = mm256_xor3( VB, V3, H3 ); \
+   H4 = mm256_xor3( VC, V4, H4 ); \
+   H5 = mm256_xor3( VD, V5, H5 ); \
+   H6 = mm256_xor3( VE, V6, H6 ); \
+   H7 = mm256_xor3( VF, V7, H7 ); \
 } while (0)

@@ -808,14 +808,14 @@ do { \
       ROUND_S_16WAY(2); \
       ROUND_S_16WAY(3); \
    } \
-   H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
-   H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
-   H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
-   H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
-   H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
-   H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
-   H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
-   H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
+   H0 = mm512_xor3( V8, V0, H0 ); \
+   H1 = mm512_xor3( V9, V1, H1 ); \
+   H2 = mm512_xor3( VA, V2, H2 ); \
+   H3 = mm512_xor3( VB, V3, H3 ); \
+   H4 = mm512_xor3( VC, V4, H4 ); \
+   H5 = mm512_xor3( VD, V5, H5 ); \
+   H6 = mm512_xor3( VE, V6, H6 ); \
+   H7 = mm512_xor3( VF, V7, H7 ); \
 } while (0)
 
 #endif

View File

@@ -122,14 +122,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
       B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
    }
 
-   ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
-   ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
-   ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
-   ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
-   ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
-   ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
-   ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
-   ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
+   ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] );
+   ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] );
+   ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] );
+   ctx->h[3] = mm512_xor3( ctx->h[3], v[3], v[11] );
+   ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] );
+   ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] );
+   ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] );
+   ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
 }
 
 int blake2b_8way_init( blake2b_8way_ctx *ctx )

View File

@@ -17,7 +17,7 @@
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 
-ALIGN(128) typedef struct {
+typedef struct ALIGN( 64 ) {
    __m512i b[16];   // input buffer
    __m512i h[8];    // chained state
    uint64_t t[2];   // total number of bytes

@@ -35,7 +35,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
 #if defined(__AVX2__)
 
 // state context
-ALIGN(128) typedef struct {
+typedef struct ALIGN( 64 ) {
    __m256i b[16];   // input buffer
    __m256i h[8];    // chained state
    uint64_t t[2];   // total number of bytes
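
The alignment changes in this header (and the similar ones in the headers
below) move the attribute to a position that binds reliably to the struct
type and relax the alignment from 128 bytes to 64, the x86 cache-line size.
A small illustration of the two placements, assuming ALIGN wraps GCC's
aligned attribute as is conventional in this codebase:

   // Hypothetical stand-in for the codebase's ALIGN macro.
   #define ALIGN(x) __attribute__ ((aligned (x)))

   // An attribute written before 'typedef' does not reliably bind to the
   // struct type being defined and may be ignored with a warning:
   //    ALIGN(128) typedef struct { ... } ctx_t;

   // An attribute between 'struct' and the body binds to the type itself,
   // so every ctx_t object gets 64-byte alignment:
   typedef struct ALIGN( 64 )
   {
      unsigned char buf[128];
   } ctx_t;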

View File

@@ -4,7 +4,6 @@
 #include <stdint.h>
 #include "algo-gate-api.h"
 
-//#if defined(__SSE4_2__)
 #if defined(__SSE2__)
   #define BLAKE2S_4WAY
 #endif

@@ -27,8 +26,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
 #elif defined (BLAKE2S_8WAY)
 
-//#if defined(BLAKE2S_8WAY)
 void blake2s_8way_hash( void *state, const void *input );
 int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -368,7 +368,7 @@ do { \
    ROUND8W( 9 );
 
    for( size_t i = 0; i < 8; ++i )
-      S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
+      S->h[i] = mm256_xor3( S->h[i], v[i], v[i + 8] );
 
 #undef G8W
 #undef ROUND8W

@@ -566,7 +566,7 @@ do { \
    ROUND16W( 9 );
 
    for( size_t i = 0; i < 8; ++i )
-      S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
+      S->h[i] = mm512_xor3( S->h[i], v[i], v[i + 8] );
 
 #undef G16W
 #undef ROUND16W

View File

@@ -60,7 +60,7 @@ typedef struct __blake2s_nway_param
 } blake2s_nway_param;
 #pragma pack(pop)
 
-ALIGN( 64 ) typedef struct __blake2s_4way_state
+typedef struct ALIGN( 64 ) __blake2s_4way_state
 {
    __m128i h[8];
    uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];

@@ -80,7 +80,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
 #if defined(__AVX2__)
 
-ALIGN( 64 ) typedef struct __blake2s_8way_state
+typedef struct ALIGN( 64 ) __blake2s_8way_state
 {
    __m256i h[8];
    uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];

@@ -101,7 +101,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 
-ALIGN( 128 ) typedef struct __blake2s_16way_state
+typedef struct ALIGN( 64 ) __blake2s_16way_state
 {
    __m512i h[8];
    uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];

View File

@@ -293,10 +293,6 @@ static const sph_u64 CB[16] = {
    H5 = (state)->H[5]; \
    H6 = (state)->H[6]; \
    H7 = (state)->H[7]; \
-   S0 = (state)->S[0]; \
-   S1 = (state)->S[1]; \
-   S2 = (state)->S[2]; \
-   S3 = (state)->S[3]; \
    T0 = (state)->T0; \
    T1 = (state)->T1; \
 } while (0)

@@ -310,10 +306,6 @@ static const sph_u64 CB[16] = {
    (state)->H[5] = H5; \
    (state)->H[6] = H6; \
    (state)->H[7] = H7; \
-   (state)->S[0] = S0; \
-   (state)->S[1] = S1; \
-   (state)->S[2] = S2; \
-   (state)->S[3] = S3; \
    (state)->T0 = T0; \
    (state)->T1 = T1; \
 } while (0)

@@ -348,7 +340,6 @@ static const sph_u64 CB[16] = {
 #define DECL_STATE64_8WAY \
    __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
-   __m512i S0, S1, S2, S3; \
    uint64_t T0, T1;
 
 #define COMPRESS64_8WAY( buf ) do \

@@ -366,10 +357,10 @@ static const sph_u64 CB[16] = {
    V5 = H5; \
    V6 = H6; \
    V7 = H7; \
-   V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \
-   V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \
-   VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \
-   VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \
+   V8 = m512_const1_64( CB0 ); \
+   V9 = m512_const1_64( CB1 ); \
+   VA = m512_const1_64( CB2 ); \
+   VB = m512_const1_64( CB3 ); \
    VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
                           m512_const1_64( CB4 ) ); \
    VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \

@@ -414,14 +405,14 @@ static const sph_u64 CB[16] = {
    ROUND_B_8WAY(3); \
    ROUND_B_8WAY(4); \
    ROUND_B_8WAY(5); \
-   H0 = mm512_xor4( V8, V0, S0, H0 ); \
-   H1 = mm512_xor4( V9, V1, S1, H1 ); \
-   H2 = mm512_xor4( VA, V2, S2, H2 ); \
-   H3 = mm512_xor4( VB, V3, S3, H3 ); \
-   H4 = mm512_xor4( VC, V4, S0, H4 ); \
-   H5 = mm512_xor4( VD, V5, S1, H5 ); \
-   H6 = mm512_xor4( VE, V6, S2, H6 ); \
-   H7 = mm512_xor4( VF, V7, S3, H7 ); \
+   H0 = mm512_xor3( V8, V0, H0 ); \
+   H1 = mm512_xor3( V9, V1, H1 ); \
+   H2 = mm512_xor3( VA, V2, H2 ); \
+   H3 = mm512_xor3( VB, V3, H3 ); \
+   H4 = mm512_xor3( VC, V4, H4 ); \
+   H5 = mm512_xor3( VD, V5, H5 ); \
+   H6 = mm512_xor3( VE, V6, H6 ); \
+   H7 = mm512_xor3( VF, V7, H7 ); \
 } while (0)
 
 void blake512_8way_compress( blake_8way_big_context *sc )

@@ -440,10 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
    V5 = sc->H[5];
    V6 = sc->H[6];
    V7 = sc->H[7];
-   V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );
-   V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
-   VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
-   VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
+   V8 = m512_const1_64( CB0 );
+   V9 = m512_const1_64( CB1 );
+   VA = m512_const1_64( CB2 );
+   VB = m512_const1_64( CB3 );
    VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
                           m512_const1_64( CB4 ) );
    VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),

@@ -492,19 +483,18 @@ void blake512_8way_compress( blake_8way_big_context *sc )
    ROUND_B_8WAY(4);
    ROUND_B_8WAY(5);
 
-   sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );
-   sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
-   sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
-   sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
-   sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
-   sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
-   sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
-   sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
+   sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
+   sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
+   sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
+   sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
+   sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
+   sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
+   sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
+   sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
 }
 
 void blake512_8way_init( blake_8way_big_context *sc )
 {
-   __m512i zero = m512_zero;
    casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
    casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
    casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );

@@ -514,11 +504,6 @@ void blake512_8way_init( blake_8way_big_context *sc )
    casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
-   casti_m512i( sc->S, 0 ) = zero;
-   casti_m512i( sc->S, 1 ) = zero;
-   casti_m512i( sc->S, 2 ) = zero;
-   casti_m512i( sc->S, 3 ) = zero;
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
 }

@@ -641,11 +626,6 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
    casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
-   casti_m512i( sc->S, 0 ) = m512_zero;
-   casti_m512i( sc->S, 1 ) = m512_zero;
-   casti_m512i( sc->S, 2 ) = m512_zero;
-   casti_m512i( sc->S, 3 ) = m512_zero;
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;

@@ -740,7 +720,6 @@ blake512_8way_close(void *cc, void *dst)
 #define DECL_STATE64_4WAY \
    __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
-   __m256i S0, S1, S2, S3; \
    uint64_t T0, T1;
 
 #define COMPRESS64_4WAY do \

@@ -758,10 +737,10 @@ blake512_8way_close(void *cc, void *dst)
    V5 = H5; \
    V6 = H6; \
    V7 = H7; \
-   V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \
-   V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \
-   VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \
-   VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \
+   V8 = m256_const1_64( CB0 ); \
+   V9 = m256_const1_64( CB1 ); \
+   VA = m256_const1_64( CB2 ); \
+   VB = m256_const1_64( CB3 ); \
    VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
                           m256_const1_64( CB4 ) ); \
    VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \

@@ -804,14 +783,14 @@ blake512_8way_close(void *cc, void *dst)
    ROUND_B_4WAY(3); \
    ROUND_B_4WAY(4); \
    ROUND_B_4WAY(5); \
-   H0 = mm256_xor4( V8, V0, S0, H0 ); \
-   H1 = mm256_xor4( V9, V1, S1, H1 ); \
-   H2 = mm256_xor4( VA, V2, S2, H2 ); \
-   H3 = mm256_xor4( VB, V3, S3, H3 ); \
-   H4 = mm256_xor4( VC, V4, S0, H4 ); \
-   H5 = mm256_xor4( VD, V5, S1, H5 ); \
-   H6 = mm256_xor4( VE, V6, S2, H6 ); \
-   H7 = mm256_xor4( VF, V7, S3, H7 ); \
+   H0 = mm256_xor3( V8, V0, H0 ); \
+   H1 = mm256_xor3( V9, V1, H1 ); \
+   H2 = mm256_xor3( VA, V2, H2 ); \
+   H3 = mm256_xor3( VB, V3, H3 ); \
+   H4 = mm256_xor3( VC, V4, H4 ); \
+   H5 = mm256_xor3( VD, V5, H5 ); \
+   H6 = mm256_xor3( VE, V6, H6 ); \
+   H7 = mm256_xor3( VF, V7, H7 ); \
 } while (0)

@@ -831,10 +810,10 @@ void blake512_4way_compress( blake_4way_big_context *sc )
    V5 = sc->H[5];
    V6 = sc->H[6];
    V7 = sc->H[7];
-   V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );
-   V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
-   VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
-   VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
+   V8 = m256_const1_64( CB0 );
+   V9 = m256_const1_64( CB1 );
+   VA = m256_const1_64( CB2 );
+   VB = m256_const1_64( CB3 );
    VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
                           m256_const1_64( CB4 ) );
    VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),

@@ -880,19 +859,18 @@ void blake512_4way_compress( blake_4way_big_context *sc )
    ROUND_B_4WAY(4);
    ROUND_B_4WAY(5);
 
-   sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );
-   sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
-   sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
-   sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
-   sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
-   sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
-   sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
-   sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
+   sc->H[0] = mm256_xor3( V8, V0, sc->H[0] );
+   sc->H[1] = mm256_xor3( V9, V1, sc->H[1] );
+   sc->H[2] = mm256_xor3( VA, V2, sc->H[2] );
+   sc->H[3] = mm256_xor3( VB, V3, sc->H[3] );
+   sc->H[4] = mm256_xor3( VC, V4, sc->H[4] );
+   sc->H[5] = mm256_xor3( VD, V5, sc->H[5] );
+   sc->H[6] = mm256_xor3( VE, V6, sc->H[6] );
+   sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
 }
 
 void blake512_4way_init( blake_4way_big_context *sc )
 {
-   __m256i zero = m256_zero;
    casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
    casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
    casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );

@@ -902,11 +880,6 @@ void blake512_4way_init( blake_4way_big_context *sc )
    casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
-   casti_m256i( sc->S, 0 ) = zero;
-   casti_m256i( sc->S, 1 ) = zero;
-   casti_m256i( sc->S, 2 ) = zero;
-   casti_m256i( sc->S, 3 ) = zero;
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
 }

@@ -1026,11 +999,6 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
    casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
    casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
-   casti_m256i( sc->S, 0 ) = m256_zero;
-   casti_m256i( sc->S, 1 ) = m256_zero;
-   casti_m256i( sc->S, 2 ) = m256_zero;
-   casti_m256i( sc->S, 3 ) = m256_zero;
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
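
For context on the blake512 hunks above: BLAKE-512's optional salt S[0..3]
enters the compression function twice, as V8..VB = S[i] ^ CBi and in the
feed-forward H'[i] = H[i] ^ S[i mod 4] ^ V[i] ^ V[i+8]. The miner never uses
a salted hash, so S is always zero, the XOR-with-S initialisation can be
dropped outright, and every four-input mm512_xor4/mm256_xor4 collapses to a
three-input xor3. A scalar sketch of the identity:

   #include <stdint.h>

   // Per output word, BLAKE-512's feed-forward with salt (spec form):
   static inline uint64_t feed_forward( uint64_t h, uint64_t s,
                                        uint64_t v_lo, uint64_t v_hi )
   {
      return h ^ s ^ v_lo ^ v_hi;   // four-input XOR -> mm512_xor4
   }

   // With the salt fixed at zero the 's' term drops out, leaving the
   // three-input XOR the new code expresses as mm512_xor3 / mm256_xor3:
   static inline uint64_t feed_forward_nosalt( uint64_t h,
                                               uint64_t v_lo, uint64_t v_hi )
   {
      return h ^ v_lo ^ v_hi;
   }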

View File

@@ -323,7 +323,7 @@ int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
 
 int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
 {
-   blake2s_state S[1];
+   blake2s_state S;
 
    /* Verify parameters */
    if ( NULL == in ) return -1;

@@ -334,15 +334,15 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen
    if( keylen > 0 )
    {
-      if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+      if( blake2s_init_key( &S, outlen, key, keylen ) < 0 ) return -1;
    }
    else
    {
-      if( blake2s_init( S, outlen ) < 0 ) return -1;
+      if( blake2s_init( &S, outlen ) < 0 ) return -1;
    }
 
-   blake2s_update( S, ( uint8_t * )in, inlen );
-   blake2s_final( S, out, outlen );
+   blake2s_update( &S, ( uint8_t * )in, inlen );
+   blake2s_final( &S, out, outlen );
    return 0;
 }
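
A minimal illustration of the state-declaration change above, with stand-in
types and names rather than the real blake2s API: the single-element-array
idiom lets the name decay to a pointer at call sites, while the replacement
declares a plain object and passes its address explicitly. The generated
code is the same; only the intent is clearer.

   #include <stdint.h>

   typedef struct { uint32_t h[8]; } state_t;          // stand-in state
   static void state_init( state_t *s ) { s->h[0] = 0; } // stand-in API

   static void old_style( void )
   {
      state_t S[1];      // array of one: 'S' decays to state_t* when passed
      state_init( S );
   }

   static void new_style( void )
   {
      state_t S;         // plain object: pass its address explicitly
      state_init( &S );
   }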

View File

@@ -116,7 +116,7 @@ extern "C" {
    uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
 } blake2s_param;
 
-ALIGN( 64 ) typedef struct __blake2s_state
+typedef struct ALIGN( 64 ) __blake2s_state
 {
    uint32_t h[8];
    uint32_t t[2];

View File

@@ -18,7 +18,7 @@
 #endif
 
 // state context
-ALIGN(64) typedef struct {
+typedef ALIGN(64) struct {
    uint8_t b[128];    // input buffer
    uint64_t h[8];     // chained state
    uint64_t t[2];     // total number of bytes

View File

@@ -867,40 +867,35 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
    qt[30] = expand2s8( qt, M, H, 30 );
    qt[31] = expand2s8( qt, M, H, 31 );
 
-   xl = _mm256_xor_si256(
-                 mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
-                 mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
-   xh = _mm256_xor_si256( xl, _mm256_xor_si256(
-                 mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
-                 mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+   xl = mm256_xor3( mm256_xor3( qt[16], qt[17], qt[18] ),
+                    mm256_xor3( qt[19], qt[20], qt[21] ),
+                    _mm256_xor_si256( qt[22], qt[23] ) );
+   xh = mm256_xor3( mm256_xor3( xl, qt[24], qt[25] ),
+                    mm256_xor3( qt[26], qt[27], qt[28] ),
+                    mm256_xor3( qt[29], qt[30], qt[31] ) );
 
 #define DH1L( m, sl, sr, a, b, c ) \
-   _mm256_add_epi32( \
-      _mm256_xor_si256( M[m], \
-         _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
-                           _mm256_srli_epi32( qt[a], sr ) ) ), \
-      _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+   _mm256_add_epi32( mm256_xor3( M[m], _mm256_slli_epi32( xh, sl ), \
+                                 _mm256_srli_epi32( qt[a], sr ) ), \
+                     mm256_xor3( xl, qt[b], qt[c] ) )
 
 #define DH1R( m, sl, sr, a, b, c ) \
-   _mm256_add_epi32( \
-      _mm256_xor_si256( M[m], \
-         _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
-                           _mm256_slli_epi32( qt[a], sr ) ) ), \
-      _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
+   _mm256_add_epi32( mm256_xor3( M[m], _mm256_srli_epi32( xh, sl ), \
+                                 _mm256_slli_epi32( qt[a], sr ) ), \
+                     mm256_xor3( xl, qt[b], qt[c] ) )
 
 #define DH2L( m, rl, sl, h, a, b, c ) \
    _mm256_add_epi32( _mm256_add_epi32( \
                      mm256_rol_32( dH[h], rl ), \
-                     _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
-                     _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
-                                       _mm256_xor_si256( qt[b], qt[c] ) ) );
+                     mm256_xor3( xh, qt[a], M[m] ) ), \
+                     mm256_xor3( _mm256_slli_epi32( xl, sl ), qt[b], qt[c] ) )
 
 #define DH2R( m, rl, sr, h, a, b, c ) \
    _mm256_add_epi32( _mm256_add_epi32( \
                      mm256_rol_32( dH[h], rl ), \
-                     _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
-                     _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
-                                       _mm256_xor_si256( qt[b], qt[c] ) ) );
+                     mm256_xor3( xh, qt[a], M[m] ) ), \
+                     mm256_xor3( _mm256_srli_epi32( xl, sr ), qt[b], qt[c] ) )
 
    dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
    dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );

@@ -924,88 +919,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
 #undef DH2L
 #undef DH2R
 
-/*
-   dH[ 0] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[0],
-                    _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
-                                      _mm256_srli_epi32( qt[16], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
-   dH[ 1] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[1],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 7 ),
-                                      _mm256_slli_epi32( qt[17], 8 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
-   dH[ 2] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[2],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 5 ),
-                                      _mm256_slli_epi32( qt[18], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
-   dH[ 3] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[3],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 1 ),
-                                      _mm256_slli_epi32( qt[19], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
-   dH[ 4] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[4],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 3 ),
-                                      _mm256_slli_epi32( qt[20], 0 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
-   dH[ 5] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[5],
-                    _mm256_xor_si256( _mm256_slli_epi32( xh, 6 ),
-                                      _mm256_srli_epi32( qt[21], 6 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
-   dH[ 6] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[6],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 4 ),
-                                      _mm256_slli_epi32( qt[22], 6 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
-   dH[ 7] = _mm256_add_epi32(
-                 _mm256_xor_si256( M[7],
-                    _mm256_xor_si256( _mm256_srli_epi32( xh, 11 ),
-                                      _mm256_slli_epi32( qt[23], 2 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
-   dH[ 8] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[4], 9 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
-                 _mm256_xor_si256( _mm256_slli_epi32( xl, 8 ),
-                                   _mm256_xor_si256( qt[23], qt[ 8] ) ) );
-   dH[ 9] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[5], 10 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 6 ),
-                                   _mm256_xor_si256( qt[16], qt[ 9] ) ) );
-   dH[10] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[6], 11 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
-                 _mm256_xor_si256( _mm256_slli_epi32( xl, 6 ),
-                                   _mm256_xor_si256( qt[17], qt[10] ) ) );
-   dH[11] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[7], 12 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
-                 _mm256_xor_si256( _mm256_slli_epi32( xl, 4 ),
-                                   _mm256_xor_si256( qt[18], qt[11] ) ) );
-   dH[12] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[0], 13 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 3 ),
-                                   _mm256_xor_si256( qt[19], qt[12] ) ) );
-   dH[13] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[1], 14 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 4 ),
-                                   _mm256_xor_si256( qt[20], qt[13] ) ) );
-   dH[14] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[2], 15 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 7 ),
-                                   _mm256_xor_si256( qt[21], qt[14] ) ) );
-   dH[15] = _mm256_add_epi32( _mm256_add_epi32(
-                 mm256_rol_32( dH[3], 16 ),
-                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
-                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
-*/
 }
 
 static const __m256i final_s8[16] =

@@ -1422,40 +1335,35 @@ void compress_small_16way( const __m512i *M, const __m512i H[16],
    qt[30] = expand2s16( qt, M, H, 30 );
    qt[31] = expand2s16( qt, M, H, 31 );
 
-   xl = _mm512_xor_si512(
-                 mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
-                 mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
-   xh = _mm512_xor_si512( xl, _mm512_xor_si512(
-                 mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
-                 mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+   xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
+                    mm512_xor3( qt[19], qt[20], qt[21] ),
+                    _mm512_xor_si512( qt[22], qt[23] ) );
+   xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
+                    mm512_xor3( qt[26], qt[27], qt[28] ),
+                    mm512_xor3( qt[29], qt[30], qt[31] ) );
 
 #define DH1L( m, sl, sr, a, b, c ) \
-   _mm512_add_epi32( \
-      _mm512_xor_si512( M[m], \
-         _mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
-                           _mm512_srli_epi32( qt[a], sr ) ) ), \
-      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+   _mm512_add_epi32( mm512_xor3( M[m], _mm512_slli_epi32( xh, sl ), \
+                                 _mm512_srli_epi32( qt[a], sr ) ), \
+                     mm512_xor3( xl, qt[b], qt[c] ) )
 
 #define DH1R( m, sl, sr, a, b, c ) \
-   _mm512_add_epi32( \
-      _mm512_xor_si512( M[m], \
-         _mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \
-                           _mm512_slli_epi32( qt[a], sr ) ) ), \
-      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+   _mm512_add_epi32( mm512_xor3( M[m], _mm512_srli_epi32( xh, sl ), \
+                                 _mm512_slli_epi32( qt[a], sr ) ), \
+                     mm512_xor3( xl, qt[b], qt[c] ) )
 
 #define DH2L( m, rl, sl, h, a, b, c ) \
    _mm512_add_epi32( _mm512_add_epi32( \
                      mm512_rol_32( dH[h], rl ), \
-                     _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
-                     _mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
-                                       _mm512_xor_si512( qt[b], qt[c] ) ) );
+                     mm512_xor3( xh, qt[a], M[m] ) ), \
+                     mm512_xor3( _mm512_slli_epi32( xl, sl ), qt[b], qt[c] ) )
 
 #define DH2R( m, rl, sr, h, a, b, c ) \
    _mm512_add_epi32( _mm512_add_epi32( \
                      mm512_rol_32( dH[h], rl ), \
-                     _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
-                     _mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
-                                       _mm512_xor_si512( qt[b], qt[c] ) ) );
+                     mm512_xor3( xh, qt[a], M[m] ) ), \
+                     mm512_xor3( _mm512_srli_epi32( xl, sr ), qt[b], qt[c] ) )
 
    dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
    dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );

View File

@@ -1285,40 +1285,35 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
    qt[30] = expand2b8( qt, M, H, 30 );
    qt[31] = expand2b8( qt, M, H, 31 );
 
-   xl = _mm512_xor_si512(
-                 mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
-                 mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
-   xh = _mm512_xor_si512( xl, _mm512_xor_si512(
-                 mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
-                 mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+   xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
+                    mm512_xor3( qt[19], qt[20], qt[21] ),
+                    _mm512_xor_si512( qt[22], qt[23] ) );
+   xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
+                    mm512_xor3( qt[26], qt[27], qt[28] ),
+                    mm512_xor3( qt[29], qt[30], qt[31] ) );
 
 #define DH1L( m, sl, sr, a, b, c ) \
-   _mm512_add_epi64( \
-      _mm512_xor_si512( M[m], \
-         _mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
-                           _mm512_srli_epi64( qt[a], sr ) ) ), \
-      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+   _mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \
+                                 _mm512_srli_epi64( qt[a], sr ) ), \
+                     mm512_xor3( xl, qt[b], qt[c] ) )
 
 #define DH1R( m, sl, sr, a, b, c ) \
-   _mm512_add_epi64( \
-      _mm512_xor_si512( M[m], \
-         _mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \
-                           _mm512_slli_epi64( qt[a], sr ) ) ), \
-      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
+   _mm512_add_epi64( mm512_xor3( M[m], _mm512_srli_epi64( xh, sl ), \
+                                 _mm512_slli_epi64( qt[a], sr ) ), \
+                     mm512_xor3( xl, qt[b], qt[c] ) )
 
 #define DH2L( m, rl, sl, h, a, b, c ) \
    _mm512_add_epi64( _mm512_add_epi64( \
                      mm512_rol_64( dH[h], rl ), \
-                     _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
-                     _mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
-                                       _mm512_xor_si512( qt[b], qt[c] ) ) );
+                     mm512_xor3( xh, qt[a], M[m] ) ), \
+                     mm512_xor3( _mm512_slli_epi64( xl, sl ), qt[b], qt[c] ) )
 
 #define DH2R( m, rl, sr, h, a, b, c ) \
    _mm512_add_epi64( _mm512_add_epi64( \
                      mm512_rol_64( dH[h], rl ), \
-                     _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
-                     _mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \
-                                       _mm512_xor_si512( qt[b], qt[c] ) ) );
+                     mm512_xor3( xh, qt[a], M[m] ) ), \
+                     mm512_xor3( _mm512_srli_epi64( xl, sr ), qt[b], qt[c] ) )
 
    dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );

View File

@@ -53,6 +53,20 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x000
 MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
 
+#define ECHO_SUBBYTES4(state, j) \
+   state[0][j] = _mm_aesenc_si128(state[0][j], k1);\
+   k1 = _mm_add_epi32(k1, M128(const1));\
+   state[1][j] = _mm_aesenc_si128(state[1][j], k1);\
+   k1 = _mm_add_epi32(k1, M128(const1));\
+   state[2][j] = _mm_aesenc_si128(state[2][j], k1);\
+   k1 = _mm_add_epi32(k1, M128(const1));\
+   state[3][j] = _mm_aesenc_si128(state[3][j], k1);\
+   k1 = _mm_add_epi32(k1, M128(const1));\
+   state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \
+   state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \
+   state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \
+   state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero )
+
 #define ECHO_SUBBYTES(state, i, j) \
    state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
    k1 = _mm_add_epi32(k1, M128(const1));\

@@ -73,7 +87,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
    t1 = _mm_and_si128(t1, M128(lsbmask));\
    t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
    s2 = _mm_xor_si128(s2, t2);\
-   state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
+   state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
    state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
    state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
    state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\

@@ -83,7 +97,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
    t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
    s2 = _mm_xor_si128(s2, t2);\
    state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
-   state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
+   state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
    state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
    state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
    s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\

@@ -93,10 +107,29 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
    s2 = _mm_xor_si128(s2, t2);\
    state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
    state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
-   state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
+   state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
    state2[3][j] = _mm_xor_si128(state2[3][j], s2)
 
+#define ECHO_ROUND_UNROLL2 \
+   ECHO_SUBBYTES4(_state, 0);\
+   ECHO_SUBBYTES4(_state, 1);\
+   ECHO_SUBBYTES4(_state, 2);\
+   ECHO_SUBBYTES4(_state, 3);\
+   ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
+   ECHO_SUBBYTES4(_state2, 0);\
+   ECHO_SUBBYTES4(_state2, 1);\
+   ECHO_SUBBYTES4(_state2, 2);\
+   ECHO_SUBBYTES4(_state2, 3);\
+   ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+
+/*
 #define ECHO_ROUND_UNROLL2 \
    ECHO_SUBBYTES(_state, 0, 0);\
    ECHO_SUBBYTES(_state, 1, 0);\

@@ -138,7 +171,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
    ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
    ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
    ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+*/
 
 #define SAVESTATE(dst, src)\
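
The new ECHO_SUBBYTES4 macros fuse the four per-row SubBytes steps of one
column, keeping the round-counter increments of k1 in the same order as the
old per-cell macro and batching the second, zero-keyed AES round that ECHO's
SubWords step specifies. A loop-form sketch of what one
ECHO_SUBBYTES4(state, j) invocation expands to, with 'one' standing in for
M128(const1):

   #include <immintrin.h>

   // Four keyed AES rounds, one per row of column j, bumping the round
   // counter k1 after each, then the zero-keyed second round per cell.
   static inline void echo_subbytes4( __m128i state[4][4], int j,
                                      __m128i *k1, __m128i one )
   {
      for ( int i = 0; i < 4; i++ )
      {
         state[i][j] = _mm_aesenc_si128( state[i][j], *k1 );
         *k1 = _mm_add_epi32( *k1, one );
      }
      for ( int i = 0; i < 4; i++ )
         state[i][j] = _mm_aesenc_si128( state[i][j], _mm_setzero_si128() );
   }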

View File

@@ -13,12 +13,19 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 
-//#define mul2mask m512_const2_64( 0, 0x00001b00 )
-//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
-//_mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
-
-//#define lsbmask m512_const1_32( 0x01010101 )
+#define ECHO_SUBBYTES4(state, j) \
+   state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
+   k1 = _mm512_add_epi32( k1, one ); \
+   state[1][j] = _mm512_aesenc_epi128( state[1][j], k1 ); \
+   k1 = _mm512_add_epi32( k1, one ); \
+   state[2][j] = _mm512_aesenc_epi128( state[2][j], k1 ); \
+   k1 = _mm512_add_epi32( k1, one ); \
+   state[3][j] = _mm512_aesenc_epi128( state[3][j], k1 ); \
+   k1 = _mm512_add_epi32( k1, one ); \
+   state[0][j] = _mm512_aesenc_epi128( state[0][j], m512_zero ); \
+   state[1][j] = _mm512_aesenc_epi128( state[1][j], m512_zero ); \
+   state[2][j] = _mm512_aesenc_epi128( state[2][j], m512_zero ); \
+   state[3][j] = _mm512_aesenc_epi128( state[3][j], m512_zero )
 
 #define ECHO_SUBBYTES( state, i, j ) \
    state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \

@@ -44,8 +51,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
    t1 = _mm512_and_si512( t1, lsbmask ); \
    t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
    s2 = _mm512_xor_si512( s2, t2 );\
-   state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
-                                        _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
+   state2[ 0 ][ j ] = mm512_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
    state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
    state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
    state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \

@@ -55,8 +61,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
    t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
    s2 = _mm512_xor_si512( s2, t2 ); \
    state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
-   state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
-                                        _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
+   state2[ 1 ][ j ] = mm512_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
    state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
    state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
    s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \

@@ -66,11 +71,29 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
    s2 = _mm512_xor_si512( s2, t2 ); \
    state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
    state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
-   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
-                                        _mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
+   state2[ 2 ][ j ] = mm512_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
    state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
 } while(0)
 
+#define ECHO_ROUND_UNROLL2 \
+   ECHO_SUBBYTES4(_state, 0);\
+   ECHO_SUBBYTES4(_state, 1);\
+   ECHO_SUBBYTES4(_state, 2);\
+   ECHO_SUBBYTES4(_state, 3);\
+   ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
+   ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
+   ECHO_SUBBYTES4(_state2, 0);\
+   ECHO_SUBBYTES4(_state2, 1);\
+   ECHO_SUBBYTES4(_state2, 2);\
+   ECHO_SUBBYTES4(_state2, 3);\
+   ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
+   ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+
+/*
 #define ECHO_ROUND_UNROLL2 \
    ECHO_SUBBYTES(_state, 0, 0);\
    ECHO_SUBBYTES(_state, 1, 0);\

@@ -112,6 +135,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
    ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
    ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
    ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+*/
 
 #define SAVESTATE(dst, src)\
    dst[0][0] = src[0][0];\

@@ -405,6 +429,20 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
 #define lsbmask_2way m256_const1_32( 0x01010101 )
 
+#define ECHO_SUBBYTES4_2WAY( state, j ) \
+   state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \
+   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
+   state[1][j] = _mm256_aesenc_epi128( state[1][j], k1 ); \
+   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
+   state[2][j] = _mm256_aesenc_epi128( state[2][j], k1 ); \
+   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
+   state[3][j] = _mm256_aesenc_epi128( state[3][j], k1 ); \
+   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
+   state[0][j] = _mm256_aesenc_epi128( state[0][j], m256_zero ); \
+   state[1][j] = _mm256_aesenc_epi128( state[1][j], m256_zero ); \
+   state[2][j] = _mm256_aesenc_epi128( state[2][j], m256_zero ); \
+   state[3][j] = _mm256_aesenc_epi128( state[3][j], m256_zero )
+
 #define ECHO_SUBBYTES_2WAY( state, i, j ) \
    state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
    k1 = _mm256_add_epi32( k1, m256_one_128 ); \

@@ -456,6 +494,25 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
    state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
 } while(0)
 
+#define ECHO_ROUND_UNROLL2_2WAY \
+   ECHO_SUBBYTES4_2WAY(_state, 0);\
+   ECHO_SUBBYTES4_2WAY(_state, 1);\
+   ECHO_SUBBYTES4_2WAY(_state, 2);\
+   ECHO_SUBBYTES4_2WAY(_state, 3);\
+   ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\
+   ECHO_SUBBYTES4_2WAY(_state2, 0);\
+   ECHO_SUBBYTES4_2WAY(_state2, 1);\
+   ECHO_SUBBYTES4_2WAY(_state2, 2);\
+   ECHO_SUBBYTES4_2WAY(_state2, 3);\
+   ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
+   ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
+
+/*
 #define ECHO_ROUND_UNROLL2_2WAY \
    ECHO_SUBBYTES_2WAY(_state, 0, 0);\
    ECHO_SUBBYTES_2WAY(_state, 1, 0);\

@@ -497,6 +554,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
    ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
    ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
    ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
+*/
 
 #define SAVESTATE_2WAY(dst, src)\
    dst[0][0] = src[0][0];\
dst[0][0] = src[0][0];\ dst[0][0] = src[0][0];\

View File

@@ -124,7 +124,16 @@ MYALIGN const unsigned int _IV512[] = {
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\ t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s7 = _mm_xor_si128(s7, t1) s7 = _mm_xor_si128(s7, t1)
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\ #define PRESUPERMIX(x, t1, s1, s2, t2)\
s1 = x;\ s1 = x;\
s2 = _mm_add_epi8(x, x);\ s2 = _mm_add_epi8(x, x);\
@@ -133,37 +142,59 @@ MYALIGN const unsigned int _IV512[] = {
t1 = _mm_and_si128(t1, M128(_lsbmask2));\ t1 = _mm_and_si128(t1, M128(_lsbmask2));\
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\ s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1)) x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/
#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\ #define SUBSTITUTE(r0, _t2 )\
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\ _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
_t2 = _mm_aesenclast_si128( _t2, m128_zero ) _t2 = _mm_aesenclast_si128( _t2, m128_zero )
#define SUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = mm128_xor3( t4, t1, t2 ); \
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t4 = mm128_xor3( t4, t2, t1 ); \
t0 = _mm_xor_si128(t0, t3);\
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
/*
#define SUPERMIX(t0, t1, t2, t3, t4)\ #define SUPERMIX(t0, t1, t2, t3, t4)\
PRESUPERMIX(t0, t1, t2, t3, t4);\ PRESUPERMIX(t0, t1, t2, t3, t4);\
POSTSUPERMIX(t0, t1, t2, t3, t4) POSTSUPERMIX(t0, t1, t2, t3, t4)
*/
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\ #define POSTSUPERMIX(t0, t1, t2, t3, t4)\
t1 = t2;\ t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\
t4 = t1;\ t4 = t1;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\ t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\ t4 = _mm_xor_si128(t4, t1);\
t1 = t4;\ t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\ t4 = _mm_xor_si128(t4, t1);\
t1 = t2;\ t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\
t4 = _mm_xor_si128(t4, t1);\ t4 = _mm_xor_si128(t4, t1);\
t2 = _mm_xor_si128(t2, t3);\ t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_xor_si128(t2, t0);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = _mm_xor_si128(t4, t2);\ t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\ t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t4 = _mm_xor_si128(t4, t2);\ t4 = _mm_xor_si128(t4, t2);\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\ t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = t0;\ t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\
t4 = _mm_xor_si128(t4, t1);\ t4 = _mm_xor_si128(t4, t1);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\ t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t0 = _mm_xor_si128(t0, t3);\ t0 = _mm_xor_si128(t0, t3);\
@@ -171,59 +202,55 @@ MYALIGN const unsigned int _IV512[] = {
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\ t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
t4 = _mm_xor_si128(t4, t0) t4 = _mm_xor_si128(t4, t0)
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\ #define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\ CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\ PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\ SUBSTITUTE(r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\ SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\ r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \ _t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\ r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\ UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\ r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \ _t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\ r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\ UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ SUBSTITUTE(r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\ SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
UNPACK_S0(r3c, r3a, _t3) UNPACK_S0(r3c, r3a, _t3)
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\ #define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\ CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\ PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\ SUBSTITUTE( r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\ SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\ _t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\ r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \ _t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\ r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\ UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\ SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\ SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\ _t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\ r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \ _t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\ r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\ UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\ SUBSTITUTE( r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\ SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
_t0 = _mm_shuffle_epi32(r3c, 0x39);\ _t0 = _mm_shuffle_epi32(r3c, 0x39);\
r4c = _mm_xor_si128(r4c, _t0);\ r4c = _mm_xor_si128(r4c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \ _t0 = mm128_mask_32( _t0, 8 ); \
r4d = _mm_xor_si128(r4d, _t0);\ r4d = _mm_xor_si128(r4d, _t0);\
UNPACK_S0(r3c, r3a, _t3);\ UNPACK_S0(r3c, r3a, _t3);\
SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\ SUBSTITUTE( r4c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\ SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
UNPACK_S0(r4c, r4a, _t3) UNPACK_S0(r4c, r4a, _t3)
#define LOADCOLUMN(x, s, a)\ #define LOADCOLUMN(x, s, a)\
block[0] = col[(base + a + 0) % s];\ block[0] = col[(base + a + 0) % s];\
block[1] = col[(base + a + 1) % s];\ block[1] = col[(base + a + 1) % s];\
@@ -247,14 +274,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
case 1: case 1:
TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4],
ctx->state[5], ctx->state[ 6], ctx->state[8], ctx->state[5], ctx->state[ 6], ctx->state[8],
ctx->state[9], ctx->state[10], _t0, _t1, _t2 ); ctx->state[9], ctx->state[10], _t0, _t1, _t2 );
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7],
ctx->state[1], ctx->state[7], ctx->state[8], ctx->state[1], ctx->state[7], ctx->state[8],
ctx->state[6], ctx->state[0], ctx->state[6], ctx->state[6], ctx->state[0], ctx->state[6],
ctx->state[7], ctx->state[5], ctx->state[11], ctx->state[7], ctx->state[5], ctx->state[11],
ctx->state[5], ctx->state[6], ctx->state[4], ctx->state[5], ctx->state[6], ctx->state[4],
ctx->state[10] ); ctx->state[10] );
ctx->base++; ctx->base++;
pmsg += 4; pmsg += 4;
uBlockCount--; uBlockCount--;
@@ -263,14 +290,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
      case 2:
         TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0],
                       ctx->state[ 1], ctx->state[2], ctx->state[4],
                       ctx->state[ 5], ctx->state[6], _t0, _t1, _t2);
         SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3],
                        ctx->state[9], ctx->state[3], ctx->state[4],
                        ctx->state[2], ctx->state[8], ctx->state[2],
                        ctx->state[3], ctx->state[1], ctx->state[7],
                        ctx->state[1], ctx->state[2], ctx->state[0],
                        ctx->state[6]);
         ctx->base = 0;
         pmsg += 4;
@@ -278,44 +305,42 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
         break;
   }

   while( uBlockCount > 0 )
   {
      TIX512( pmsg, ctx->state[ 7],ctx->state[2],ctx->state[8],ctx->state[9],
                    ctx->state[10],ctx->state[0],ctx->state[1],ctx->state[2],
              _t0, _t1, _t2 );
      SUBROUND512_4( ctx->state[0], ctx->state[1],ctx->state[11],ctx->state[5],
                     ctx->state[11],ctx->state[0],ctx->state[10],ctx->state[4],
                     ctx->state[10],ctx->state[11],ctx->state[9],ctx->state[3],
                     ctx->state[9],ctx->state[10],ctx->state[8],ctx->state[2] );
      ctx->base++;
      pmsg += 4;
      uBlockCount--;
      if( uBlockCount == 0 ) break;

      TIX512( pmsg, ctx->state[3],ctx->state[10],ctx->state[4],ctx->state[5],
                    ctx->state[6],ctx->state[8], ctx->state[9],ctx->state[10],
              _t0, _t1, _t2 );
      SUBROUND512_4( ctx->state[8],ctx->state[9],ctx->state[7],ctx->state[1],
                     ctx->state[7],ctx->state[8],ctx->state[6],ctx->state[0],
                     ctx->state[6],ctx->state[7],ctx->state[5],ctx->state[11],
                     ctx->state[5],ctx->state[6],ctx->state[4],ctx->state[10] );
      ctx->base++;
      pmsg += 4;
      uBlockCount--;
      if( uBlockCount == 0 ) break;

      TIX512( pmsg, ctx->state[11],ctx->state[6],ctx->state[0],ctx->state[1],
                    ctx->state[2], ctx->state[4],ctx->state[5],ctx->state[6],
              _t0, _t1, _t2);
      SUBROUND512_4( ctx->state[4],ctx->state[5],ctx->state[3],ctx->state[9],
                     ctx->state[3],ctx->state[4],ctx->state[2],ctx->state[8],
                     ctx->state[2],ctx->state[3],ctx->state[1],ctx->state[7],
                     ctx->state[1],ctx->state[2],ctx->state[0],ctx->state[6]);
      ctx->base = 0;
      pmsg += 4;
@@ -326,8 +351,8 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
void Final512(hashState_fugue *ctx, BitSequence *hashval)
{
   unsigned int block[4] __attribute__ ((aligned (32)));
   unsigned int col[36] __attribute__ ((aligned (16)));
   unsigned int i, base;
   __m128i r0, _t0, _t1, _t2, _t3;
@@ -357,7 +382,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
      // SMIX
      LOADCOLUMN(r0, 36, 0);
-     SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
   }
@@ -375,7 +400,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
      // SMIX
      LOADCOLUMN(r0, 36, 0);
-     SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
@@ -390,7 +415,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
      // SMIX
      LOADCOLUMN(r0, 36, 0);
-     SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
@@ -405,7 +430,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
      // SMIX
      LOADCOLUMN(r0, 36, 0);
-     SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
@@ -420,7 +445,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
      // SMIX
      LOADCOLUMN(r0, 36, 0);
-     SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
   }
View File
@@ -67,11 +67,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
-  j = _mm_xor_si128(j, j);\
-  j = _mm_cmpgt_epi8(j, i);\
   j = _mm_cmpgt_epi8( m128_zero, i);\
   i = _mm_add_epi8(i, i);\
-  j = _mm_and_si128(j, k);\
-  i = _mm_xor_si128(i, j);\
   i = mm128_xorand(i, j, k );\
}

/**/
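The rewrite folds MUL2's four-instruction doubling into three: the signed compare against zero extracts each byte's top bit as a 0xFF mask directly, and the conditional reduction by 0x1b becomes one fused xor-and. A minimal, self-contained sketch of the trick; the mm128_xorand name follows the diff, while the vpternlogd immediate 0x78 (the truth table of a ^ (b & c)) and the test harness are my own illustration:

   #include <immintrin.h>
   #include <stdint.h>
   #include <stdio.h>

   // a ^ (b & c): one vpternlogd under AVX512VL, two ops otherwise.
   #if defined(__AVX512VL__)
   #define mm128_xorand( a, b, c )  _mm_ternarylogic_epi64( a, b, c, 0x78 )
   #else
   #define mm128_xorand( a, b, c )  _mm_xor_si128( a, _mm_and_si128( b, c ) )
   #endif

   // MUL2 as in the diff: double each byte in GF(2^8) mod x^8+x^4+x^3+x+1.
   #define MUL2( i, j, k ) { \
      j = _mm_cmpgt_epi8( _mm_setzero_si128(), i ); /* 0xFF where top bit set */ \
      i = _mm_add_epi8( i, i );                     /* shift each byte left 1 */ \
      i = mm128_xorand( i, j, k );                  /* xor 0x1b where it carried */ \
   }

   int main( void )
   {
      __m128i i = _mm_set1_epi8( (char)0x80 );      // xtime(0x80) must be 0x1b
      __m128i j, k = _mm_set1_epi8( 0x1b );
      MUL2( i, j, k );
      uint8_t out[16];
      _mm_storeu_si128( (__m128i*)out, i );
      printf( "0x%02x\n", out[0] );                 // prints 0x1b
      return 0;
   }

MixBytes invokes MUL2 sixteen times per round, so one instruction saved here is worth more than it looks.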
@@ -93,6 +91,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
  We almost fit into 16 registers, need only 3 spills to memory.
  This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
  K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm_xor_si128(a0, a1);\
b0 = a2;\
a1 = _mm_xor_si128(a1, a2);\
b1 = a3;\
TEMP2 = _mm_xor_si128(a2, a3);\
b2 = a4;\
a3 = _mm_xor_si128(a3, a4);\
b3 = a5;\
a4 = _mm_xor_si128(a4, a5);\
b4 = a6;\
a5 = _mm_xor_si128(a5, a6);\
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm128_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm128_xor3( b1, a5, a7 );\
b2 = mm128_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b3 = mm128_xor3( b3, a7, a1 ); \
b1 = a1;\
b6 = mm128_xor3( b6, a4, TEMP2 ); \
b4 = mm128_xor3( b4, a0, TEMP2 ); \
b7 = mm128_xor3( b7, a5, a3 ); \
b5 = mm128_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(TEMP2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm_xor_si128(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm_xor_si128(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm_xor_si128(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm_xor_si128(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm_xor_si128(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm_xor_si128(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm_xor_si128(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm_xor_si128(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm_xor_si128(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm_xor_si128(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm_xor_si128(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm_xor_si128(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm_xor_si128(b0, a3);\
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#else
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* t_i = a_i + a_{i+1} */\
  b6 = a0;\
@@ -189,6 +277,8 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
  b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#endif
/* one round
 * a0-a7 = input rows
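The ternary-logic MixBytes above also leans on a three-input XOR. A hedged sketch of a matching helper (the name follows the diff; 0x96 is the standard three-input parity table, so one vpternlogd does what the portable fallback needs two XORs for):

   #include <immintrin.h>

   // a ^ b ^ c in a single instruction where AVX512VL is available.
   #if defined(__AVX512VL__)
   #define mm128_xor3( a, b, c )  _mm_ternarylogic_epi64( a, b, c, 0x96 )
   #else
   #define mm128_xor3( a, b, c )  _mm_xor_si128( a, _mm_xor_si128( b, c ) )
   #endif

Each mm128_xor3 in the y-building section replaces a dependent xor-xor pair, which is where most of the ternary-logic savings in this path come from.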
View File
@@ -58,11 +58,9 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
-  j = _mm_xor_si128(j, j);\
-  j = _mm_cmpgt_epi8(j, i);\
   j = _mm_cmpgt_epi8( m128_zero, i);\
   i = _mm_add_epi8(i, i);\
-  j = _mm_and_si128(j, k);\
-  i = _mm_xor_si128(i, j);\
   i = mm128_xorand(i, j, k );\
}

/* Yet another implementation of MixBytes.
@@ -82,6 +80,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
  We almost fit into 16 registers, need only 3 spills to memory.
  This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
  K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm_xor_si128(a0, a1);\
b0 = a2;\
a1 = _mm_xor_si128(a1, a2);\
b1 = a3;\
TEMP2 = _mm_xor_si128(a2, a3);\
b2 = a4;\
a3 = _mm_xor_si128(a3, a4);\
b3 = a5;\
a4 = _mm_xor_si128(a4, a5);\
b4 = a6;\
a5 = _mm_xor_si128(a5, a6);\
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm128_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm128_xor3( b1, a5, a7 );\
b2 = mm128_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b3 = mm128_xor3( b3, a7, a1 ); \
b1 = a1;\
b6 = mm128_xor3( b6, a4, TEMP2 ); \
b4 = mm128_xor3( b4, a0, TEMP2 ); \
b7 = mm128_xor3( b7, a5, a3 ); \
b5 = mm128_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(TEMP2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm_xor_si128(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm_xor_si128(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm_xor_si128(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm_xor_si128(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm_xor_si128(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm_xor_si128(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm_xor_si128(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm_xor_si128(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm_xor_si128(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm_xor_si128(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm_xor_si128(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm_xor_si128(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm_xor_si128(b0, a3);\
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#else
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* t_i = a_i + a_{i+1} */\
  b6 = a0;\
@@ -178,6 +266,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
  b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#endif
/* one round
 * i = round number
 * a0-a7 = input rows
View File
@@ -96,11 +96,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
-  j = _mm512_xor_si512(j, j);\
-  j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
   j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
   i = _mm512_add_epi8(i, i);\
-  j = _mm512_and_si512(j, k);\
-  i = _mm512_xor_si512(i, j);\
   i = mm512_xorand( i, j, k );\
}
/* Yet another implementation of MixBytes.
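The 512-bit MUL2 above changes shape because AVX512BW compares write a mask register rather than a vector, so the byte-sign mask has to be rebuilt with vpmovm2b before the fused xor-and. A hedged sketch of the same doubling step as a standalone function (gf256_double and the helper macro are names I introduce for illustration):

   #include <immintrin.h>

   #define mm512_xorand( a, b, c )  _mm512_ternarylogic_epi64( a, b, c, 0x78 )

   // Double all 64 bytes in GF(2^8); poly_1b holds 0x1b in every byte.
   static inline __m512i gf256_double( __m512i i, const __m512i poly_1b )
   {
      __mmask64 hi = _mm512_cmpgt_epi8_mask( _mm512_setzero_si512(), i );
      __m512i j = _mm512_movm_epi8( hi );   // 0xFF where the top bit was set
      i = _mm512_add_epi8( i, i );
      return mm512_xorand( i, j, poly_1b );
   }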
@@ -120,6 +118,95 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
  We almost fit into 16 registers, need only 3 spills to memory.
  This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
  K. Matusiewicz, 2011/05/29 */
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
b0, b1, b2, b3, b4, b5, b6, b7) { \
/* t_i = a_i + a_{i+1} */\
b6 = a0; \
b7 = a1; \
a0 = _mm512_xor_si512( a0, a1 ); \
b0 = a2; \
a1 = _mm512_xor_si512( a1, a2 ); \
b1 = a3; \
TEMP2 = _mm512_xor_si512( a2, a3 ); \
b2 = a4; \
a3 = _mm512_xor_si512( a3, a4 ); \
b3 = a5; \
a4 = _mm512_xor_si512( a4, a5 );\
b4 = a6; \
a5 = _mm512_xor_si512( a5, a6 ); \
b5 = a7; \
a6 = _mm512_xor_si512( a6, a7 ); \
a7 = _mm512_xor_si512( a7, b6 ); \
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm512_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm512_xor3( b1, a5, a7 ); \
b2 = mm512_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0; \
b3 = mm512_xor3( b3, a7, a1 ); \
b1 = a1; \
b6 = mm512_xor3( b6, a4, TEMP2 ); \
b4 = mm512_xor3( b4, a0, TEMP2 ); \
b7 = mm512_xor3( b7, a5, a3 ); \
b5 = mm512_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm512_xor_si512( a0, a3 ); \
a1 = _mm512_xor_si512( a1, a4 ); \
a2 = _mm512_xor_si512( TEMP2, a5 ); \
a3 = _mm512_xor_si512( a3, a6 ); \
a4 = _mm512_xor_si512( a4, a7 ); \
a5 = _mm512_xor_si512( a5, b0 ); \
a6 = _mm512_xor_si512( a6, b1 ); \
a7 = _mm512_xor_si512( a7, TEMP2 ); \
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
MUL2( a0, b0, b1 ); \
a0 = _mm512_xor_si512( a0, TEMP0 ); \
MUL2( a1, b0, b1 ); \
a1 = _mm512_xor_si512( a1, TEMP1 ); \
MUL2( a2, b0, b1 ); \
a2 = _mm512_xor_si512( a2, b2 ); \
MUL2( a3, b0, b1 ); \
a3 = _mm512_xor_si512( a3, b3 ); \
MUL2( a4, b0, b1 ); \
a4 = _mm512_xor_si512( a4, b4 ); \
MUL2( a5, b0, b1 ); \
a5 = _mm512_xor_si512( a5, b5 ); \
MUL2( a6, b0, b1 ); \
a6 = _mm512_xor_si512( a6, b6 ); \
MUL2( a7, b0, b1 ); \
a7 = _mm512_xor_si512( a7, b7 ); \
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2( a0, b0, b1 ); \
b5 = _mm512_xor_si512( b5, a0 ); \
MUL2( a1, b0, b1 ); \
b6 = _mm512_xor_si512( b6, a1 ); \
MUL2( a2, b0, b1 ); \
b7 = _mm512_xor_si512( b7, a2 ); \
MUL2( a5, b0, b1 ); \
b2 = _mm512_xor_si512( b2, a5 ); \
MUL2( a6, b0, b1 ); \
b3 = _mm512_xor_si512( b3, a6 ); \
MUL2( a7, b0, b1 ); \
b4 = _mm512_xor_si512( b4, a7 ); \
MUL2( a3, b0, b1 ); \
MUL2( a4, b0, b1 ); \
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm512_xor_si512( b0, a3 ); \
b1 = _mm512_xor_si512( b1, a4 ); \
}/*MixBytes*/
#if 0
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* t_i = a_i + a_{i+1} */\
  b6 = a0;\
@@ -215,7 +302,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
  b0 = _mm512_xor_si512(b0, a3);\
  b1 = _mm512_xor_si512(b1, a4);\
}/*MixBytes*/
#endif
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
View File
@@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
-  j = _mm512_xor_si512(j, j);\
-  j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
   j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
   i = _mm512_add_epi8(i, i);\
-  j = _mm512_and_si512(j, k);\
-  i = _mm512_xor_si512(i, j);\
   i = mm512_xorand( i, j, k );\
}

/**/
@@ -130,100 +128,90 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
  We almost fit into 16 registers, need only 3 spills to memory.
  This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
  K. Matusiewicz, 2011/05/29 */
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
                  b0, b1, b2, b3, b4, b5, b6, b7) { \
  /* t_i = a_i + a_{i+1} */\
  b6 = a0; \
  b7 = a1; \
  a0 = _mm512_xor_si512( a0, a1 ); \
  b0 = a2; \
  a1 = _mm512_xor_si512( a1, a2 ); \
  b1 = a3; \
-  a2 = _mm512_xor_si512(a2, a3);\
  TEMP2 = _mm512_xor_si512( a2, a3 ); \
  b2 = a4; \
  a3 = _mm512_xor_si512( a3, a4 ); \
  b3 = a5; \
  a4 = _mm512_xor_si512( a4, a5 );\
  b4 = a6; \
  a5 = _mm512_xor_si512( a5, a6 ); \
  b5 = a7; \
  a6 = _mm512_xor_si512( a6, a7 ); \
  a7 = _mm512_xor_si512( a7, b6 ); \
  \
  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  b0 = _mm512_xor_si512(b0, a4);\
-  b6 = _mm512_xor_si512(b6, a4);\
-  b1 = _mm512_xor_si512(b1, a5);\
-  b7 = _mm512_xor_si512(b7, a5);\
-  b2 = _mm512_xor_si512(b2, a6);\
-  b0 = _mm512_xor_si512(b0, a6);\
  TEMP0 = mm512_xor3( b0, a4, a6 ); \
  /* spill values y_4, y_5 to memory */\
-  TEMP0 = b0;\
-  b3 = _mm512_xor_si512(b3, a7);\
-  b1 = _mm512_xor_si512(b1, a7);\
-  TEMP1 = b1;\
-  b4 = _mm512_xor_si512(b4, a0);\
-  b2 = _mm512_xor_si512(b2, a0);\
  TEMP1 = mm512_xor3( b1, a5, a7 ); \
  b2 = mm512_xor3( b2, a6, a0 ); \
  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
  b0 = a0; \
-  b5 = _mm512_xor_si512(b5, a1);\
-  b3 = _mm512_xor_si512(b3, a1);\
  b3 = mm512_xor3( b3, a7, a1 ); \
  b1 = a1; \
-  b6 = _mm512_xor_si512(b6, a2);\
-  b4 = _mm512_xor_si512(b4, a2);\
-  TEMP2 = a2;\
-  b7 = _mm512_xor_si512(b7, a3);\
-  b5 = _mm512_xor_si512(b5, a3);\
  b6 = mm512_xor3( b6, a4, TEMP2 ); \
  b4 = mm512_xor3( b4, a0, TEMP2 ); \
  b7 = mm512_xor3( b7, a5, a3 ); \
  b5 = mm512_xor3( b5, a1, a3 ); \
  \
  /* compute x_i = t_i + t_{i+3} */\
  a0 = _mm512_xor_si512( a0, a3 ); \
  a1 = _mm512_xor_si512( a1, a4 ); \
-  a2 = _mm512_xor_si512(a2, a5);\
  a2 = _mm512_xor_si512( TEMP2, a5 ); \
  a3 = _mm512_xor_si512( a3, a6 ); \
  a4 = _mm512_xor_si512( a4, a7 ); \
  a5 = _mm512_xor_si512( a5, b0 ); \
  a6 = _mm512_xor_si512( a6, b1 ); \
  a7 = _mm512_xor_si512( a7, TEMP2 ); \
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
  b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
  MUL2( a0, b0, b1 ); \
  a0 = _mm512_xor_si512( a0, TEMP0 ); \
  MUL2( a1, b0, b1 ); \
  a1 = _mm512_xor_si512( a1, TEMP1 ); \
  MUL2( a2, b0, b1 ); \
  a2 = _mm512_xor_si512( a2, b2 ); \
  MUL2( a3, b0, b1 ); \
  a3 = _mm512_xor_si512( a3, b3 ); \
  MUL2( a4, b0, b1 ); \
  a4 = _mm512_xor_si512( a4, b4 ); \
  MUL2( a5, b0, b1 ); \
  a5 = _mm512_xor_si512( a5, b5 ); \
  MUL2( a6, b0, b1 ); \
  a6 = _mm512_xor_si512( a6, b6 ); \
  MUL2( a7, b0, b1 ); \
  a7 = _mm512_xor_si512( a7, b7 ); \
  \
  /* compute v_i : double w_i */\
  /* add to y_4 y_5 .. v3, v4, ... */\
  MUL2( a0, b0, b1 ); \
  b5 = _mm512_xor_si512( b5, a0 ); \
  MUL2( a1, b0, b1 ); \
  b6 = _mm512_xor_si512( b6, a1 ); \
  MUL2( a2, b0, b1 ); \
  b7 = _mm512_xor_si512( b7, a2 ); \
  MUL2( a5, b0, b1 ); \
  b2 = _mm512_xor_si512( b2, a5 ); \
  MUL2( a6, b0, b1 ); \
  b3 = _mm512_xor_si512( b3, a6 ); \
  MUL2( a7, b0, b1 ); \
  b4 = _mm512_xor_si512( b4, a7 ); \
  MUL2( a3, b0, b1 ); \
  MUL2( a4, b0, b1 ); \
  b0 = TEMP0;\
  b1 = TEMP1;\
  b0 = _mm512_xor_si512( b0, a3 ); \
  b1 = _mm512_xor_si512( b1, a4 ); \
}/*MixBytes*/
/* one round
@@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY =
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2_2WAY(i, j, k){\
-  j = _mm256_xor_si256(j, j);\
-  j = _mm256_cmpgt_epi8(j, i );\
   j = _mm256_cmpgt_epi8( m256_zero, i );\
   i = _mm256_add_epi8(i, i);\
-  j = _mm256_and_si256(j, k);\
-  i = _mm256_xor_si256(i, j);\
   i = mm256_xorand( i, j, k );\
}

#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
View File
@@ -44,6 +44,7 @@ void myriad_8way_hash( void *output, const void *input )
   rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
   groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(groestl512_4way_context) );
   groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );

   uint32_t hash0[20] __attribute__ ((aligned (64)));
@@ -58,8 +59,6 @@ void myriad_8way_hash( void *output, const void *input )
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
-  intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
-                   hash6, hash7 );

#else
@@ -76,27 +75,27 @@ void myriad_8way_hash( void *output, const void *input )
                 hash4, hash5, hash6, hash7, input, 640 );

   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
-  memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );

-  intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
-               hash4, hash5, hash6, hash7, 512 );

#endif
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
   sha256_8way_update( &ctx.sha, vhash, 64 );
   sha256_8way_close( &ctx.sha, output );
}
View File
@@ -560,22 +560,14 @@ do { \
      __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
      dm = mm512_negate_32( _mm512_or_si512( dm, \
                            _mm512_slli_epi64( dm, 32 ) ) ); \
-     m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[0] ) ) ); \
-     m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[1] ) ) ); \
-     m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[2] ) ) ); \
-     m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[3] ) ) ); \
-     m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[4] ) ) ); \
-     m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[5] ) ) ); \
-     m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[6] ) ) ); \
-     m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
-                                 m512_const1_64( tp[7] ) ) ); \
      m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
      m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
      m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
      m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \
      m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \
      m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \
      m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \
      m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \
      tp += 8; \
      db = _mm512_srli_epi64( db, 1 ); \
   } \
@@ -585,20 +577,13 @@ do { \
do { \
   __m512i t; \
   t = a; \
-  a = _mm512_and_si512( a, c ); \
-  a = _mm512_xor_si512( a, d ); \
-  c = _mm512_xor_si512( c, b ); \
-  c = _mm512_xor_si512( c, a ); \
-  d = _mm512_or_si512( d, t ); \
-  d = _mm512_xor_si512( d, b ); \
   a = mm512_xorand( d, a, c ); \
   c = mm512_xor3( a, b, c ); \
   b = mm512_xoror( b, d, t ); \
   t = _mm512_xor_si512( t, c ); \
-  b = d; \
-  d = _mm512_or_si512( d, t ); \
-  d = _mm512_xor_si512( d, a ); \
-  a = _mm512_and_si512( a, b ); \
-  t = _mm512_xor_si512( t, a ); \
-  b = _mm512_xor_si512( b, d ); \
-  b = _mm512_xor_si512( b, t ); \
   d = mm512_xoror( a, b, t ); \
   t = mm512_xorand( t, a, b ); \
   b = mm512_xor3( b, d, t ); \
   a = c; \
   c = b; \
   b = d; \
@@ -609,14 +594,12 @@ do { \
do { \
   a = mm512_rol_32( a, 13 ); \
   c = mm512_rol_32( c, 3 ); \
-  b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
-  d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
-                                             _mm512_slli_epi32( a, 3 ) ) ); \
   b = mm512_xor3( a, b, c ); \
   d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \
   b = mm512_rol_32( b, 1 ); \
   d = mm512_rol_32( d, 7 ); \
-  a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
-  c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
-                                             _mm512_slli_epi32( b, 7 ) ) ); \
   a = mm512_xor3( a, b, d ); \
   c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \
   a = mm512_rol_32( a, 5 ); \
   c = mm512_rol_32( c, 22 ); \
} while (0)
View File
@@ -522,50 +522,53 @@ do { \
// Haval-256 8 way 32 bit avx2
#if defined (__AVX512VL__)
// ( ~( a ^ b ) ) & c
#define mm256_andnotxor( a, b, c ) \
_mm256_ternarylogic_epi32( a, b, c, 0x82 )
#else
#define mm256_andnotxor( a, b, c ) \
_mm256_andnot_si256( _mm256_xor_si256( a, b ), c )
#endif
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( x0, \
-     _mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
-                       _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
-                                         _mm256_and_si256( x3, x6 ) ) ) ) \
   mm256_xor3( x0, mm256_andxor( x1, x0, x4 ), \
               _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
                                 _mm256_and_si256( x3, x6 ) ) ) \

#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( \
-     _mm256_and_si256( x2, \
-        _mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
-                          _mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
-                                            _mm256_xor_si256( x6, x0 ) ) ) ), \
-     _mm256_xor_si256( \
-        _mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
-        _mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
   mm256_xor3( mm256_andxor( x2, _mm256_andnot_si256( x3, x1 ), \
                             mm256_xor3( _mm256_and_si256( x4, x5 ), x6, x0 ) ), \
               mm256_andxor( x4, x1, x5 ), \
               mm256_xorand( x0, x3, x5 ) ) \

#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( \
-     _mm256_and_si256( x3, \
-        _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
-                          _mm256_xor_si256( x6, x0 ) ) ), \
-     _mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
-                       _mm256_and_si256( x2, x5 ) ), x0 ) )
   mm256_xor3( x0, \
               _mm256_and_si256( x3, \
                           mm256_xor3( _mm256_and_si256( x1, x2 ), x6, x0 ) ), \
               _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
                                 _mm256_and_si256( x2, x5 ) ) )

#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( \
-     _mm256_xor_si256( \
-        _mm256_and_si256( x3, \
-           _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
-                             _mm256_or_si256( x4, x6 ) ), x5 ) ), \
-        _mm256_and_si256( x4, \
-           _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
-                             _mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
-     _mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
   mm256_xor3( \
      mm256_andxor( x3, x5, \
                    _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
                                      _mm256_or_si256( x4, x6 ) ) ), \
      _mm256_and_si256( x4, \
                        mm256_xor3( x0, _mm256_andnot_si256( x2, x5 ), \
                                    _mm256_xor_si256( x1, x6 ) ) ), \
      mm256_xorand( x0, x2, x6 ) )

#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( \
-     _mm256_and_si256( x0, \
-        mm256_not( _mm256_xor_si256( \
-           _mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
-     _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
-                                         _mm256_and_si256( x2, x5 ) ), \
-                       _mm256_and_si256( x3, x6 ) ) )
   _mm256_xor_si256( \
      mm256_andnotxor( mm256_and3( x1, x2, x3 ), x5, x0 ), \
      mm256_xor3( _mm256_and_si256( x1, x4 ), \
                  _mm256_and_si256( x2, x5 ), \
                  _mm256_and_si256( x3, x6 ) ) )

#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
   F1_8W(x1, x0, x3, x5, x6, x2, x4)
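The 0x82 immediate in mm256_andnotxor above, like the others, can be sanity-checked by brute force: bit (a<<2 | b<<1 | c) of the immediate must equal the desired function value for that input pattern. A small self-contained checker, purely illustrative:

   #include <stdio.h>
   #include <stdint.h>

   int main( void )
   {
      const uint8_t imm = 0x82;                    // table for (~(a ^ b)) & c
      for ( int a = 0; a <= 1; a++ )
      for ( int b = 0; b <= 1; b++ )
      for ( int c = 0; c <= 1; c++ )
      {
         int want = ( ~( a ^ b ) & c ) & 1;        // reference value
         int got  = ( imm >> ( a << 2 | b << 1 | c ) ) & 1;
         printf( "a=%d b=%d c=%d -> %d expect %d %s\n",
                 a, b, c, got, want, got == want ? "ok" : "BAD" );
      }
      return 0;
   }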
View File
@@ -51,15 +51,15 @@ extern "C"{
do { \
   __m512i cc = _mm512_set1_epi64( c ); \
   x3 = mm512_not( x3 ); \
-  x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \
   x0 = mm512_xorandnot( x0, x2, cc ); \
-  tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \
   tmp = mm512_xorand( cc, x0, x1 ); \
-  x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \
   x0 = mm512_xorand( x0, x2, x3 ); \
-  x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \
   x3 = mm512_xorandnot( x3, x1, x2 ); \
-  x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \
   x1 = mm512_xorand( x1, x0, x2 ); \
-  x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \
   x2 = mm512_xorandnot( x2, x3, x0 ); \
-  x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \
   x0 = mm512_xoror( x0, x1, x3 ); \
-  x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \
   x3 = mm512_xorand( x3, x1, x2 ); \
-  x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \
   x1 = mm512_xorand( x1, tmp, x0 ); \
   x2 = _mm512_xor_si512( x2, tmp ); \
} while (0)
@@ -67,11 +67,11 @@ do { \
do { \
   x4 = _mm512_xor_si512( x4, x1 ); \
   x5 = _mm512_xor_si512( x5, x2 ); \
-  x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \
   x6 = mm512_xor3( x6, x3, x0 ); \
   x7 = _mm512_xor_si512( x7, x0 ); \
   x0 = _mm512_xor_si512( x0, x5 ); \
   x1 = _mm512_xor_si512( x1, x6 ); \
-  x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \
   x2 = mm512_xor3( x2, x7, x4 ); \
   x3 = _mm512_xor_si512( x3, x4 ); \
} while (0)
@@ -318,12 +318,12 @@ static const sph_u64 C[] = {
#define Wz_8W(x, c, n) \
do { \
   __m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \
-  x ## h = _mm512_or_si512( _mm512_and_si512( \
-                             _mm512_srli_epi64(x ## h, (n)), (c)), t ); \
   x ## h = mm512_orand( t, _mm512_srli_epi64( x ## h, (n) ), (c) ); \
   t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \
-  x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \
   x ## l = mm512_orand( t, (x ## l >> (n)), (c) ); \
} while (0)

#define W80(x)   Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
#define W81(x)   Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
#define W82(x)   Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
View File
@@ -76,6 +76,9 @@ static const uint64_t RC[] = {
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))

#include "keccak-macros.c"
@@ -238,6 +241,8 @@ keccak512_8way_close(void *cc, void *dst)
#undef NOT64
#undef ROL64
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND

#endif // AVX512
@@ -255,6 +260,8 @@ keccak512_8way_close(void *cc, void *dst)
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))

#include "keccak-macros.c"
@@ -419,5 +426,7 @@ keccak512_4way_close(void *cc, void *dst)
#undef NOT64
#undef ROL64
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND

#endif // AVX2
View File
@@ -110,20 +110,34 @@
#ifdef KHI_XO
#undef KHI_XO
#endif

#define KHI_XO(d, a, b, c) do { \
   XOROR(d, a, b, c); \
} while (0)

/*
#define KHI_XO(d, a, b, c) do { \
   DECL64(kt); \
   OR64(kt, b, c); \
   XOR64(d, a, kt); \
} while (0)
*/
#ifdef KHI_XA
#undef KHI_XA
#endif

#define KHI_XA(d, a, b, c) do { \
   XORAND(d, a, b, c); \
} while (0)

/*
#define KHI_XA(d, a, b, c) do { \
   DECL64(kt); \
   AND64(kt, b, c); \
   XOR64(d, a, kt); \
} while (0)
*/

#ifdef KHI
#undef KHI
View File
@@ -97,6 +97,21 @@ do { \
   MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
   ADD_CONSTANT4W(*x, *(x+4), c0, c1);
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
t = a0;\
a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512(a2,a3);\
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0);\
a1 = _mm512_or_si512(a1,a3);\
a3 = _mm512_xor_si512(a3,a2);\
t = _mm512_xor_si512(t,a1);\
a2 = _mm512_and_si512(a2,a1);\
a1 = mm512_xnor(a1,a0);\
a0 = t;
/*
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
   t = _mm512_load_si512(&a0);\
   a0 = _mm512_or_si512(a0,a1);\
@@ -115,7 +130,25 @@ do { \
   a2 = _mm512_and_si512(a2,a1);\
   a1 = _mm512_xor_si512(a1,a0);\
   a0 = _mm512_load_si512(&t);
*/
#define MIXWORD4W(a,b,t1,t2)\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,2);\
t2 = _mm512_srli_epi32(a,30);\
a = mm512_xoror( b, t1, t2 ); \
t1 = _mm512_slli_epi32(b,14);\
t2 = _mm512_srli_epi32(b,18);\
b = _mm512_or_si512(t1,t2);\
b = mm512_xoror( a, t1, t2 ); \
t1 = _mm512_slli_epi32(a,10);\
t2 = _mm512_srli_epi32(a,22);\
a = mm512_xoror( b, t1, t2 ); \
t1 = _mm512_slli_epi32(b,1);\
t2 = _mm512_srli_epi32(b,31);\
b = _mm512_or_si512(t1,t2);
/*
#define MIXWORD4W(a,b,t1,t2)\
   b = _mm512_xor_si512(a,b);\
   t1 = _mm512_slli_epi32(a,2);\
@@ -133,6 +166,7 @@ do { \
   t1 = _mm512_slli_epi32(b,1);\
   t2 = _mm512_srli_epi32(b,31);\
   b = _mm512_or_si512(t1,t2);
*/
#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
   a1 = _mm512_shuffle_epi32(a1,147);\
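One detail worth calling out in the new SUBCRUMB4W above: the inline ternary-logic call with immediate 0x87 is the bitwise complement of the 0x78 table, i.e. ~(a1 ^ (a3 & t)), exactly the xnor described in its comment. A hedged named form of that operation (the helper name is hypothetical, not from the source):

   #include <immintrin.h>

   // ~(a ^ (b & c)): complement of table 0x78, hence immediate 0x87.
   #define mm512_xnorand( a, b, c )  _mm512_ternarylogic_epi64( a, b, c, 0x87 )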
@@ -248,17 +282,10 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
    __m512i tmp[2];
    __m512i x[8];

-   t0 = chainv[0];
-   t1 = chainv[1];
-   t0 = _mm512_xor_si512( t0, chainv[2] );
-   t1 = _mm512_xor_si512( t1, chainv[3] );
-   t0 = _mm512_xor_si512( t0, chainv[4] );
-   t1 = _mm512_xor_si512( t1, chainv[5] );
-   t0 = _mm512_xor_si512( t0, chainv[6] );
-   t1 = _mm512_xor_si512( t1, chainv[7] );
-   t0 = _mm512_xor_si512( t0, chainv[8] );
-   t1 = _mm512_xor_si512( t1, chainv[9] );
    t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
    t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] );
    t0 = mm512_xor3( t0, chainv[6], chainv[8] );
    t1 = mm512_xor3( t1, chainv[7], chainv[9] );

    MULT24W( t0, t1 );
@@ -319,8 +346,8 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
    chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );

    MULT24W( chainv[0], chainv[1] );
-   chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
-   chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
    chainv[0] = mm512_xor3( chainv[0], t0, msg0 );
    chainv[1] = mm512_xor3( chainv[1], t1, msg1 );

    MULT24W( msg0, msg1 );
    chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
@@ -399,18 +426,10 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
    /*---- blank round with m=0 ----*/
    rnd512_4way( state, zero );

-   t[0] = chainv[0];
-   t[1] = chainv[1];
-   t[0] = _mm512_xor_si512( t[0], chainv[2] );
-   t[1] = _mm512_xor_si512( t[1], chainv[3] );
-   t[0] = _mm512_xor_si512( t[0], chainv[4] );
-   t[1] = _mm512_xor_si512( t[1], chainv[5] );
-   t[0] = _mm512_xor_si512( t[0], chainv[6] );
-   t[1] = _mm512_xor_si512( t[1], chainv[7] );
-   t[0] = _mm512_xor_si512( t[0], chainv[8] );
-   t[1] = _mm512_xor_si512( t[1], chainv[9] );
    t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] );
    t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] );
    t[0] = mm512_xor3( t[0], chainv[6], chainv[8] );
    t[1] = mm512_xor3( t[1], chainv[7], chainv[9] );

    t[0] = _mm512_shuffle_epi32( t[0], 27 );
    t[1] = _mm512_shuffle_epi32( t[1], 27 );
@@ -676,8 +695,6 @@ do { \
   a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)

-// confirm pointer arithmetic
-// ok but use array indexes
#define STEP_PART(x,c0,c1,t)\
   SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
   SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
@@ -688,23 +705,23 @@ do { \
   ADD_CONSTANT(*x, *(x+4), c0, c1);
#define SUBCRUMB(a0,a1,a2,a3,t)\
-  t = _mm256_load_si256(&a0);\
   t = a0;\
   a0 = _mm256_or_si256(a0,a1);\
   a2 = _mm256_xor_si256(a2,a3);\
-  a1 = _mm256_andnot_si256(a1, m256_neg1 );\
   a1 = mm256_not( a1 );\
   a0 = _mm256_xor_si256(a0,a3);\
   a3 = _mm256_and_si256(a3,t);\
   a1 = _mm256_xor_si256(a1,a3);\
   a3 = _mm256_xor_si256(a3,a2);\
   a2 = _mm256_and_si256(a2,a0);\
-  a0 = _mm256_andnot_si256(a0, m256_neg1 );\
   a0 = mm256_not( a0 );\
   a2 = _mm256_xor_si256(a2,a1);\
   a1 = _mm256_or_si256(a1,a3);\
   t = _mm256_xor_si256(t,a1);\
   a3 = _mm256_xor_si256(a3,a2);\
   a2 = _mm256_and_si256(a2,a1);\
   a1 = _mm256_xor_si256(a1,a0);\
-  a0 = _mm256_load_si256(&t);\
   a0 = t;\

#define MIXWORD(a,b,t1,t2)\
   b = _mm256_xor_si256(a,b);\
View File
@@ -312,10 +312,26 @@ do { \
   BUPDATE1_8W( 7, 1 ); \
} while (0)
#if defined(__AVX512VL__)
#define GAMMA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
#define THETA_8W(n0, n1, n2, n4) \
( g ## n0 = mm256_xor3( a ## n0, a ## n1, a ## n4 ) )
#else
#define GAMMA_8W(n0, n1, n2, n4) \
   (g ## n0 = _mm256_xor_si256( a ## n0, \
                         _mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) )
#define THETA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
a ## n4 ) ) )
#endif
#define PI_ALL_8W do { \
   a0 = g0; \
   a1 = mm256_rol_32( g7, 1 ); \
@@ -336,9 +352,6 @@ do { \
   a16 = mm256_rol_32( g10, 8 ); \
} while (0)

-#define THETA_8W(n0, n1, n2, n4) \
-   ( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
-                                                            a ## n4 ) ) )

#define SIGMA_ALL_8W do { \
   a0 = _mm256_xor_si256( g0, m256_one_32 ); \
View File
@@ -127,10 +127,8 @@ void quark_8way_hash( void *state, const void *input )
   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

-  if ( ( vh_mask & 0x0f ) != 0x0f )
-     groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
-  if ( ( vh_mask & 0xf0 ) != 0xf0 )
-     groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
   groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
   groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );

   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
@@ -139,22 +137,14 @@ void quark_8way_hash( void *state, const void *input )
   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhash, 512 );

-  if ( hash0[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-  if ( hash1[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-  if ( hash2[0] & 8)
-     groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-  if ( hash3[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-  if ( hash4[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
-  if ( hash5[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
-  if ( hash6[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
-  if ( hash7[0] & 8 )
-     groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
   groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
   groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
   groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
   groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
   groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
   groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
   groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
   groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );

   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                512 );
View File
@@ -39,17 +39,10 @@
void
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
{
-#if defined(HMAC_SPH_SHA)
   sph_sha256_context ctx;
   sph_sha256_init( &ctx );
   sph_sha256( &ctx, in, len );
   sph_sha256_close( &ctx, digest );
-#else
-  SHA256_CTX ctx;
-  SHA256_Init( &ctx );
-  SHA256_Update( &ctx, in, len );
-  SHA256_Final( digest, &ctx );
-#endif
}

/**
@@ -79,51 +72,29 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
   /* If Klen > 64, the key is really SHA256(K). */
   if ( Klen > 64 )
   {
-#if defined(HMAC_SPH_SHA)
      sph_sha256_init( &ctx->ictx );
      sph_sha256( &ctx->ictx, K, Klen );
      sph_sha256_close( &ctx->ictx, khash );
-#else
-     SHA256_Init( &ctx->ictx );
-     SHA256_Update( &ctx->ictx, K, Klen );
-     SHA256_Final( khash, &ctx->ictx );
-#endif
      K = khash;
      Klen = 32;
   }
   /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-#if defined(HMAC_SPH_SHA)
   sph_sha256_init( &ctx->ictx );
-#else
-  SHA256_Init( &ctx->ictx );
-#endif
   for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36;
   memset( pad + Klen, 0x36, 64 - Klen );
-#if defined(HMAC_SPH_SHA)
   sph_sha256( &ctx->ictx, pad, 64 );
-#else
-  SHA256_Update( &ctx->ictx, pad, 64 );
-#endif

   /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-#if defined(HMAC_SPH_SHA)
   sph_sha256_init( &ctx->octx );
-#else
-  SHA256_Init( &ctx->octx );
-#endif
   for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c;
   memset( pad + Klen, 0x5c, 64 - Klen );
-#if defined(HMAC_SPH_SHA)
   sph_sha256( &ctx->octx, pad, 64 );
-#else
-  SHA256_Update( &ctx->octx, pad, 64 );
-#endif
}

/* Add bytes to the HMAC-SHA256 operation. */
@@ -131,11 +102,7 @@ void
HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
{
   /* Feed data to the inner SHA256 operation. */
-#if defined(HMAC_SPH_SHA)
   sph_sha256( &ctx->ictx, in, len );
-#else
-  SHA256_Update( &ctx->ictx, in, len );
-#endif
}

/* Finish an HMAC-SHA256 operation. */
@@ -144,20 +111,9 @@ HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
{
   unsigned char ihash[32];

-#if defined(HMAC_SPH_SHA)
   sph_sha256_close( &ctx->ictx, ihash );
   sph_sha256( &ctx->octx, ihash, 32 );
   sph_sha256_close( &ctx->octx, digest );
-#else
-  /* Finish the inner SHA256 operation. */
-  SHA256_Final( ihash, &ctx->ictx );
-  /* Feed the inner hash to the outer SHA256 operation. */
-  SHA256_Update( &ctx->octx, ihash, 32 );
-  /* Finish the outer SHA256 operation. */
-  SHA256_Final( digest, &ctx->octx );
-#endif
}

/**
View File
@@ -29,24 +29,14 @@
#ifndef HMAC_SHA256_H__
#define HMAC_SHA256_H__

-//#define HMAC_SSL_SHA 1
-#define HMAC_SPH_SHA 1

#include <sys/types.h>
#include <stdint.h>
#include "sph_sha2.h"
-#include <openssl/sha.h>

typedef struct HMAC_SHA256Context
{
-#if defined(HMAC_SPH_SHA)
   sph_sha256_context ictx;
   sph_sha256_context octx;
-#else
-  SHA256_CTX ictx;
-  SHA256_CTX octx;
-#endif
} HMAC_SHA256_CTX;

void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
View File
@@ -59,6 +59,8 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
                          size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
                            const __m128i *state_in );

#endif // SSE2
@@ -77,6 +79,8 @@ void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
void sha256_8way_full( void *dst, const void *data, size_t len );
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
                            const __m256i *state_in );

#endif // AVX2
@@ -95,6 +99,12 @@ void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
void sha256_16way_full( void *dst, const void *data, size_t len );
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
                             const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
                                   const __m512i *state_in );
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
                                const __m512i *state_in, const __m512i *state_mid );

#endif // AVX512
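The new prehash/final-rounds split is what the v3.17.0 sha256t speedup rests on: the nonce sits in message word 3 of the second block, so the first three rounds consume only data that is constant across nonces and can be computed once per job. A hedged usage sketch against the prototypes above (the mid-state size and buffer layout are my assumptions, not taken from the header):

   #include <immintrin.h>

   void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
                                      const __m512i *state_in );
   void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
                                   const __m512i *state_in,
                                   const __m512i *state_mid );

   // Hash one 16-lane batch: the nonce-independent rounds land in 'mid'
   // (once per job); only the remaining rounds need to run per batch.
   static void hash_16lanes( __m512i hash[8], const __m512i W[16],
                             const __m512i state_in[8] )
   {
      __m512i mid[8];                   // assumed size of the saved mid-state
      sha256_16way_prehash_3rounds( mid, W, state_in );
      sha256_16way_final_rounds( hash, W, state_in, mid );
   }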
View File
@@ -195,8 +195,28 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
      hash[i] = swab32(hash[i]);
}

-extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
#if defined (__SHA__)
#include "algo/sha/sph_sha2.h"
void sha256d(unsigned char *hash, const unsigned char *data, int len)
{ {
sph_sha256_context ctx __attribute__ ((aligned (64)));
sph_sha256_init( &ctx );
sph_sha256( &ctx, data, len );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, hash );
}
#else
void sha256d(unsigned char *hash, const unsigned char *data, int len)
{
   uint32_t S[16], T[16];
   int i, r;
@@ -220,6 +240,8 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
      be32enc((uint32_t *)hash + i, T[i]);
}
#endif
static inline void sha256d_preextend(uint32_t *W)
{
   W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
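Whichever branch is compiled, sha256d keeps the same contract: hash = SHA-256(SHA-256(data)). A minimal usage sketch, assuming the translation unit above is linked in:

   #include <stdio.h>

   void sha256d( unsigned char *hash, const unsigned char *data, int len );

   int main( void )
   {
      unsigned char header[80] = {0};   // e.g. an 80-byte block header
      unsigned char hash[32];
      sha256d( hash, header, 80 );
      for ( int i = 0; i < 32; i++ ) printf( "%02x", hash[i] );
      printf( "\n" );
      return 0;
   }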
View File
@@ -0,0 +1,345 @@
/* Intel SHA extensions using C intrinsics */
/* Written and place in public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */
// A stripped down version with byte swapping removed.
#if defined(__SHA__)
#include "sha256-hash-opt.h"
void sha256_ni2way_transform( uint32_t *out_X, uint32_t *out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y;
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
__m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y;
// Load initial values
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
// Save current hash
ABEF_SAVE_X = STATE0_X;
ABEF_SAVE_Y = STATE0_Y;
CDGH_SAVE_X = STATE1_X;
CDGH_SAVE_Y = STATE1_Y;
// Rounds 0-3
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 4-7
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 8-11
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 12-15
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 16-19
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 20-23
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 24-27
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 28-31
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 32-35
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 36-39
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 40-43
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 44-47
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 48-51
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 52-55
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 56-59
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 60-63
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Add values back to state
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
#endif
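A minimal usage sketch for the interleaved 2-way transform, assuming the caller has already arranged both messages in big-endian word order (this stripped-down version does no byte swapping) and keeps all buffers 16-byte aligned for the vector loads; the names below are illustrative:

uint32_t state_X[8] __attribute__ ((aligned (16)));   // running state, lane X
uint32_t state_Y[8] __attribute__ ((aligned (16)));   // running state, lane Y
uint32_t msg_X[16]  __attribute__ ((aligned (16)));   // one 64 byte block
uint32_t msg_Y[16]  __attribute__ ((aligned (16)));   // one 64 byte block

// ... set both states (e.g. to the SHA-256 IV) and fill both blocks ...
// In-place update is safe: the inputs are loaded before the outputs
// are stored.
sha256_ni2way_transform( state_X, state_Y, msg_X, msg_Y, state_X, state_Y );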

View File

@@ -74,9 +74,20 @@ static const uint32_t K256[64] =
#define CHs(X, Y, Z) \
   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
/*
#define MAJs(X, Y, Z) \
   _mm_or_si128( _mm_and_si128( X, Y ), \
                 _mm_and_si128( _mm_or_si128( X, Y ), Z ) )
*/
/*
#define MAJs(X, Y, Z) \
_mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \
_mm_xor_si128( Y, Z ) ) )
*/
#define MAJs(X, Y, Z) \
_mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \
Y_xor_Z ) )
#define BSG2_0(x) \
   _mm_xor_si128( _mm_xor_si128( \
@@ -94,6 +105,7 @@ static const uint32_t K256[64] =
   _mm_xor_si128( _mm_xor_si128( \
      mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
/*
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
   __m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
@@ -122,9 +134,9 @@ do { \
   H = _mm_add_epi32( T1, T2 ); \
   D = _mm_add_epi32( D, T1 ); \
} while (0)
*/

#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
   __m128i T1, T2; \
@@ -132,16 +144,98 @@ do { \
   T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
                                         K, W[i] ) ); \
   T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
   Y_xor_Z = X_xor_Y; \
   D = _mm_add_epi32( D, T1 ); \
   H = _mm_add_epi32( T1, T2 ); \
} while (0)
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
const __m128i *state_in )
{
__m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
__m128i W[16];
memcpy_128( W, data, 16 );
A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];
Y_xor_Z = _mm_xor_si128( B, C );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 );
W[10] = SHA2s_MEXP( 8, 3, 11, 10 );
W[11] = SHA2s_MEXP( 9, 4, 12, 11 );
W[12] = SHA2s_MEXP( 10, 5, 13, 12 );
W[13] = SHA2s_MEXP( 11, 6, 14, 13 );
W[14] = SHA2s_MEXP( 12, 7, 15, 14 );
W[15] = SHA2s_MEXP( 13, 8, 0, 15 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
state_out[0] = _mm_add_epi32( state_in[0], A );
state_out[1] = _mm_add_epi32( state_in[1], B );
state_out[2] = _mm_add_epi32( state_in[2], C );
state_out[3] = _mm_add_epi32( state_in[3], D );
state_out[4] = _mm_add_epi32( state_in[4], E );
state_out[5] = _mm_add_epi32( state_in[5], F );
state_out[6] = _mm_add_epi32( state_in[6], G );
state_out[7] = _mm_add_epi32( state_in[7], H );
}
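The MAJs macro above relies on the identity MAJ(X,Y,Z) = Y ^ ((X ^ Y) & (Y ^ Z)) together with the round rotation of the working variables: the next round's (Y,Z) pair is this round's (X,Y), so the X^Y just computed is carried forward as the next round's Y^Z, saving one XOR per round. A scalar sketch of the same trick (illustrative, not part of the source):

static inline uint32_t maj_fwd( uint32_t x, uint32_t y, uint32_t *y_xor_z )
{
   const uint32_t x_xor_y = x ^ y;                  // reused next round
   const uint32_t maj = y ^ ( x_xor_y & *y_xor_z );
   *y_xor_z = x_xor_y;                              // becomes Y ^ Z next round
   return maj;
}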
static void
sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
{
   register __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
   __m128i W[16];

   mm128_block_bswap_32( W, in );
@@ -170,6 +264,8 @@ sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
      H = m128_const1_64( 0x5BE0CD195BE0CD19 );
   }
Y_xor_Z = _mm_xor_si128( B, C );
   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
@@ -321,10 +417,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
   high = (sc->count_high << 3) | (low >> 29);
   low = low << 3;
   sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) );
   sc->buf[ ( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) );
   sha256_4way_round( sc, sc->buf, sc->val );
   mm128_block_bswap_32( dst, sc->val );
@@ -342,12 +436,39 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
// SHA-256 8 way
#if defined(__AVX512VL__)
#define CHx(X, Y, Z) \
_mm256_ternarylogic_epi32( X, Y, Z, 0xca )
#define MAJx(X, Y, Z) \
_mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )
#define BSG2_0x(x) \
mm256_xor3( mm256_ror_32(x, 2), mm256_ror_32(x, 13), mm256_ror_32(x, 22) )
#define BSG2_1x(x) \
mm256_xor3( mm256_ror_32(x, 6), mm256_ror_32(x, 11), mm256_ror_32(x, 25) )
#define SSG2_0x(x) \
mm256_xor3( mm256_ror_32(x, 7), mm256_ror_32(x, 18), _mm256_srli_epi32(x, 3) )
#define SSG2_1x(x) \
mm256_xor3( mm256_ror_32(x, 17), mm256_ror_32(x, 19), _mm256_srli_epi32(x, 10) )
#else // AVX2
#define CHx(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define MAJx(X, Y, Z) \
   _mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \
                                          _mm256_xor_si256( Y, Z ) ) )
/*
#define MAJx(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
*/
#define BSG2_0x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
@@ -365,6 +486,8 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
   _mm256_xor_si256( _mm256_xor_si256( \
      mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) )
#endif // AVX512 else AVX2
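The 0xca and 0xe8 immediates in the AVX512 branch are simply the 8-entry truth tables of CH and MAJ: vpternlog returns bit ((x<<2) | (y<<1) | z) of the immediate for input bits (x, y, z). A scalar sketch (illustrative only) that derives both constants:

static uint32_t ch_ref ( uint32_t x, uint32_t y, uint32_t z )
{ return ( x & y ) | ( ~x & z ); }

static uint32_t maj_ref( uint32_t x, uint32_t y, uint32_t z )
{ return ( x & y ) | ( x & z ) | ( y & z ); }

static uint8_t ternlog_imm( uint32_t (*f)( uint32_t, uint32_t, uint32_t ) )
{
   uint8_t imm = 0;
   for ( int x = 0; x < 2; x++ )
   for ( int y = 0; y < 2; y++ )
   for ( int z = 0; z < 2; z++ )
      imm |= ( f( x, y, z ) & 1 ) << ( (x << 2) | (y << 1) | z );
   return imm;  // ternlog_imm( ch_ref ) == 0xca, ternlog_imm( maj_ref ) == 0xe8
}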
#define SHA2x_MEXP( a, b, c, d ) \
   mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] );
@@ -379,8 +502,89 @@ do { \
   H = _mm256_add_epi32( T1, T2 ); \
} while (0)
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H;
__m256i W[16];
memcpy_256( W, data, 16 );
A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x_MEXP( 13, 8, 0, 15 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
state_out[0] = _mm256_add_epi32( state_in[0], A );
state_out[1] = _mm256_add_epi32( state_in[1], B );
state_out[2] = _mm256_add_epi32( state_in[2], C );
state_out[3] = _mm256_add_epi32( state_in[3], D );
state_out[4] = _mm256_add_epi32( state_in[4], E );
state_out[5] = _mm256_add_epi32( state_in[5], F );
state_out[6] = _mm256_add_epi32( state_in[6], G );
state_out[7] = _mm256_add_epi32( state_in[7], H );
}
static void
sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] )
{
   register __m256i A, B, C, D, E, F, G, H;
   __m256i W[16];
@@ -566,10 +770,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
   high = (sc->count_high << 3) | (low >> 29);
   low = low << 3;
   sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) );
   sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) );
   sha256_8way_round( sc, sc->buf, sc->val );
@@ -589,27 +791,22 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
// SHA-256 16 way
#define CHx16(X, Y, Z) \
   _mm512_ternarylogic_epi32( X, Y, Z, 0xca )

#define MAJx16(X, Y, Z) \
   _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 )

#define BSG2_0x16(x) \
   mm512_xor3( mm512_ror_32(x, 2), mm512_ror_32(x, 13), mm512_ror_32(x, 22) )

#define BSG2_1x16(x) \
   mm512_xor3( mm512_ror_32(x, 6), mm512_ror_32(x, 11), mm512_ror_32(x, 25) )

#define SSG2_0x16(x) \
   mm512_xor3( mm512_ror_32(x, 7), mm512_ror_32(x, 18), _mm512_srli_epi32(x, 3) )

#define SSG2_1x16(x) \
   mm512_xor3( mm512_ror_32(x, 17), mm512_ror_32(x, 19), _mm512_srli_epi32(x, 10) )
#define SHA2x16_MEXP( a, b, c, d ) \
   mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] );
@@ -625,10 +822,216 @@ do { \
   H = _mm512_add_epi32( T1, T2 ); \
} while (0)
// Transform one 16-lane by 64-byte message block and update the state.
// The calling function is responsible for initializing the state, setting
// the correct byte order, counting bits, and padding the final block.
// It's faster for multiple rounds of sha256 (sha256d/t/q) because it
// eliminates redundant byte swapping.
//
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16];
memcpy_512( W, data, 16 );
A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x16_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x16_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x16_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x16_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x16_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x16_MEXP( 13, 8, 0, 15 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
state_out[0] = _mm512_add_epi32( state_in[0], A );
state_out[1] = _mm512_add_epi32( state_in[1], B );
state_out[2] = _mm512_add_epi32( state_in[2], C );
state_out[3] = _mm512_add_epi32( state_in[3], D );
state_out[4] = _mm512_add_epi32( state_in[4], E );
state_out[5] = _mm512_add_epi32( state_in[5], F );
state_out[6] = _mm512_add_epi32( state_in[6], G );
state_out[7] = _mm512_add_epi32( state_in[7], H );
}
// Aggressive prehashing
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H;
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in + 1 );
C = _mm512_load_si512( state_in + 2 );
D = _mm512_load_si512( state_in + 3 );
E = _mm512_load_si512( state_in + 4 );
F = _mm512_load_si512( state_in + 5 );
G = _mm512_load_si512( state_in + 6 );
H = _mm512_load_si512( state_in + 7 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
_mm512_store_si512( state_mid , A );
_mm512_store_si512( state_mid + 1, B );
_mm512_store_si512( state_mid + 2, C );
_mm512_store_si512( state_mid + 3, D );
_mm512_store_si512( state_mid + 4, E );
_mm512_store_si512( state_mid + 5, F );
_mm512_store_si512( state_mid + 6, G );
_mm512_store_si512( state_mid + 7, H );
}
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid )
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16];
memcpy_512( W, data, 16 );
A = _mm512_load_si512( state_mid );
B = _mm512_load_si512( state_mid + 1 );
C = _mm512_load_si512( state_mid + 2 );
D = _mm512_load_si512( state_mid + 3 );
E = _mm512_load_si512( state_mid + 4 );
F = _mm512_load_si512( state_mid + 5 );
G = _mm512_load_si512( state_mid + 6 );
H = _mm512_load_si512( state_mid + 7 );
// SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x16_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x16_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x16_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x16_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x16_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x16_MEXP( 13, 8, 0, 15 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
A = _mm512_add_epi32( A, _mm512_load_si512( state_in ) );
B = _mm512_add_epi32( B, _mm512_load_si512( state_in + 1 ) );
C = _mm512_add_epi32( C, _mm512_load_si512( state_in + 2 ) );
D = _mm512_add_epi32( D, _mm512_load_si512( state_in + 3 ) );
E = _mm512_add_epi32( E, _mm512_load_si512( state_in + 4 ) );
F = _mm512_add_epi32( F, _mm512_load_si512( state_in + 5 ) );
G = _mm512_add_epi32( G, _mm512_load_si512( state_in + 6 ) );
H = _mm512_add_epi32( H, _mm512_load_si512( state_in + 7 ) );
_mm512_store_si512( state_out , A );
_mm512_store_si512( state_out + 1, B );
_mm512_store_si512( state_out + 2, C );
_mm512_store_si512( state_out + 3, D );
_mm512_store_si512( state_out + 4, E );
_mm512_store_si512( state_out + 5, F );
_mm512_store_si512( state_out + 6, G );
_mm512_store_si512( state_out + 7, H );
}
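The prehash/final split pays off in the scanhash loops further down: in the second block of an 80-byte header the nonce occupies W[3] (pdata[19]), so rounds 0-2 depend only on W[0..2] = pdata[16..18], which are constant for the whole scan. Condensed from scanhash_sha256t_16way below, the caller's pattern is:

// Once per job:
sha256_16way_transform( midstate, vdata, initstate );             // block 1
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );  // rounds 0-2

// Once per batch of 16 nonces, with the nonce vector in block[3] (W[3]):
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );  // rounds 3-63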
static void
sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] )
{
   register __m512i A, B, C, D, E, F, G, H;
   __m512i W[16];

   mm512_block_bswap_32( W , in );
@@ -657,6 +1060,7 @@ sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] )
      H = m512_const1_64( 0x5BE0CD195BE0CD19 );
   }
   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
@@ -800,10 +1204,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
   high = (sc->count_high << 3) | (low >> 29);
   low = low << 3;
   sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) );
   sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) );
   sha256_16way_round( sc, sc->buf, sc->val );

View File

@@ -3,23 +3,24 @@
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */

// A stripped down version with byte swapping removed.

#if defined(__SHA__)

#include "sha256-hash-opt.h"

void sha256_opt_transform( uint32_t *state_out, const void *input,
                           const uint32_t *state_in )
{
    __m128i STATE0, STATE1;
    __m128i MSG, TMP;
    __m128i TMSG0, TMSG1, TMSG2, TMSG3;
    __m128i ABEF_SAVE, CDGH_SAVE;

    // Load initial values
    TMP = _mm_load_si128((__m128i*) &state_in[0]);
    STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
//  MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

    TMP = _mm_shuffle_epi32(TMP, 0xB1);          // CDAB
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    // EFGH
@@ -31,8 +32,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
    CDGH_SAVE = STATE1;

    // Rounds 0-3
    TMSG0 = _mm_load_si128((const __m128i*) (input+0));
//  TMSG0 = _mm_shuffle_epi8(MSG, MASK);
    MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
@@ -40,7 +41,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
    // Rounds 4-7
    TMSG1 = _mm_load_si128((const __m128i*) (input+16));
//  TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
    MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
@@ -49,7 +50,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
    // Rounds 8-11
    TMSG2 = _mm_load_si128((const __m128i*) (input+32));
//  TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
    MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
@@ -58,7 +59,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
    // Rounds 12-15
    TMSG3 = _mm_load_si128((const __m128i*) (input+48));
//  TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
    MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
@@ -192,9 +193,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    // ABEF

    // Save state
    _mm_store_si128((__m128i*) &state_out[0], STATE0);
    _mm_store_si128((__m128i*) &state_out[4], STATE1);
}
#endif

View File

@@ -0,0 +1,18 @@
#ifndef SHA2_HASH_OPT_H__
#define SHA2_HASH_OPT_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#if defined(__SHA__)
void sha256_opt_transform( uint32_t *state_out, const void *input,
const uint32_t *state_in );
// 2 way with interleaved instructions
void sha256_ni2way_transform( uint32_t *out_X, uint32_t *out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
#endif
#endif

algo/sha/sha256d-4way.c Normal file
View File

@@ -0,0 +1,252 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha-hash-4way.h"
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
// initialize state
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m512_const1_64( 0x510E527F510E527F );
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_16way_transform( midstate, vdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy_512( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_transform( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
sha256_16way_transform( hash32, block, initstate );
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
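A scalar restatement of the fixed-size padding the loop above builds by hand (illustrative; the big-endian word order of the vector code is assumed):

#include <stdint.h>
#include <string.h>

// Second block of the first hash: header words 16..19 then padding.
static void build_final_header_block( uint32_t block[16], const uint32_t *pdata )
{
   memcpy( block, pdata + 16, 4 * sizeof(uint32_t) );  // W[0..3], W[3] = nonce
   block[ 4] = 0x80000000;                             // leading pad bit
   memset( block + 5, 0, 10 * sizeof(uint32_t) );      // W[5..14] = 0
   block[15] = 80 * 8;                                 // message length in bits
}

// The single block of the second hash is built the same way: the 32 byte
// digest in W[0..7], then 0x80000000, zeros, and a bit count of 32*8.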
#if defined(SHA256D_8WAY)
int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
// initialize state
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m256_const1_64( 0x510E527F510E527F );
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform( midstate, vdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy_256( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
sha256_8way_transform( hash32, block, initstate );
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i block[16] __attribute__ ((aligned (64)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m128i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m128_const1_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m128_const1_64( 0x510E527F510E527F );
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform( midstate, vdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy_128( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_128( block + 5, 10 );
block[15] = m128_const1_32( 80*8 ); // bit count
sha256_4way_transform( hash32, block, midstate );
// 2. 32 byte hash from 1.
memcpy_128( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
sha256_4way_transform( hash32, block, initstate );
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -7,133 +7,173 @@
#if defined(SHA256T_16WAY)
static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64)));
void sha256t_16way_hash( void* output, const void* input )
{
uint32_t vhash[8*16] __attribute__ ((aligned (64)));
sha256_16way_context ctx;
memcpy( &ctx, &sha256_ctx16, sizeof ctx );
sha256_16way_update( &ctx, input + (64<<4), 16 );
sha256_16way_close( &ctx, vhash );
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, vhash );
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, output );
}
int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr )
{
   __m512i block[16]      __attribute__ ((aligned (64)));
   __m512i hash32[8]      __attribute__ ((aligned (32)));
   __m512i initstate[8]   __attribute__ ((aligned (32)));
   __m512i midstate[8]    __attribute__ ((aligned (32)));
   __m512i midstate2[8]   __attribute__ ((aligned (32)));
   uint32_t lane_hash[8]  __attribute__ ((aligned (32)));
   __m512i vdata[20]      __attribute__ ((aligned (32)));
   uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
   const uint32_t targ32_d7 = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 16;
   uint32_t n = first_nonce;
   __m512i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   const __m512i last_byte = m512_const1_32( 0x80000000 );
   const __m512i sixteen = m512_const1_32( 16 );

   for ( int i = 0; i < 19; i++ )
      vdata[i] = m512_const1_32( pdata[i] );

   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

   // initialize state
   initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
   initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
   initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
   initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
   initstate[4] = m512_const1_64( 0x510E527F510E527F );
   initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
   initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
   initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );

   // hash first 64 byte block of data
   sha256_16way_transform( midstate, vdata, initstate );

   // Do 3 rounds on the first 12 bytes of the next block
   sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );

   do
   {
      // 1. final 16 bytes of data, with padding
      memcpy_512( block, vdata + 16, 4 );
      block[ 4] = last_byte;
      memset_zero_512( block + 5, 10 );
      block[15] = m512_const1_32( 80*8 ); // bit count
      sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
//    sha256_16way_transform( hash32, block, midstate );

      // 2. 32 byte hash from 1.
      memcpy_512( block, hash32, 8 );
      block[ 8] = last_byte;
      memset_zero_512( block + 9, 6 );
      block[15] = m512_const1_32( 32*8 ); // bit count
      sha256_16way_transform( hash32, block, initstate );

      // 3. 32 byte hash from 2.
      memcpy_512( block, hash32, 8 );
      sha256_16way_transform( hash32, block, initstate );

      // byte swap final hash for testing
      mm512_block_bswap_32( hash32, hash32 );

      for ( int lane = 0; lane < 16; lane++ )
      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
      {
         extr_lane_16x32( lane_hash, hash32, lane, 256 );
         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
         {
            pdata[19] = n + lane;
            submit_solution( work, lane_hash, mythr );
         }
      }
      *noncev = _mm512_add_epi32( *noncev, sixteen );
      n += 16;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );

   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
}
#endif
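In scalar terms the chain above is SHA-256 applied three times; note that step 3 reuses the padding tail block[8..15] left in place by step 2, since both hash a 32-byte digest and only the first 8 words change. An equivalent sketch using the sph API already present in this codebase (header is an illustrative 80-byte buffer):

sph_sha256_context ctx;
unsigned char h[32];

sph_sha256_init( &ctx ); sph_sha256( &ctx, header, 80 ); sph_sha256_close( &ctx, h );
sph_sha256_init( &ctx ); sph_sha256( &ctx, h, 32 );      sph_sha256_close( &ctx, h );
sph_sha256_init( &ctx ); sph_sha256( &ctx, h, 32 );      sph_sha256_close( &ctx, h );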
#if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
void sha256t_8way_hash( void* output, const void* input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
sha256_8way_context ctx;
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
sha256_8way_update( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
{
   __m256i block[16]      __attribute__ ((aligned (64)));
   __m256i hash32[8]      __attribute__ ((aligned (32)));
   __m256i initstate[8]   __attribute__ ((aligned (32)));
   __m256i midstate[8]    __attribute__ ((aligned (32)));
   uint32_t lane_hash[8]  __attribute__ ((aligned (32)));
   __m256i vdata[20]      __attribute__ ((aligned (32)));
   uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
   const uint32_t targ32_d7 = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
   __m256i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   const __m256i last_byte = m256_const1_32( 0x80000000 );
   const __m256i eight = m256_const1_32( 8 );

   for ( int i = 0; i < 19; i++ )
      vdata[i] = m256_const1_32( pdata[i] );

   *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

   // initialize state
   initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
   initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
   initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
   initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
   initstate[4] = m256_const1_64( 0x510E527F510E527F );
   initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
   initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
   initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );

   // hash first 64 bytes of data
   sha256_8way_transform( midstate, vdata, initstate );

   do
   {
      // 1. final 16 bytes of data, with padding
      memcpy_256( block, vdata + 16, 4 );
      block[ 4] = last_byte;
      memset_zero_256( block + 5, 10 );
      block[15] = m256_const1_32( 80*8 ); // bit count
      sha256_8way_transform( hash32, block, midstate );

      // 2. 32 byte hash from 1.
      memcpy_256( block, hash32, 8 );
      block[ 8] = last_byte;
      memset_zero_256( block + 9, 6 );
      block[15] = m256_const1_32( 32*8 ); // bit count
      sha256_8way_transform( hash32, block, initstate );

      // 3. 32 byte hash from 2.
      memcpy_256( block, hash32, 8 );
      sha256_8way_transform( hash32, block, initstate );

      // byte swap final hash for testing
      mm256_block_bswap_32( hash32, hash32 );

      for ( int lane = 0; lane < 8; lane++ )
      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
      {
         extr_lane_8x32( lane_hash, hash32, lane, 256 );
         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
         {
            pdata[19] = n + lane;
            submit_solution( work, lane_hash, mythr );
         }
      }
      *noncev = _mm256_add_epi32( *noncev, eight );
      n += 8;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );

   pdata[19] = n;
   *hashes_done = n - first_nonce;
@@ -144,82 +184,84 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
#if defined(SHA256T_4WAY)

-static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
-
-void sha256t_4way_hash( void* output, const void* input )
-{
-   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-   sha256_4way_context ctx;
-   memcpy( &ctx, &sha256_ctx4, sizeof ctx );
-
-   sha256_4way_update( &ctx, input + (64<<2), 16 );
-   sha256_4way_close( &ctx, vhash );
-
-   sha256_4way_init( &ctx );
-   sha256_4way_update( &ctx, vhash, 32 );
-   sha256_4way_close( &ctx, vhash );
-
-   sha256_4way_init( &ctx );
-   sha256_4way_update( &ctx, vhash, 32 );
-   sha256_4way_close( &ctx, output );
-}
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
{
-   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t hash[8*4] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<2]);
+   __m128i block[16]    __attribute__ ((aligned (64)));
+   __m128i hash32[8]    __attribute__ ((aligned (32)));
+   __m128i initstate[8] __attribute__ ((aligned (32)));
+   __m128i midstate[8]  __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   __m128i vdata[20] __attribute__ ((aligned (32)));
+   uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
    uint32_t *pdata = work->data;
    const uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
+   const uint32_t targ32_d7 = ptarget[7];
    const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
    uint32_t n = first_nonce;
-   __m128i *noncev = (__m128i*)vdata + 19;   // aligned
+   __m128i *noncev = vdata + 19;
    const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m128i last_byte = m128_const1_32( 0x80000000 );
+   const __m128i four = m128_const1_32( 4 );

-   const uint64_t htmax[] = {          0,
-                                     0xF,
-                                    0xFF,
-                                   0xFFF,
-                                  0xFFFF,
-                              0x10000000 };
-   const uint32_t masks[] = { 0xFFFFFFFF,
-                              0xFFFFFFF0,
-                              0xFFFFFF00,
-                              0xFFFFF000,
-                              0xFFFF0000,
-                                       0 };
+   for ( int i = 0; i < 19; i++ )
+      vdata[i] = m128_const1_32( pdata[i] );

-   mm128_bswap32_intrlv80_4x32( vdata, pdata );
-   sha256_4way_init( &sha256_ctx4 );
-   sha256_4way_update( &sha256_ctx4, vdata, 64 );
+   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );

-   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+   // initialize state
+   initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
+   initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
+   initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
+   initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
+   initstate[4] = m128_const1_64( 0x510E527F510E527F );
+   initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
+   initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
+   initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
+
+   // hash first 64 bytes of data
+   sha256_4way_transform( midstate, vdata, initstate );
+
+   do
    {
-      const uint32_t mask = masks[m];
-      do {
-         *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
-         pdata[19] = n;
+      // 1. final 16 bytes of data, with padding
+      memcpy_128( block, vdata + 16, 4 );
+      block[ 4] = last_byte;
+      memset_zero_128( block + 5, 10 );
+      block[15] = m128_const1_32( 80*8 ); // bit count
+      sha256_4way_transform( hash32, block, midstate );

-         sha256t_4way_hash( hash, vdata );
+      // 2. 32 byte hash from 1.
+      memcpy_128( block, hash32, 8 );
+      block[ 8] = last_byte;
+      memset_zero_128( block + 9, 6 );
+      block[15] = m128_const1_32( 32*8 ); // bit count
+      sha256_4way_transform( hash32, block, initstate );

-         for ( int lane = 0; lane < 4; lane++ )
-         if ( !( hash7[ lane ] & mask ) )
-         {
-            extr_lane_4x32( lane_hash, hash, lane, 256 );
-            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-            {
-               pdata[19] = n + lane;
-               submit_solution( work, lane_hash, mythr );
-            }
-         }
-         n += 4;
-      } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
-      break;
-   }
-   *hashes_done = n - first_nonce + 1;
+      // 3. 32 byte hash from 2.
+      memcpy_128( block, hash32, 8 );
+      sha256_4way_transform( hash32, block, initstate );
+
+      // byte swap final hash for testing
+      mm128_block_bswap_32( hash32, hash32 );
+
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      {
+         extr_lane_4x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         {
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      *noncev = _mm_add_epi32( *noncev, four );
+      n += 4;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
    return 0;
}

View File

@@ -5,17 +5,13 @@ bool register_sha256t_algo( algo_gate_t* gate )
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256T_16WAY)
   gate->scanhash = (void*)&scanhash_sha256t_16way;
-  gate->hash     = (void*)&sha256t_16way_hash;
#elif defined(__SHA__)
   gate->optimizations = SHA_OPT;
   gate->scanhash = (void*)&scanhash_sha256t;
-  gate->hash     = (void*)&sha256t_hash;
#elif defined(SHA256T_8WAY)
   gate->scanhash = (void*)&scanhash_sha256t_8way;
-  gate->hash     = (void*)&sha256t_8way_hash;
#else
   gate->scanhash = (void*)&scanhash_sha256t_4way;
-  gate->hash     = (void*)&sha256t_4way_hash;
#endif
   return true;
}
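The SHA256T_16WAY / SHA256T_8WAY / SHA256T_4WAY symbols are mutually exclusive compile-time width selections; the registration above simply picks the widest scanhash the build supports, and the separate hash callbacks are gone because each scanhash now hashes inline. A plausible sketch of how the width macros could be chosen (the exact feature tests live in the gate header and are an assumption here, not shown in this diff):

// Hypothetical width selection; the real macro logic is in sha256t-gate.h.
#if defined(__AVX512F__) && defined(__AVX512VL__) && \
    defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SHA256T_16WAY 1
#elif defined(__AVX2__)
  #define SHA256T_8WAY 1
#elif defined(__SSE2__)
  #define SHA256T_4WAY 1
#endif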

View File

@@ -17,7 +17,6 @@ bool register_sha256q_algo( algo_gate_t* gate );
#if defined(SHA256T_16WAY)

-void sha256t_16way_hash( void *output, const void *input );
int scanhash_sha256t_16way( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_16way_hash( void *output, const void *input );
@@ -27,7 +26,6 @@ int scanhash_sha256q_16way( struct work *work, uint32_t max_nonce,
#if defined(SHA256T_8WAY)

-void sha256t_8way_hash( void *output, const void *input );
int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_8way_hash( void *output, const void *input );
@@ -37,7 +35,6 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
#if defined(SHA256T_4WAY)

-void sha256t_4way_hash( void *output, const void *input );
int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_4way_hash( void *output, const void *input );
@@ -45,10 +42,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
#endif

+#if defined(__SHA__)
+
+int sha256t_hash( void *output, const void *input );
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
+#endif

int sha256q_hash( void *output, const void *input );
int scanhash_sha256q( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
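Every scanhash_* declared here follows the same contract: consume nonces starting at pdata[19], submit valid shares as they are found, leave the next nonce to try in pdata[19], report progress through *hashes_done, and return 0. A bare skeleton of that contract with the miner types reduced to essentials (the callback type is an abstraction for this sketch):

#include <stdint.h>
#include <stdbool.h>

// Abstracted scanhash contract; 'restart' models work_restart polling.
typedef bool (*restart_fn)(void);

int scanhash_skeleton( uint32_t data[20], uint32_t max_nonce,
                       uint64_t *hashes_done, restart_fn restart )
{
   const uint32_t first_nonce = data[19];
   uint32_t n = first_nonce;

   do
   {
      // hash the 80-byte header with nonce n here;
      // on a target hit: record n in data[19] and submit the share
      n++;
   } while ( n < max_nonce && !restart() );

   data[19] = n;                      // resume point for the next call
   *hashes_done = n - first_nonce;    // credited to hashrate stats
   return 0;
}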

View File

@@ -3,10 +3,14 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
-#include "algo/sha/sph_sha2.h"
+//#include "algo/sha/sph_sha2.h"
+#include "sha256-hash-opt.h"
+
+#if defined(__SHA__)

// Only used on CPUs with SHA

+/*
static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64)));

void sha256t_midstate( const void* input )
@@ -37,12 +41,21 @@ int sha256t_hash( void* output, const void* input )
   return 1;
}
+*/

+/*
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
{
-   uint32_t edata[20] __attribute__((aligned(64)));
-   uint32_t hash[8] __attribute__((aligned(64)));
+   uint32_t block[16] __attribute__ ((aligned (64)));
+   uint32_t hash32[8] __attribute__ ((aligned (32)));
+   uint32_t initstate[8] __attribute__ ((aligned (32)));
+   uint32_t midstate[8] __attribute__ ((aligned (32)));
+//   uint32_t edata[20] __attribute__((aligned(64)));
+//   uint32_t hash[8] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -50,24 +63,148 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
+   __m128i shuf_bswap32 =
+            _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );

-   mm128_bswap32_80( edata, pdata );
-   sha256t_midstate( edata );
+//   mm128_bswap32_80( edata, pdata );
+//   sha256t_midstate( edata );

+   // initialize state
+   initstate[0] = 0x6A09E667;
+   initstate[1] = 0xBB67AE85;
+   initstate[2] = 0x3C6EF372;
+   initstate[3] = 0xA54FF53A;
+   initstate[4] = 0x510E527F;
+   initstate[5] = 0x9B05688C;
+   initstate[6] = 0x1F83D9AB;
+   initstate[7] = 0x5BE0CD19;
+
+   // hash first 64 bytes of data
+   sha256_opt_transform( midstate, pdata, initstate );
+
   do
   {
-      edata[19] = n;
-      if ( likely( sha256t_hash( hash, edata ) ) )
-      if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
-      {
-         pdata[19] = bswap_32( n );
-         submit_solution( work, hash, mythr );
-      }
+      // 1. final 16 bytes of data, with padding
+      memcpy( block, pdata + 16, 16 );
+      block[ 4] = 0x80000000;
+      memset( block + 5, 0, 40 );
+      block[15] = 80*8; // bit count
+      sha256_opt_transform( hash32, block, midstate );
+
+      // 2. 32 byte hash from 1.
+      memcpy( block, hash32, 32 );
+      block[ 8] = 0x80000000;
+      memset( block + 9, 0, 24 );
+      block[15] = 32*8; // bit count
+      sha256_opt_transform( hash32, block, initstate );
+
+      // 3. 32 byte hash from 2.
+      memcpy( block, hash32, 32 );
+      sha256_opt_transform( hash32, block, initstate );
+
+      // byte swap final hash for testing
+      casti_m128i( hash32, 0 ) =
+            _mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 );
+      casti_m128i( hash32, 1 ) =
+            _mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 );
+
+      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
+         submit_solution( work, hash32, mythr );
      n++;
-   } while ( n < last_nonce && !work_restart[thr_id].restart );
+      pdata[19] = n;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce;
+   return 0;
+}
+*/
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t initstate[8] __attribute__ ((aligned (32)));
uint32_t midstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 1;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
__m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// initialize state
initstate[0] = 0x6A09E667;
initstate[1] = 0xBB67AE85;
initstate[2] = 0x3C6EF372;
initstate[3] = 0xA54FF53A;
initstate[4] = 0x510E527F;
initstate[5] = 0x9B05688C;
initstate[6] = 0x1F83D9AB;
initstate[7] = 0x5BE0CD19;
// hash first 64 bytes of data
sha256_opt_transform( midstate, pdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
// 3. 32 byte hash from 2.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
// byte swap final hash for testing
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
}
if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
}

+#endif
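sha256_ni2way_transform() exists because the SHA-NI round instructions form a serial dependency chain: each _mm_sha256rnds2_epu32() needs the previous result, so a single message stream leaves the unit mostly idle. Feeding two independent streams (nonce n and n+1) through one loop overlaps their latencies. The same idea in miniature, with a trivial stand-in for the round function:

#include <stdint.h>

// Two independent dependency chains in one loop. Each 'mix' result feeds
// the next call in its own chain, so one chain alone serializes; the
// interleaved second chain can issue while the first is still in flight.
static inline uint32_t mix( uint32_t x )
{
   return ( x ^ ( x >> 5 ) ) * 0x9E3779B9u;
}

void two_way( uint32_t *h0, uint32_t *h1, int rounds )
{
   uint32_t a = *h0, b = *h1;
   for ( int i = 0; i < rounds; i++ )
   {
      a = mix( a );   // chain for nonce n
      b = mix( b );   // chain for nonce n+1, independent of 'a'
   }
   *h0 = a;  *h1 = b;
}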

View File

@@ -96,74 +96,22 @@ static const uint64_t K512[80] =
// SHA-512 8 way 64 bit

#define CH8W(X, Y, Z) \
-   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
+   _mm512_ternarylogic_epi64( X, Y, Z, 0xca )

#define MAJ8W(X, Y, Z) \
-   _mm512_or_si512( _mm512_and_si512( X, Y ), \
-                    _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
+   _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )

#define BSG8W_5_0(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-      mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
+   mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) )

#define BSG8W_5_1(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-      mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
+   mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) )

#define SSG8W_5_0(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-      mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) )
+   mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) )

#define SSG8W_5_1(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-      mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )
+   mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) )
-static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
-{
-   __m512i w0a, w1a, w0b, w1b;
-   w0a = mm512_ror_64( w0, 1 );
-   w1a = mm512_ror_64( w1,19 );
-   w0b = mm512_ror_64( w0, 8 );
-   w1b = mm512_ror_64( w1,61 );
-   w0a = _mm512_xor_si512( w0a, w0b );
-   w1a = _mm512_xor_si512( w1a, w1b );
-   w0b = _mm512_srli_epi64( w0, 7 );
-   w1b = _mm512_srli_epi64( w1, 6 );
-   w0a = _mm512_xor_si512( w0a, w0b );
-   w1a = _mm512_xor_si512( w1a, w1b );
-   return _mm512_add_epi64( w0a, w1a );
-}
-
-#define SSG8W_512x2_0( w0, w1, i ) do \
-{ \
-   __m512i X0a, X1a, X0b, X1b; \
-   X0a = mm512_ror_64( W[i-15], 1 ); \
-   X1a = mm512_ror_64( W[i-14], 1 ); \
-   X0b = mm512_ror_64( W[i-15], 8 ); \
-   X1b = mm512_ror_64( W[i-14], 8 ); \
-   X0a = _mm512_xor_si512( X0a, X0b ); \
-   X1a = _mm512_xor_si512( X1a, X1b ); \
-   X0b = _mm512_srli_epi64( W[i-15], 7 ); \
-   X1b = _mm512_srli_epi64( W[i-14], 7 ); \
-   w0 = _mm512_xor_si512( X0a, X0b ); \
-   w1 = _mm512_xor_si512( X1a, X1b ); \
-} while(0)
-
-#define SSG8W_512x2_1( w0, w1, i ) do \
-{ \
-   __m512i X0a, X1a, X0b, X1b; \
-   X0a = mm512_ror_64( W[i-2],19 ); \
-   X1a = mm512_ror_64( W[i-1],19 ); \
-   X0b = mm512_ror_64( W[i-2],61 ); \
-   X1b = mm512_ror_64( W[i-1],61 ); \
-   X0a = _mm512_xor_si512( X0a, X0b ); \
-   X1a = _mm512_xor_si512( X1a, X1b ); \
-   X0b = _mm512_srli_epi64( W[i-2], 6 ); \
-   X1b = _mm512_srli_epi64( W[i-1], 6 ); \
-   w0 = _mm512_xor_si512( X0a, X0b ); \
-   w1 = _mm512_xor_si512( X1a, X1b ); \
-} while(0)

#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
@@ -187,8 +135,8 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
   mm512_block_bswap_64( W+8, in+8 );

   for ( i = 16; i < 80; i++ )
-      W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
-                               _mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
+      W[i] = mm512_add4_64( SSG8W_5_0( W[i-15] ), SSG8W_5_1( W[i-2] ),
+                            W[ i- 7 ], W[ i-16 ] );

   if ( ctx->initialized )
   {
@@ -319,13 +267,19 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
// SHA-512 4 way 64 bit

#define CH(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )

+/*
#define MAJ(X, Y, Z) \
   _mm256_or_si256( _mm256_and_si256( X, Y ), \
                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
+*/
+
+#define MAJ(X, Y, Z) \
+  _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
+                                         Y_xor_Z ) )

#define BSG5_0(x) \
   mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
@@ -334,7 +288,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
#define BSG5_1(x) \
   mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
      _mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )

/*
#define BSG5_0(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
@@ -402,7 +356,7 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
   w1 = _mm256_xor_si256( X1a, X1b ); \
} while(0)
*/

+/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
   __m256i K = _mm256_set1_epi64x( K512[ i ] ); \
@@ -431,7 +385,7 @@ do { \
   H = _mm256_add_epi64( T1, T2 ); \
   D = _mm256_add_epi64( D, T1 ); \
} while (0)
+*/

/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
@@ -445,7 +399,7 @@ do { \
} while (0)
*/

-/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
   __m256i T1, T2; \
@@ -453,16 +407,17 @@ do { \
   T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
                          K, W[i] ) ); \
   T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
+  Y_xor_Z = X_xor_Y; \
   D = _mm256_add_epi64( D, T1 ); \
   H = _mm256_add_epi64( T1, T2 ); \
} while (0)
-*/

static void
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
{
   int i;
-   register __m256i A, B, C, D, E, F, G, H;
+   register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
   __m256i W[80];

   mm256_block_bswap_64( W , in );
@@ -495,6 +450,8 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
      H = m256_const1_64( 0x5BE0CD19137E2179 );
   }

+   Y_xor_Z = _mm256_xor_si256( B, C );
+
   for ( i = 0; i < 80; i += 8 )
   {
      SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
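The rewritten MAJ relies on the identity MAJ(X, Y, Z) = Y ^ ((X ^ Y) & (Y ^ Z)). Because the working variables rotate one position per round, the X ^ Y computed this round is exactly the Y ^ Z the next round needs, which is what the added Y_xor_Z = X_xor_Y forwarding lines exploit. An exhaustive check of the identity:

#include <assert.h>
#include <stdint.h>

int main(void)
{
   // All 8 input combinations, expanded to full-width masks.
   for ( unsigned v = 0; v < 8; v++ )
   {
      uint32_t x = -( ( v >> 2 ) & 1 );
      uint32_t y = -( ( v >> 1 ) & 1 );
      uint32_t z = -( v & 1 );
      uint32_t maj_ref = ( x & y ) | ( ( x | y ) & z );   // original form
      uint32_t maj_new = y ^ ( ( x ^ y ) & ( y ^ z ) );   // rewritten form
      assert( maj_ref == maj_new );
   }
   return 0;
}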

View File

@@ -40,8 +40,8 @@
#endif

#define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
-#define MAJ(X, Y, Z)   (((Y) & (Z)) | (((Y) | (Z)) & (X)))
+//#define MAJ(X, Y, Z)   (((Y) & (Z)) | (((Y) | (Z)) & (X)))
+#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) )

#define ROTR    SPH_ROTR32

#define BSG2_0(x)      (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
@@ -73,7 +73,194 @@ static const sph_u32 H256[8] = {
#if defined(__SHA__)

-#include "sha256-hash-opt.c"
+#include "simd-utils.h"
static void sha2_round( const uint8_t input[], uint32_t state[8] )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state[0]);
STATE1 = _mm_load_si128((__m128i*) &state[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
MSG = _mm_load_si128((const __m128i*) (input+0));
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state[0], STATE0);
_mm_store_si128((__m128i*) &state[4], STATE1);
}
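sha2_round() compresses exactly one 64-byte block into the 8-word state, doing its own little-endian to big-endian shuffle via MASK; the caller supplies standard SHA-256 chaining and padding. A sketch of that outer loop for single-shot hashing, assuming the state starts from the standard IV (H256 above):

#include <stdint.h>
#include <string.h>

// Hash a whole message with sha2_round(), one 64-byte block per call.
// Padding: append 0x80, zero-fill, then the 64-bit big-endian bit count
// in the last 8 bytes of the final block.
static void sha256_oneshot_sketch( uint32_t state[8],
                                   const uint8_t *msg, size_t len )
{
   uint8_t last[128] = {0};
   size_t i = 0;

   for ( ; i + 64 <= len; i += 64 )
      sha2_round( msg + i, state );          // full blocks

   size_t rem = len - i;
   memcpy( last, msg + i, rem );
   last[rem] = 0x80;                         // append the 1 bit
   size_t fin = ( rem < 56 ) ? 64 : 128;     // room left for the length?
   uint64_t bits = (uint64_t)len * 8;
   for ( int b = 0; b < 8; b++ )             // big-endian bit count
      last[fin - 1 - b] = (uint8_t)( bits >> ( 8 * b ) );
   sha2_round( last, state );
   if ( fin == 128 )
      sha2_round( last + 64, state );
}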
#else  // no SHA
@@ -132,6 +319,7 @@ static const sph_u32 K[64] = {
    t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
        + K[pcount + (pc)] + W[(pc) & 0x0F]); \
    t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
+    Y_xor_Z = X_xor_Y; \
    d = SPH_T32(d + t1); \
    h = SPH_T32(t1 + t2); \
  } while (0)
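In this scalar step macro the forwarding costs nothing extra: Y_xor_Z is primed from B ^ C once per block, and each step's MAJ leaves X_xor_Y behind for the rotated successor (the A,B,C then H,A,B then G,H,A call pattern visible below), so only one XOR per round is spent on MAJ's inputs. Compacted to two rounds, as a self-contained sketch:

#include <stdint.h>

// MAJ with forwarding: writes x^y through *x_xor_y for the next round.
static uint32_t maj_fwd( uint32_t x, uint32_t y, uint32_t z,
                         uint32_t *x_xor_y, uint32_t y_xor_z )
{
   *x_xor_y = x ^ y;
   return y ^ ( *x_xor_y & y_xor_z );
}

uint32_t two_rounds_maj( uint32_t a, uint32_t b, uint32_t c, uint32_t h )
{
   uint32_t x_xor_y, y_xor_z = b ^ c;                    // primed once
   uint32_t m0 = maj_fwd( a, b, c, &x_xor_y, y_xor_z );  // round i: MAJ(A,B,C)
   y_xor_z = x_xor_y;                                    // forward a^b
   uint32_t m1 = maj_fwd( h, a, b, &x_xor_y, y_xor_z );  // round i+1: MAJ(H,A,B)
   return m0 ^ m1;
}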
@@ -142,7 +330,7 @@ static const sph_u32 K[64] = {
    SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)

#define SHA2_ROUND_BODY(in, r)   do { \
-    sph_u32 A, B, C, D, E, F, G, H; \
+    sph_u32 A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; \
    sph_u32 W[16]; \
    unsigned pcount; \
 \
@@ -155,6 +343,7 @@ static const sph_u32 K[64] = {
    G = (r)[6]; \
    H = (r)[7]; \
    pcount = 0; \
+    Y_xor_Z = B ^ C; \
    SHA2_STEP1(A, B, C, D, E, F, G, H, in,  0); \
    SHA2_STEP1(H, A, B, C, D, E, F, G, in,  1); \
    SHA2_STEP1(G, H, A, B, C, D, E, F, in,  2); \
@@ -202,7 +391,7 @@ static const sph_u32 K[64] = {
#else  // large footprint (default)

#define SHA2_ROUND_BODY(in, r)   do { \
-    sph_u32 A, B, C, D, E, F, G, H, T1, T2; \
+    sph_u32 A, B, C, D, E, F, G, H, T1, T2, X_xor_Y, Y_xor_Z; \
    sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
    sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
 \
@@ -214,388 +403,453 @@ static const sph_u32 K[64] = {
    F = (r)[5]; \
    G = (r)[6]; \
    H = (r)[7]; \
+    Y_xor_Z = B ^ C; \
    W00 = in(0); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0x428A2F98) + W00); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W01 = in(1); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0x71374491) + W01); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W02 = in(2); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0xB5C0FBCF) + W02); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W03 = in(3); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0xE9B5DBA5) + W03); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W04 = in(4); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0x3956C25B) + W04); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W05 = in(5); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0x59F111F1) + W05); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W06 = in(6); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x923F82A4) + W06); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W07 = in(7); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0xAB1C5ED5) + W07); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W08 = in(8); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0xD807AA98) + W08); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W09 = in(9); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0x12835B01) + W09); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W10 = in(10); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0x243185BE) + W10); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W11 = in(11); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0x550C7DC3) + W11); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W12 = in(12); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0x72BE5D74) + W12); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W13 = in(13); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0x80DEB1FE) + W13); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W14 = in(14); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x9BDC06A7) + W14); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W15 = in(15); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0xC19BF174) + W15); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0xE49B69C1) + W00); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0xEFBE4786) + W01); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0x0FC19DC6) + W02); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0x240CA1CC) + W03); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0x2DE92C6F) + W04); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0x4A7484AA) + W05); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x5CB0A9DC) + W06); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0x76F988DA) + W07); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0x983E5152) + W08); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0xA831C66D) + W09); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0xB00327C8) + W10); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0xBF597FC7) + W11); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0xC6E00BF3) + W12); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0xD5A79147) + W13); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x06CA6351) + W14); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0x14292967) + W15); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0x27B70A85) + W00); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0x2E1B2138) + W01); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0x4D2C6DFC) + W02); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0x53380D13) + W03); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0x650A7354) + W04); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0x766A0ABB) + W05); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x81C2C92E) + W06); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0x92722C85) + W07); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0xA2BFE8A1) + W08); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0xA81A664B) + W09); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0xC24B8B70) + W10); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0xC76C51A3) + W11); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0xD192E819) + W12); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0xD6990624) + W13); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0xF40E3585) + W14); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0x106AA070) + W15); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0x19A4C116) + W00); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0x1E376C08) + W01); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0x2748774C) + W02); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0x34B0BCB5) + W03); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+    Y_xor_Z = X_xor_Y; \
    A = SPH_T32(A + T1); \
    E = SPH_T32(T1 + T2); \
    W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
    T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
        + SPH_C32(0x391C0CB3) + W04); \
    T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+    Y_xor_Z = X_xor_Y; \
    H = SPH_T32(H + T1); \
    D = SPH_T32(T1 + T2); \
    W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
    T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
        + SPH_C32(0x4ED8AA4A) + W05); \
    T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+    Y_xor_Z = X_xor_Y; \
    G = SPH_T32(G + T1); \
    C = SPH_T32(T1 + T2); \
    W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
    T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
        + SPH_C32(0x5B9CCA4F) + W06); \
    T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+    Y_xor_Z = X_xor_Y; \
    F = SPH_T32(F + T1); \
    B = SPH_T32(T1 + T2); \
    W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
    T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
        + SPH_C32(0x682E6FF3) + W07); \
    T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+    Y_xor_Z = X_xor_Y; \
    E = SPH_T32(E + T1); \
    A = SPH_T32(T1 + T2); \
    W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
    T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
        + SPH_C32(0x748F82EE) + W08); \
    T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+    Y_xor_Z = X_xor_Y; \
    D = SPH_T32(D + T1); \
    H = SPH_T32(T1 + T2); \
    W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
    T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
        + SPH_C32(0x78A5636F) + W09); \
    T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+    Y_xor_Z = X_xor_Y; \
    C = SPH_T32(C + T1); \
    G = SPH_T32(T1 + T2); \
    W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
    T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
        + SPH_C32(0x84C87814) + W10); \
    T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+    Y_xor_Z = X_xor_Y; \
    B = SPH_T32(B + T1); \
    F = SPH_T32(T1 + T2); \
    W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
    T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
        + SPH_C32(0x8CC70208) + W11); \
    T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \ A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \ E = SPH_T32(T1 + T2); \
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x90BEFFFA) + W12); \ + SPH_C32(0x90BEFFFA) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \ H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \ D = SPH_T32(T1 + T2); \
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0xA4506CEB) + W13); \ + SPH_C32(0xA4506CEB) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \ G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \ C = SPH_T32(T1 + T2); \
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0xBEF9A3F7) + W14); \ + SPH_C32(0xBEF9A3F7) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \ F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \ B = SPH_T32(T1 + T2); \
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0xC67178F2) + W15); \ + SPH_C32(0xC67178F2) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \ E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \ A = SPH_T32(T1 + T2); \
(r)[0] = SPH_T32((r)[0] + A); \ (r)[0] = SPH_T32((r)[0] + A); \
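The Y_xor_Z assignments interleaved into the SHA-256 rounds above cache an intermediate of the rewritten MAJ: with MAJ(X, Y, Z) expressed as Y ^ ((X ^ Y) & (Y ^ Z)), the working variables rotate one position per round, so the next round's Y ^ Z is exactly this round's X ^ Y and each MAJ pays for only one fresh XOR. A minimal scalar sketch of the idea (maj_cached is a hypothetical name, not a macro from this file):

#include <stdint.h>

/* One SHA-256 MAJ per round, reusing the previous round's X ^ Y
   as the current round's Y ^ Z. */
static uint32_t maj_cached( uint32_t x, uint32_t y, uint32_t *y_xor_z )
{
   uint32_t x_xor_y = x ^ y;
   uint32_t maj = y ^ ( x_xor_y & *y_xor_z );
   *y_xor_z = x_xor_y;   /* becomes Y ^ Z after the variables rotate */
   return maj;
}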

View File

@@ -38,7 +38,8 @@
#if SPH_64 #if SPH_64
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) #define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) //#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) )
#define ROTR64 SPH_ROTR64 #define ROTR64 SPH_ROTR64
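The single-expression MAJ above replaces the canonical (X & Y) | ((X | Y) & Z). The two forms agree bit-for-bit, which a throwaway exhaustive check confirms (standalone sketch, not part of the build):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   for ( uint64_t x = 0; x < 2; x++ )
   for ( uint64_t y = 0; y < 2; y++ )
   for ( uint64_t z = 0; z < 2; z++ )
      assert( ( (x & y) | ( (x | y) & z ) ) ==
              ( y ^ ( (x ^ y) & (y ^ z) ) ) );
   return 0;
}

Since all the operations are bitwise, the 1-bit check extends to full 64-bit words.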

View File

@@ -310,12 +310,13 @@ do { \
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ #define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \ do { \
xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \ xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \
_mm256_andnot_si256( xb3, xb2 ), \ _mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \ _mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \ _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
) ), _mm256_set1_epi32(3UL) ) ) ) ); \ _mm256_set1_epi32(5UL) ) ), \
xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \ _mm256_set1_epi32(3UL) ) ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0) } while (0)
#define PERM_STEP_0_8 do { \ #define PERM_STEP_0_8 do { \

View File

@@ -23,6 +23,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
register __m512i K0, K1, K2, K3, K4, K5, K6, K7; register __m512i K0, K1, K2, K3, K4, K5, K6, K7;
__m512i *M = (__m512i*)msg; __m512i *M = (__m512i*)msg;
__m512i *H = (__m512i*)ctx->h; __m512i *H = (__m512i*)ctx->h;
const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
ctx->count1, ctx->count0 );
int r; int r;
P0 = H[0]; P0 = H[0];
@@ -62,16 +64,16 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
_mm512_aesenc_epi128( K0, m512_zero ) ) ); _mm512_aesenc_epi128( K0, m512_zero ) ) );
if ( r == 0 ) if ( r == 0 )
K0 = _mm512_xor_si512( K0, _mm512_set4_epi32( K0 = _mm512_xor_si512( K0,
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) ); _mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
K1 = _mm512_xor_si512( K0, K1 = _mm512_xor_si512( K0,
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
if ( r == 1 ) if ( r == 1 )
K1 = _mm512_xor_si512( K1, _mm512_set4_epi32( K1 = _mm512_xor_si512( K1, mm512_ror128_32(
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) ); _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K1, K2 = _mm512_xor_si512( K1,
@@ -96,8 +98,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
if ( r == 2 ) if ( r == 2 )
K7 = _mm512_xor_si512( K7, _mm512_set4_epi32( K7 = _mm512_xor_si512( K7, mm512_swap128_64(
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) ); _mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X ); P1 = _mm512_xor_si512( P1, X );
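The rewritten key whitening above replaces the per-round _mm512_set4_epi32 constructions with one precomputed count vector plus a masked XOR. A sketch of the r == 0 case, assuming AVX-512F (invert_elem3 is an illustrative name, not from this file):

#include <immintrin.h>

/* Mask 0x8888 selects 32-bit elements 3, 7, 11 and 15, i.e. the
   count3 slot of each 128-bit lane; XOR with all-ones complements
   only those elements, leaving count0..count2 untouched. */
static inline __m512i invert_elem3( const __m512i count )
{
   const __m512i neg1 = _mm512_set1_epi32( -1 );
   return _mm512_mask_xor_epi32( count, 0x8888, count, neg1 );
}

The r == 1 and r == 2 cases reuse the same masked XOR, with a lane rotate or swap applied afterwards instead of building a fresh vector.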

View File

@@ -101,15 +101,6 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round // round
// working proof of concept
/*
__m512i K = m512_const1_128( m[0] );
__m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
X = _mm512_aesenc_epi128( X, m512_zero );
k00 = _mm512_castsi512_si128( K );
x = _mm512_castsi512_si128( X );
*/
k00 = m[0]; k00 = m[0];
x = _mm_xor_si128( p1, k00 ); x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero ); x = _mm_aesenc_si128( x, zero );

View File

@@ -747,11 +747,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) }; static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) };
// static const m512_v16 code[] = { c1_16(185), c1_16(233),
// c1_16(185), c1_16(233) };
S0l = _mm512_xor_si512( S[0], M[0] ); S0l = _mm512_xor_si512( S[0], M[0] );
S0h = _mm512_xor_si512( S[1], M[1] ); S0h = _mm512_xor_si512( S[1], M[1] );
S1l = _mm512_xor_si512( S[2], M[2] ); S1l = _mm512_xor_si512( S[2], M[2] );
@@ -764,11 +759,16 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
// targeted, local macros don't need a unique name // targeted, local macros don't need a unique name
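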
#define S(i) S##i #define S(i) S##i
#define F_0( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xca )
#define F_1( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xe8 )
/*
#define F_0(B, C, D) \ #define F_0(B, C, D) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D ) _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D )
#define F_1(B, C, D) \ #define F_1(B, C, D) \
_mm512_or_si512( _mm512_and_si512( D, C ),\ _mm512_or_si512( _mm512_and_si512( D, C ),\
_mm512_and_si512( _mm512_or_si512( D,C ), B ) ) _mm512_and_si512( _mm512_or_si512( D,C ), B ) )
*/
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l) #define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h) #define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)

View File

@@ -6,10 +6,6 @@
#define PRINT_SOME 0 #define PRINT_SOME 0
/* JDD all ocurrances of macro X in this file renamed to XX
* due to name conflict
*/
int SupportedLength(int hashbitlen) { int SupportedLength(int hashbitlen) {
if (hashbitlen <= 0 || hashbitlen > 512) if (hashbitlen <= 0 || hashbitlen > 512)
return 0; return 0;

View File

@@ -309,19 +309,13 @@ static const uint64_t IV512[] = {
sc->bcount = bcount; \ sc->bcount = bcount; \
} while (0) } while (0)
// AVX2 all scalar vars are now vectors representing 4 nonces in parallel
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \ #define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
do { \ do { \
k8 = _mm512_xor_si512( _mm512_xor_si512( \ k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), mm512_xor3( k3, k4, k5 ), \
_mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \ mm512_xor3( k6, k7, m512_const1_64( 0x1BD11BDAA9FC1A22) ));\
_mm512_xor_si512( k2, k3 ) ), \
_mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \
_mm512_xor_si512( k6, k7 ) ) ), \
m512_const1_64( 0x1BD11BDAA9FC1A22) ); \
t2 = t0 ^ t1; \ t2 = t0 ^ t1; \
} while (0) } while (0)
@@ -340,7 +334,6 @@ do { \
m512_const1_64( s ) ) ); \ m512_const1_64( s ) ) ); \
} while (0) } while (0)
#define TFBIG_MIX_8WAY(x0, x1, rc) \ #define TFBIG_MIX_8WAY(x0, x1, rc) \
do { \ do { \
x0 = _mm512_add_epi64( x0, x1 ); \ x0 = _mm512_add_epi64( x0, x1 ); \

View File

@@ -44,8 +44,8 @@ int verthash_info_init(verthash_info_t* info, const char* file_name)
if ( opt_data_file || !opt_verify ) if ( opt_data_file || !opt_verify )
{ {
if ( opt_data_file ) if ( opt_data_file )
applog( LOG_ERR, applog( LOG_ERR, "Verthash data file not found or invalid: %s",
"Verthash data file not found or invalid: %s", info->fileName ); info->fileName );
else else
{ {
applog( LOG_ERR, applog( LOG_ERR,
@@ -134,76 +134,117 @@ static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
return (a ^ b) * 0x1000193; return (a ^ b) * 0x1000193;
} }
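fnv1a above is a single FNV-1a mixing step: XOR the next word in, then multiply by the 32-bit FNV prime. Verthash chains it through an accumulator seeded with the FNV offset basis, folding in one 32-bit word per call. A standalone sketch of the chaining (illustrative input values):

#include <stdint.h>
#include <stdio.h>

static uint32_t fnv1a_step( uint32_t a, uint32_t b )
{
   return ( a ^ b ) * 0x1000193u;       /* 32-bit FNV prime */
}

int main(void)
{
   uint32_t acc = 0x811c9dc5u;          /* 32-bit FNV offset basis */
   const uint32_t words[2] = { 0xdeadbeefu, 0x00000001u };
   for ( int i = 0; i < 2; i++ )
      acc = fnv1a_step( acc, words[i] );
   printf( "%08x\n", acc );
   return 0;
}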
void verthash_hash( const unsigned char* blob_bytes, #if 0
const size_t blob_size, static void rotate_indexes( uint32_t *p )
const unsigned char(*input)[VH_HEADER_SIZE],
unsigned char(*output)[VH_HASH_OUT_SIZE] )
{ {
unsigned char p1[ VH_HASH_OUT_SIZE ] __attribute__ ((aligned (64)));
unsigned char p0[ VH_N_SUBSET ] __attribute__ ((aligned (64)));
uint32_t seek_indexes[VH_N_INDEXES] __attribute__ ((aligned (64)));
uint32_t* p0_index = (uint32_t*)p0;
verthash_sha3_512_final_8( p0, ( (uint64_t*)input )[ 9 ] );
for ( size_t x = 0; x < VH_N_ROT; ++x )
{
memcpy( seek_indexes + x * (VH_N_SUBSET / sizeof(uint32_t)),
p0, VH_N_SUBSET);
#if defined(__AVX2__) #if defined(__AVX2__)
for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m256i); y += 8) for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 )
{ {
casti_m256i( p0_index, y ) = mm256_rol_32( __m256i *px = (__m256i*)p + x;
casti_m256i( p0_index, y ), 1 );
casti_m256i( p0_index, y+1 ) = mm256_rol_32( px[0] = mm256_rol_32( px[0], 1 );
casti_m256i( p0_index, y+1 ), 1 ); px[1] = mm256_rol_32( px[1], 1 );
casti_m256i( p0_index, y+2 ) = mm256_rol_32( px[2] = mm256_rol_32( px[2], 1 );
casti_m256i( p0_index, y+2 ), 1 ); px[3] = mm256_rol_32( px[3], 1 );
casti_m256i( p0_index, y+3 ) = mm256_rol_32( px[4] = mm256_rol_32( px[4], 1 );
casti_m256i( p0_index, y+3 ), 1 ); px[5] = mm256_rol_32( px[5], 1 );
casti_m256i( p0_index, y+4 ) = mm256_rol_32( px[6] = mm256_rol_32( px[6], 1 );
casti_m256i( p0_index, y+4 ), 1 ); px[7] = mm256_rol_32( px[7], 1 );
casti_m256i( p0_index, y+5 ) = mm256_rol_32( }
casti_m256i( p0_index, y+5 ), 1 );
casti_m256i( p0_index, y+6 ) = mm256_rol_32(
casti_m256i( p0_index, y+6 ), 1 );
casti_m256i( p0_index, y+7 ) = mm256_rol_32(
casti_m256i( p0_index, y+7 ), 1 );
}
#else #else
for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m128i); y += 8) for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 )
{ {
casti_m128i( p0_index, y ) = mm128_rol_32( __m128i *px = (__m128i*)p0_index + x;
casti_m128i( p0_index, y ), 1 );
casti_m128i( p0_index, y+1 ) = mm128_rol_32( px[0] = mm128_rol_32( px[0], 1 );
casti_m128i( p0_index, y+1 ), 1 ); px[1] = mm128_rol_32( px[1], 1 );
casti_m128i( p0_index, y+2 ) = mm128_rol_32( px[2] = mm128_rol_32( px[2], 1 );
casti_m128i( p0_index, y+2 ), 1 ); px[3] = mm128_rol_32( px[3], 1 );
casti_m128i( p0_index, y+3 ) = mm128_rol_32( px[4] = mm128_rol_32( px[4], 1 );
casti_m128i( p0_index, y+3 ), 1 ); px[5] = mm128_rol_32( px[5], 1 );
casti_m128i( p0_index, y+4 ) = mm128_rol_32( px[6] = mm128_rol_32( px[6], 1 );
casti_m128i( p0_index, y+4 ), 1 ); px[7] = mm128_rol_32( px[7], 1 );
casti_m128i( p0_index, y+5 ) = mm128_rol_32( }
casti_m128i( p0_index, y+5 ), 1 );
casti_m128i( p0_index, y+6 ) = mm128_rol_32( #endif
casti_m128i( p0_index, y+6 ), 1 ); /*
casti_m128i( p0_index, y+7 ) = mm128_rol_32( for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x )
casti_m128i( p0_index, y+7 ), 1 ); p[x] = ( p[x] << 1 ) | ( p[x] >> 31 );
} */
}
#endif
static inline uint32_t rotl32( uint32_t a, size_t r )
{
return ( a << r ) | ( a >> (32-r) );
}
// Vectorized and targetted version of fnv1a
#if defined (__AVX2__)
#define MULXOR \
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );
#elif defined(__SSE41__)
#define MULXOR \
casti_m128i( hash, 0 ) = _mm_mullo_epi32( _mm_xor_si128( \
casti_m128i( hash, 0 ), casti_m128i( blob_off, 0 ) ), k ); \
casti_m128i( hash, 1 ) = _mm_mullo_epi32( _mm_xor_si128( \
casti_m128i( hash, 1 ), casti_m128i( blob_off, 1 ) ), k );
#else
#define MULXOR \
for ( size_t j = 0; j < VH_HASH_OUT_SIZE / sizeof(uint32_t); j++ ) \
hash[j] = fnv1a( hash[j], blob_off[j] ); \
#endif #endif
} #define UPDATE_ACCUMULATOR \
accumulator = fnv1a( accumulator, blob_off[0] ); \
accumulator = fnv1a( accumulator, blob_off[1] ); \
accumulator = fnv1a( accumulator, blob_off[2] ); \
accumulator = fnv1a( accumulator, blob_off[3] ); \
accumulator = fnv1a( accumulator, blob_off[4] ); \
accumulator = fnv1a( accumulator, blob_off[5] ); \
accumulator = fnv1a( accumulator, blob_off[6] ); \
accumulator = fnv1a( accumulator, blob_off[7] )
sha3( &input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE );
uint32_t* p1_32 = (uint32_t*)p1; // first pass no rotate
uint32_t* blob_bytes_32 = (uint32_t*)blob_bytes; #define ROUND_0 \
uint32_t value_accumulator = 0x811c9dc5; for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
{ \
const uint32_t *blob_off = blob + \
( ( fnv1a( subset[i], accumulator ) % mdiv ) \
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
UPDATE_ACCUMULATOR; \
MULXOR; \
}
// subsequent passes rotate by r on demand, no need for mass rotate
#define ROUND_r( r ) \
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
{ \
const uint32_t *blob_off = blob + \
( ( fnv1a( rotl32( subset[i], r ), accumulator ) % mdiv ) \
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
UPDATE_ACCUMULATOR; \
MULXOR; \
}
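ROUND_0 and ROUND_r drop the old scheme of copying the subset once per pass and mass-rotating every word by one bit; instead each looked-up index is rotated by r at the point of use. The two are equivalent because r successive 1-bit rotations equal a single r-bit rotation, which a quick standalone check confirms (sketch only):

#include <assert.h>
#include <stdint.h>

static uint32_t rotl32_ref( uint32_t a, unsigned r )
{
   return ( a << r ) | ( a >> ( 32 - r ) );   /* valid for r in 1..31 */
}

int main(void)
{
   uint32_t x = 0x80000001u, mass = x;
   for ( unsigned r = 1; r < 8; r++ )         /* a few passes */
   {
      mass = rotl32_ref( mass, 1 );           /* old: rotate stored copy */
      assert( mass == rotl32_ref( x, r ) );   /* new: rotate on lookup */
   }
   return 0;
}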
void verthash_hash( const void *blob_bytes, const size_t blob_size,
const void *input, void *output )
{
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64)));
uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64)));
const uint32_t *blob = (const uint32_t*)blob_bytes;
uint32_t accumulator = 0x811c9dc5;
const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE ) const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
/ VH_BYTE_ALIGNMENT ) + 1; / VH_BYTE_ALIGNMENT ) + 1;
#if defined (__AVX2__) #if defined (__AVX2__)
@@ -212,39 +253,14 @@ void verthash_hash( const unsigned char* blob_bytes,
const __m128i k = _mm_set1_epi32( 0x1000193 ); const __m128i k = _mm_set1_epi32( 0x1000193 );
#endif #endif
for ( size_t i = 0; i < VH_N_INDEXES; i++ ) sha3( input, VH_HEADER_SIZE, hash, VH_HASH_OUT_SIZE );
{ verthash_sha3_512_final_8( subset, ( (uint64_t*)input )[ 9 ] );
const uint32_t offset =
( fnv1a( seek_indexes[i], value_accumulator) % mdiv )
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) );
const uint32_t *blob_off = blob_bytes_32 + offset;
// update value accumulator for next seek index ROUND_0;
value_accumulator = fnv1a( value_accumulator, blob_off[0] ); for ( size_t r = 1; r < VH_N_ROT; ++r )
value_accumulator = fnv1a( value_accumulator, blob_off[1] ); ROUND_r( r );
value_accumulator = fnv1a( value_accumulator, blob_off[2] );
value_accumulator = fnv1a( value_accumulator, blob_off[3] );
value_accumulator = fnv1a( value_accumulator, blob_off[4] );
value_accumulator = fnv1a( value_accumulator, blob_off[5] );
value_accumulator = fnv1a( value_accumulator, blob_off[6] );
value_accumulator = fnv1a( value_accumulator, blob_off[7] );
#if defined (__AVX2__) memcpy( output, hash, VH_HASH_OUT_SIZE );
*(__m256i*)p1_32 = _mm256_mullo_epi32( _mm256_xor_si256(
*(__m256i*)p1_32, *(__m256i*)blob_off ), k );
#elif defined(__SSE41__)
casti_m128i( p1_32, 0 ) = _mm_mullo_epi32( _mm_xor_si128(
casti_m128i( p1_32, 0 ), casti_m128i( blob_off, 0 ) ), k );
casti_m128i( p1_32, 1 ) = _mm_mullo_epi32( _mm_xor_si128(
casti_m128i( p1_32, 1 ), casti_m128i( blob_off, 1 ) ), k );
#else
for ( size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++ )
p1_32[i2] = fnv1a( p1_32[i2], blob_off[i2] );
#endif
}
memcpy( output, p1, VH_HASH_OUT_SIZE );
} }
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------

View File

@@ -47,10 +47,8 @@ void verthash_info_free(verthash_info_t* info);
//! Generate verthash data file and save it to specified location. //! Generate verthash data file and save it to specified location.
int verthash_generate_data_file(const char* output_file_name); int verthash_generate_data_file(const char* output_file_name);
void verthash_hash(const unsigned char* blob_bytes, void verthash_hash( const void *blob_bytes, const size_t blob_size,
const size_t blob_size, const void *input, void *output );
const unsigned char(*input)[VH_HEADER_SIZE],
unsigned char(*output)[VH_HASH_OUT_SIZE]);
void verthash_sha3_512_prehash_72( const void *input ); void verthash_sha3_512_prehash_72( const void *input );
void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ); void verthash_sha3_512_final_8( void *hash, const uint64_t nonce );

View File

@@ -62,7 +62,7 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
__m256i vhashB[ 10 ] __attribute__ ((aligned (64))); __m256i vhashB[ 10 ] __attribute__ ((aligned (64)));
sha3_4way_ctx_t ctx; sha3_4way_ctx_t ctx;
__m256i vnonce = _mm256_set1_epi64x( nonce ); const __m256i vnonce = _mm256_set1_epi64x( nonce );
memcpy( &ctx, &sha3_mid_ctxA, sizeof ctx ); memcpy( &ctx, &sha3_mid_ctxA, sizeof ctx );
sha3_4way_update( &ctx, &vnonce, 8 ); sha3_4way_update( &ctx, &vnonce, 8 );
@@ -88,14 +88,13 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
#endif #endif
} }
int scanhash_verthash( struct work *work, uint32_t max_nonce, int scanhash_verthash( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ) uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t edata[20] __attribute__((aligned(64))); uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64))); uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 1; const uint32_t last_nonce = max_nonce - 1;
uint32_t n = first_nonce; uint32_t n = first_nonce;
@@ -109,8 +108,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
{ {
edata[19] = n; edata[19] = n;
verthash_hash( verthashInfo.data, verthashInfo.dataSize, verthash_hash( verthashInfo.data, verthashInfo.dataSize,
(const unsigned char (*)[80]) edata, edata, hash );
(unsigned char (*)[32]) hash );
if ( valid_hash( hash, ptarget ) && !bench ) if ( valid_hash( hash, ptarget ) && !bench )
{ {
pdata[19] = bswap_32( n ); pdata[19] = bswap_32( n );
@@ -123,17 +121,16 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
return 0; return 0;
} }
const char *default_verthash_data_file = "verthash.dat"; static const char *default_verthash_data_file = "verthash.dat";
bool register_verthash_algo( algo_gate_t* gate ) bool register_verthash_algo( algo_gate_t* gate )
{ {
opt_target_factor = 256.0; opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash; gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = AVX2_OPT; gate->optimizations = AVX2_OPT;
char *verthash_data_file = opt_data_file ? opt_data_file const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file; : default_verthash_data_file;
int vhLoadResult = verthash_info_init( &verthashInfo, verthash_data_file ); int vhLoadResult = verthash_info_init( &verthashInfo, verthash_data_file );
if (vhLoadResult == 0) // No Error if (vhLoadResult == 0) // No Error
@@ -160,7 +157,8 @@ bool register_verthash_algo( algo_gate_t* gate )
// Handle Verthash error codes // Handle Verthash error codes
if ( vhLoadResult == 1 ) if ( vhLoadResult == 1 )
{ {
applog( LOG_ERR, "Verthash data file not found: %s", verthash_data_file ); applog( LOG_ERR, "Verthash data file not found: %s",
verthash_data_file );
if ( !opt_data_file ) if ( !opt_data_file )
applog( LOG_NOTICE, "Add '--verify' to create verthash.dat"); applog( LOG_NOTICE, "Add '--verify' to create verthash.dat");
} }

View File

@@ -373,6 +373,45 @@ static inline void salsa20(salsa20_blk_t *restrict B,
#define INTEGERIFY (uint32_t)X.d[0] #define INTEGERIFY (uint32_t)X.d[0]
#endif #endif
// AVX512 ternary logic optimization
#if defined(__AVX512VL__)
#define XOR_X_XOR_X( in1, in2 ) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 );
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \
SALSA20(out)
#else
#define XOR_X_XOR_X( in1, in2 ) \
XOR_X( in1 ) \
XOR_X( in2 )
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
XOR_X_2( in1, in2 ) \
XOR_X( in3 )
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
XOR_X(in1) \
XOR_X(in2) \
SALSA20( out )
#endif
/** /**
* Apply the Salsa20 core to the block provided in X ^ in. * Apply the Salsa20 core to the block provided in X ^ in.
*/ */
@@ -406,11 +445,15 @@ static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1,
{ {
DECL_X DECL_X
XOR_X_2(Bin1[1], Bin2[1]) XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] )
XOR_X(Bin1[0]) // XOR_X_2(Bin1[1], Bin2[1])
// XOR_X(Bin1[0])
SALSA20_XOR_MEM(Bin2[0], Bout[0]) SALSA20_XOR_MEM(Bin2[0], Bout[0])
XOR_X(Bin1[1])
SALSA20_XOR_MEM(Bin2[1], Bout[1]) // Factor out the XOR from salsa20 to do a xor3
XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] )
// XOR_X(Bin1[1])
// SALSA20_XOR_MEM(Bin2[1], Bout[1])
return INTEGERIFY; return INTEGERIFY;
} }
@@ -745,13 +788,15 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
i = 0; i = 0;
r--; r--;
do { do {
XOR_X(Bin1[i]) XOR_X_XOR_X( Bin1[i], Bin2[i] )
XOR_X(Bin2[i]) // XOR_X(Bin1[i])
// XOR_X(Bin2[i])
PWXFORM PWXFORM
WRITE_X(Bout[i]) WRITE_X(Bout[i])
XOR_X(Bin1[i + 1]) XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] )
XOR_X(Bin2[i + 1]) // XOR_X(Bin1[i + 1])
// XOR_X(Bin2[i + 1])
PWXFORM PWXFORM
if (unlikely(i >= r)) if (unlikely(i >= r))

View File

@@ -35,7 +35,6 @@
#include "miner.h" #include "miner.h"
#include "simd-utils.h" #include "simd-utils.h"
#include "algo/sha/sph_sha2.h" #include "algo/sha/sph_sha2.h"
#include <openssl/sha.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {

View File

@@ -63,7 +63,7 @@ mv cpuminer cpuminer-avx
# Westmere SSE4.2 AES # Westmere SSE4.2 AES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe mv cpuminer.exe cpuminer-aes-sse42.exe

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.2. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.16.2' PACKAGE_VERSION='3.17.1'
PACKAGE_STRING='cpuminer-opt 3.16.2' PACKAGE_STRING='cpuminer-opt 3.17.1'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.16.2 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.16.2:";; short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.16.2 cpuminer-opt configure 3.17.1
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.16.2, which was It was created by cpuminer-opt $as_me 3.17.1, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.16.2' VERSION='3.17.1'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.16.2, which was This file was extended by cpuminer-opt $as_me 3.17.1, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.16.2 cpuminer-opt config.status 3.17.1
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.16.2]) AC_INIT([cpuminer-opt], [3.17.1])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -447,8 +447,10 @@ static bool work_decode( const json_t *val, struct work *work )
if ( !allow_mininginfo ) if ( !allow_mininginfo )
net_diff = algo_gate.calc_network_diff( work ); net_diff = algo_gate.calc_network_diff( work );
else
net_diff = hash_to_diff( work->target );
work->targetdiff = hash_to_diff( work->target ); work->targetdiff = net_diff;
stratum_diff = last_targetdiff = work->targetdiff; stratum_diff = last_targetdiff = work->targetdiff;
work->sharediff = 0; work->sharediff = 0;
algo_gate.decode_extra_data( work, &net_blocks ); algo_gate.decode_extra_data( work, &net_blocks );
@@ -482,13 +484,17 @@ static bool get_mininginfo( CURL *curl, struct work *work )
// "networkhashps": 56475980 // "networkhashps": 56475980
if ( res ) if ( res )
{ {
// net_diff is a global that is set from the work hash target by
// both getwork and GBT. Don't overwrite it; define a local to override
// the global.
double net_diff = 0.;
json_t *key = json_object_get( res, "difficulty" ); json_t *key = json_object_get( res, "difficulty" );
if ( key ) if ( key )
{ {
if ( json_is_object( key ) ) if ( json_is_object( key ) )
key = json_object_get( key, "proof-of-work" ); key = json_object_get( key, "proof-of-work" );
if ( json_is_real( key ) ) if ( json_is_real( key ) )
net_diff = work->targetdiff = json_real_value( key ); net_diff = json_real_value( key );
} }
key = json_object_get( res, "networkhashps" ); key = json_object_get( res, "networkhashps" );
@@ -908,6 +914,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
} }
for ( i = 0; i < ARRAY_SIZE( work->target ); i++ ) for ( i = 0; i < ARRAY_SIZE( work->target ); i++ )
work->target[7 - i] = be32dec( target + i ); work->target[7 - i] = be32dec( target + i );
net_diff = work->targetdiff = hash_to_diff( work->target );
tmp = json_object_get( val, "workid" ); tmp = json_object_get( val, "workid" );
if ( tmp ) if ( tmp )
@@ -1047,6 +1054,8 @@ void report_summary_log( bool force )
applog( LOG_NOTICE,"CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz", applog( LOG_NOTICE,"CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz",
tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6 ); tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6 );
if ( curr_temp > hi_temp ) hi_temp = curr_temp; if ( curr_temp > hi_temp ) hi_temp = curr_temp;
if ( ( opt_max_temp > 0.0 ) && ( curr_temp > opt_max_temp ) )
restart_threads();
prev_temp = curr_temp; prev_temp = curr_temp;
} }
} }
@@ -1145,7 +1154,7 @@ void report_summary_log( bool force )
if ( mismatch ) if ( mismatch )
{ {
if ( mismatch != 1 ) if ( mismatch != 1 )
applog(LOG_WARNING,"Share count mismatch: %d, stats may be incorrect", mismatch ); applog(LOG_WARNING,"Share count mismatch: %d, stats may be inaccurate", mismatch );
else else
applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" ); applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" );
} }
@@ -1165,7 +1174,8 @@ static int share_result( int result, struct work *work,
char bres[48]; char bres[48];
bool solved = false; bool solved = false;
bool stale = false; bool stale = false;
char *acol = NULL, *bcol = NULL, *scol = NULL, *rcol = NULL; char *acol, *bcol, *scol, *rcol;
acol = bcol = scol = rcol = "\0";
pthread_mutex_lock( &stats_lock ); pthread_mutex_lock( &stats_lock );
@@ -1207,7 +1217,7 @@ static int share_result( int result, struct work *work,
sprintf( sres, "S%d", stale_share_count ); sprintf( sres, "S%d", stale_share_count );
sprintf( rres, "R%d", rejected_share_count ); sprintf( rres, "R%d", rejected_share_count );
if unlikely( ( my_stats.net_diff > 0. ) if unlikely( ( my_stats.net_diff > 0. )
&& ( my_stats.share_diff >= net_diff ) ) && ( my_stats.share_diff >= my_stats.net_diff ) )
{ {
solved = true; solved = true;
solved_block_count++; solved_block_count++;
@@ -2085,10 +2095,10 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
sctx->block_height, net_diff, g_work->job_id ); sctx->block_height, net_diff, g_work->job_id );
else if ( !opt_quiet ) else if ( !opt_quiet )
{ {
unsigned char *xnonce2str = abin2hex( g_work->xnonce2, unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
g_work->xnonce2_len ); g_work->xnonce2_len );
applog( LOG_INFO, "Extranonce2 %s, Block %d, Net Diff %.5g", applog( LOG_INFO, "Extranonce2 %s, Block %d, Job %s",
xnonce2str, sctx->block_height, net_diff ); xnonce2str, sctx->block_height, g_work->job_id );
free( xnonce2str ); free( xnonce2str );
} }
@@ -2171,11 +2181,11 @@ static void *miner_thread( void *userdata )
/* Set worker threads to nice 19 and then preferentially to SCHED_IDLE /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE
* and if that fails, then SCHED_BATCH. No need for this to be an * and if that fails, then SCHED_BATCH. No need for this to be an
* error if it fails */ * error if it fails */
if (!opt_benchmark && opt_priority == 0) if ( !opt_priority )
{ {
setpriority(PRIO_PROCESS, 0, 19); setpriority(PRIO_PROCESS, 0, 19);
if ( !thr_id && !opt_quiet ) if ( !thr_id && opt_debug )
applog(LOG_INFO, "Miner thread priority %d (nice 19)", opt_priority ); applog(LOG_INFO, "Default miner thread priority %d (nice 19)", opt_priority );
drop_policy(); drop_policy();
} }
else else
@@ -2192,9 +2202,12 @@ static void *miner_thread( void *userdata )
case 4: prio = -10; break; case 4: prio = -10; break;
case 5: prio = -15; case 5: prio = -15;
} }
if ( !( thr_id || opt_quiet ) ) if ( !thr_id )
applog( LOG_INFO, "Miner thread priority %d (nice %d)", {
applog( LOG_INFO, "User set miner thread priority %d (nice %d)",
opt_priority, prio ); opt_priority, prio );
applog( LOG_WARNING, "High priority mining threads may cause system instability");
}
#endif #endif
setpriority(PRIO_PROCESS, 0, prio); setpriority(PRIO_PROCESS, 0, prio);
if ( opt_priority == 0 ) if ( opt_priority == 0 )
@@ -2439,7 +2452,7 @@ static void *miner_thread( void *userdata )
char hr_units[2] = {0,0}; char hr_units[2] = {0,0};
scale_hash_for_display( &hashrate, hr_units ); scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate ); sprintf( hr, "%.2f", hashrate );
#if ((defined(_WIN64) || defined(__WINDOWS__)) || defined(_WIN32)) #if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units ); applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
#else #else
float lo_freq = 0., hi_freq = 0.; float lo_freq = 0., hi_freq = 0.;
@@ -2739,10 +2752,10 @@ static void *stratum_thread(void *userdata )
stratum.url = strdup( rpc_url ); stratum.url = strdup( rpc_url );
applog(LOG_BLUE, "Connection changed to %s", short_url); applog(LOG_BLUE, "Connection changed to %s", short_url);
} }
else // if ( !opt_quiet ) else
applog(LOG_WARNING, "Stratum connection reset"); applog(LOG_WARNING, "Stratum connection reset");
// reset stats queue as well // reset stats queue as well
s_get_ptr = s_put_ptr = 0; if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0;
} }
while ( !stratum.curl ) while ( !stratum.curl )
@@ -2789,13 +2802,15 @@ static void *stratum_thread(void *userdata )
else else
{ {
applog(LOG_WARNING, "Stratum connection interrupted"); applog(LOG_WARNING, "Stratum connection interrupted");
stratum_disconnect( &stratum ); // stratum_disconnect( &stratum );
stratum_need_reset = true;
} }
} }
else else
{ {
applog(LOG_ERR, "Stratum connection timeout"); applog(LOG_ERR, "Stratum connection timeout");
stratum_disconnect( &stratum ); stratum_need_reset = true;
// stratum_disconnect( &stratum );
} }
} // loop } // loop
@@ -2843,7 +2858,6 @@ static bool cpu_capability( bool display_only )
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features ); bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
bool use_aes; bool use_aes;
bool use_sse2; bool use_sse2;
bool use_sse42;
bool use_avx2; bool use_avx2;
bool use_avx512; bool use_avx512;
bool use_sha; bool use_sha;
@@ -2917,13 +2931,14 @@ static bool cpu_capability( bool display_only )
if ( algo_features == EMPTY_SET ) printf( " None" ); if ( algo_features == EMPTY_SET ) printf( " None" );
else else
{ {
if ( algo_has_avx512 ) printf( " AVX512" ); if ( algo_has_avx512 ) printf( " AVX512" );
else if ( algo_has_avx2 ) printf( " AVX2 " ); else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" ); else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " ); else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ) printf( " VAES" ); if ( algo_has_vaes ||
else if ( algo_has_aes ) printf( " AES" ); algo_has_vaes256 ) printf( " VAES" );
if ( algo_has_sha ) printf( " SHA" ); else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
} }
printf("\n"); printf("\n");
@@ -2959,13 +2974,12 @@ static bool cpu_capability( bool display_only )
// Determine mining options // Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2; use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes use_vaes = cpu_has_vaes && sw_has_vaes && ( algo_has_vaes
&& ( use_avx512 || algo_has_vaes256 ); || algo_has_vaes256 );
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 ||
use_sha || use_vaes ); use_sha || use_vaes );
// Display best options // Display best options
@@ -2975,7 +2989,6 @@ static bool cpu_capability( bool display_only )
{ {
if ( use_avx512 ) printf( " AVX512" ); if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" ); else if ( use_avx2 ) printf( " AVX2" );
else if ( use_sse42 ) printf( " SSE4.2" );
else if ( use_sse2 ) printf( " SSE2" ); else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" ); if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" ); else if ( use_aes ) printf( " AES" );
@@ -3394,8 +3407,6 @@ void parse_arg(int key, char *arg )
v = atoi(arg); v = atoi(arg);
if (v < 0 || v > 5) /* sanity check */ if (v < 0 || v > 5) /* sanity check */
show_usage_and_exit(1); show_usage_and_exit(1);
// option is deprecated, show warning
applog( LOG_WARNING, "High priority mining threads may cause system instability");
opt_priority = v; opt_priority = v;
break; break;
case 'N': // N parameter for various scrypt algos case 'N': // N parameter for various scrypt algos

View File

@@ -307,6 +307,7 @@ extern json_t *json_rpc_call( CURL *curl, const char *url, const char *userpass,
extern void cbin2hex(char *out, const char *in, size_t len); extern void cbin2hex(char *out, const char *in, size_t len);
void bin2hex( char *s, const unsigned char *p, size_t len ); void bin2hex( char *s, const unsigned char *p, size_t len );
char *abin2hex( const unsigned char *p, size_t len ); char *abin2hex( const unsigned char *p, size_t len );
char *bebin2hex( const unsigned char *p, size_t len );
bool hex2bin( unsigned char *p, const char *hexstr, size_t len ); bool hex2bin( unsigned char *p, const char *hexstr, size_t len );
bool jobj_binary( const json_t *obj, const char *key, void *buf, bool jobj_binary( const json_t *obj, const char *key, void *buf,
size_t buflen ); size_t buflen );

View File

@@ -237,6 +237,25 @@ static inline void memset_128( __m128i *dst, const __m128i a, const int n )
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#if defined(__AVX512VL__)
// a ^ b ^ c
#define mm128_xor3( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x96 )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x78 )
#else
#define mm128_xor3( a, b, c ) \
_mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_xorand( a, b, c ) \
_mm_xor_si128( a, _mm_and_si128( b, c ) )
#endif
// //
// Bit rotations // Bit rotations

View File

@@ -136,9 +136,84 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_add4_8( a, b, c, d ) \ #define mm256_add4_8( a, b, c, d ) \
_mm256_add_epi8( _mm256_add_epi8( a, b ), _mm256_add_epi8( c, d ) ) _mm256_add_epi8( _mm256_add_epi8( a, b ), _mm256_add_epi8( c, d ) )
#if defined(__AVX512VL__)
// AVX512 has ternary logic that supports any 3 input boolean expression.
// a ^ b ^ c
#define mm256_xor3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x96 )
// legacy convenience only
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( a, mm256_xor3( b, c, d ) )
// a & b & c
#define mm256_and3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm256_or3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm256_xorand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm256_andxor( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm256_xoror( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c )
#define mm256_xorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm256_orand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) \
_mm256_ternarylogic_epi64( a, b, b, 0x81 )
#else
#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
#define mm256_xor4( a, b, c, d ) \ #define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) ) _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
#define mm256_and3( a, b, c ) \
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
#define mm256_or3( a, b, c ) \
_mm256_or_si256( a, _mm256_or_si256( b, c ) )
#define mm256_xorand( a, b, c ) \
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )
#define mm256_andxor( a, b, c ) \
_mm256_and_si256( a, _mm256_xor_si256( b, c ))
#define mm256_xoror( a, b, c ) \
_mm256_xor_si256( a, _mm256_or_si256( b, c ) )
#define mm256_xorandnot( a, b, c ) \
_mm256_xor_si256( a, _mm256_andnot_si256( b, c ) )
#define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
#define mm256_xnor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) )
#endif
// //
// Bit rotations. // Bit rotations.
// //
@@ -200,15 +275,17 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// //
// Rotate elements across all lanes. // Rotate elements across all lanes.
//
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element #if defined(__AVX512VL__)
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
#if defined(__AVX512F__) && defined(__AVX512VL__) static inline __m256i mm256_swap_128( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 2 ); }
static inline __m256i mm256_ror_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 1 ); }
static inline __m256i mm256_rol_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 3 ); }
static inline __m256i mm256_ror_1x32( const __m256i v ) static inline __m256i mm256_ror_1x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); } { return _mm256_alignr_epi32( v, v, 1 ); }
@@ -218,6 +295,13 @@ static inline __m256i mm256_rol_1x32( const __m256i v )
#else // AVX2 #else // AVX2
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element. // Rotate 256 bit vector by one 32 bit element.
#define mm256_ror_1x32( v ) \ #define mm256_ror_1x32( v ) \
_mm256_permutevar8x32_epi32( v, \ _mm256_permutevar8x32_epi32( v, \
@@ -229,6 +313,7 @@ static inline __m256i mm256_rol_1x32( const __m256i v )
m256_const_64( 0x0000000600000005, 0x0000000400000003, \ m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ) 0x0000000200000001, 0x0000000000000007 )
#endif // AVX512 else AVX2 #endif // AVX512 else AVX2
// //

View File

@@ -61,7 +61,7 @@
// //
// Additionally, permutations using smaller vectors can be more efficient // Additionally, permutations using smaller vectors can be more efficient
// if the permutation doesn't cross lane boundaries, typically 128 bits, // if the permutation doesn't cross lane boundaries, typically 128 bits,
// and the smnaller vector can use an imm comtrol. // and the smaller vector can use an imm control.
// //
// If the permutation doesn't cross lane boundaries a shuffle instruction // If the permutation doesn't cross lane boundaries a shuffle instruction
// can be used with imm control instead of permute. // can be used with imm control instead of permute.
@@ -107,7 +107,7 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
return v.m512i; return v.m512i;
} }
// Equivalent of set1, broadcast lo element all elements. // Equivalent of set1, broadcast lo element to all elements.
static inline __m512i m512_const1_256( const __m256i v ) static inline __m512i m512_const1_256( const __m256i v )
{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); } { return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }
@@ -166,7 +166,9 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
// Basic operations without SIMD equivalent // Basic operations without SIMD equivalent
// ~x // ~x
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 ) // #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
static inline __m512i mm512_not( const __m512i x )
{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
// -x // -x
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x ) #define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
@@ -221,11 +223,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_add4_8( a, b, c, d ) \ #define mm512_add4_8( a, b, c, d ) \
_mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) ) _mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) )
#define mm512_xor4( a, b, c, d ) \
_mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) )
// //
// Ternary logic uses an 8 bit truth table to define any 3 input logical
// operation using any combination of AND, OR, XOR, NOT.
// a ^ b ^ c
#define mm512_xor3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x96 )
// legacy convenience only
#define mm512_xor4( a, b, c, d ) \
_mm512_xor_si512( a, mm512_xor3( b, c, d ) )
// a & b & c
#define mm512_and3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm512_or3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm512_xorand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm512_andxor( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm512_xoror( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c ) [ xor( a, andnot( b, c ) ) ]
#define mm512_xorandnot( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm512_orand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xf8 )
// Some 2 input operations that don't have their own instruction mnemonic.
// ~( a | b )
#define mm512_nor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), same as (~a) ^ b
#define mm512_xnor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x81 )
// ~( a & b )
#define mm512_nand( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x3f )
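Each immediate above is simply the 8-entry truth table of the expression: bit i of the constant is the result for inputs a = bit 2 of i, b = bit 1 and c = bit 0. A throwaway sketch that derives the constants (tl_imm8 is a hypothetical helper, not part of this codebase):

#include <stdint.h>
#include <stdio.h>

static uint8_t tl_imm8( int (*f)( int, int, int ) )
{
   uint8_t imm = 0;
   for ( int i = 0; i < 8; i++ )
      imm |= (uint8_t)( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) << i );
   return imm;
}

static int f_xor3  ( int a, int b, int c ) { return a ^ b ^ c; }
static int f_xorand( int a, int b, int c ) { return a ^ ( b & c ); }
static int f_nand  ( int a, int b, int c ) { (void)c; return !( a & b ); }

int main(void)
{
   printf( "xor3   0x%02x\n", tl_imm8( f_xor3 ) );    /* prints 0x96 */
   printf( "xorand 0x%02x\n", tl_imm8( f_xorand ) );  /* prints 0x78 */
   printf( "nand   0x%02x\n", tl_imm8( f_nand ) );    /* prints 0x3f */
   return 0;
}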
// Bit rotations. // Bit rotations.
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit // AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit

util.c
View File

@@ -795,6 +795,15 @@ char *abin2hex(const unsigned char *p, size_t len)
return s; return s;
} }
char *bebin2hex(const unsigned char *p, size_t len)
{
char *s = (char*) malloc((len * 2) + 1);
if (!s) return NULL;
for ( size_t i = 0, j = len - 1; i < len; i++, j-- )
sprintf( s + ( i*2 ), "%02x", (unsigned int) p[ j ] );
return s;
}
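bebin2hex walks the buffer from the last byte to the first, so callers get big-endian display order where abin2hex gives memory order. Illustrative usage, assuming the two declarations are in scope:

#include <stdio.h>
#include <stdlib.h>

char *abin2hex( const unsigned char *p, size_t len );
char *bebin2hex( const unsigned char *p, size_t len );

int main(void)
{
   unsigned char xn2[4] = { 0x01, 0x02, 0x03, 0x04 };
   char *be = bebin2hex( xn2, 4 );   /* "04030201": last byte first */
   char *le = abin2hex( xn2, 4 );    /* "01020304": memory order */
   printf( "%s %s\n", be, le );
   free( be );
   free( le );
   return 0;
}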
bool hex2bin(unsigned char *p, const char *hexstr, size_t len) bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
{ {
char hex_byte[3]; char hex_byte[3];
@@ -1789,10 +1798,14 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p
if ( !stratum_handle_method( sctx, sret ) ) if ( !stratum_handle_method( sctx, sret ) )
applog( LOG_WARNING, "Stratum answer id is not correct!" ); applog( LOG_WARNING, "Stratum answer id is not correct!" );
} }
res_val = json_object_get( extra, "result" ); else
if (opt_debug && (!res_val || json_is_false(res_val))) {
applog(LOG_DEBUG, "Method extranonce.subscribe is not supported"); res_val = json_object_get( extra, "result" );
json_decref( extra ); if ( opt_debug && ( !res_val || json_is_false( res_val ) ) )
applog( LOG_DEBUG,
"Method extranonce.subscribe is not supported" );
}
json_decref( extra );
} }
free(sret); free(sret);
} }

View File

@@ -107,7 +107,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
# Westmere SSE4.2 AES # Westmere SSE4.2 AES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS CFLAGS="-O3 -march=westmere -maes -Wall" ./configure $CONFIGURE_ARGS
#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS #CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe