Mirror of https://github.com/JayDDee/cpuminer-opt.git
Synced 2025-09-17 23:44:27 +00:00

Compare commits

5 Commits
9b905fccc8
92b3733925
19cc88d102
a053690170
3c5e8921b7
@@ -163,6 +163,8 @@ cpuminer_SOURCES = \
   algo/sha/sph_sha2big.c \
   algo/sha/sha256-hash-4way.c \
   algo/sha/sha512-hash-4way.c \
   algo/sha/sha256-hash-opt.c \
   algo/sha/sha256-hash-2way-ni.c \
   algo/sha/hmac-sha256-hash.c \
   algo/sha/hmac-sha256-hash-4way.c \
   algo/sha/sha2.c \
@@ -64,6 +64,11 @@ source code obtained from the author's official repository. The exact
procedure is documented in the build instructions for Windows:
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source

Some DLL files may already be installed on the system by Windows or third
party packages. They will often work and may be used instead of the included
files. Unless there is a compelling reason to do otherwise, it's recommended
to use the included files as they are packaged.

If you like this software feel free to donate:

BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
@@ -65,11 +65,39 @@ If not what makes it happen or not happen?

Change Log
----------

v3.17.1

Fixed Windows build for AES+SSE4.2 (Westmere); AES was missing.
More ternary logic optimizations for AVX512, AVX512+VAES, and AVX512+AES.
Fixed my-gr algo for VAES.

v3.17.0

AVX512 optimized using ternary logic instructions.
Faster sha256t on all CPU architectures: AVX512 +30%, SHA +30%, AVX2 +9%.
Use SHA on supported CPUs to produce merkle hash.
Fixed byte order in Extranonce2 log & replaced Block height with Job ID.
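The ternary logic optimizations above lean on AVX-512's vpternlog instruction, which evaluates any three-input Boolean function selected by an 8-bit truth table in a single operation. A minimal sketch of a three-way XOR helper built that way (the helper name is illustrative, not the repo's; 0x96 is the truth table for a ^ b ^ c):

#include <immintrin.h>

// Requires AVX512F. Each destination bit is looked up in the truth
// table 0x96 = 10010110b, which encodes a ^ b ^ c.
static inline __m512i xor3_512( __m512i a, __m512i b, __m512i c )
{
   return _mm512_ternarylogic_epi64( a, b, c, 0x96 );
}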
v3.16.5

#329: Fixed GBT incorrect target diff in stats, second attempt.
Fixed formatting error in share result log when --no-color option is used.

v3.16.4

Faster sha512 and sha256 when not using SHA CPU extension.
#329: Fixed GBT incorrect target diff in stats.

v3.16.3

#313 Fix compile error with GCC 11.
Incremental improvements to verthash.

v3.16.2

Verthash: midstate prehash optimization for all architectures.
Verthash: AVX2 optimization.
GBT: added support for Bech32 addresses, untested.
GBT: added support for Bech32 addresses.
Linux: added CPU frequency to benchmark log.
Fixed integer overflow in time calculations.

@@ -111,7 +139,6 @@ RPC getmininginfo method.

v3.15.5

Fix stratum jobs lost if 2 jobs received in less than one second.


v3.15.4
@@ -180,6 +180,7 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
                         const void *data, size_t len );
void blake512_8way_hash_le80( void *hash, const void *data );

#endif // AVX512
#endif // AVX2
@@ -669,14 +669,14 @@ do { \
      ROUND_S_8WAY(2); \
      ROUND_S_8WAY(3); \
   } \
   H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
   H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
   H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
   H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
   H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
   H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
   H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
   H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
   H0 = mm256_xor3( V8, V0, H0 ); \
   H1 = mm256_xor3( V9, V1, H1 ); \
   H2 = mm256_xor3( VA, V2, H2 ); \
   H3 = mm256_xor3( VB, V3, H3 ); \
   H4 = mm256_xor3( VC, V4, H4 ); \
   H5 = mm256_xor3( VD, V5, H5 ); \
   H6 = mm256_xor3( VE, V6, H6 ); \
   H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
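mm256_xor3 collapses the chained two-input XORs above into one operation. On 256-bit registers the single-instruction form needs AVX512VL; a plausible definition, shown here as an assumption rather than the repo's actual helper (which lives in its simd utility headers):

#include <immintrin.h>

#if defined(__AVX512VL__)
// one vpternlog on a YMM register (truth table 0x96 = a ^ b ^ c)
#define mm256_xor3( a, b, c ) \
   _mm256_ternarylogic_epi64( a, b, c, 0x96 )
#else
// plain AVX2 fallback: two chained XORs
#define mm256_xor3( a, b, c ) \
   _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
#endif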
@@ -808,14 +808,14 @@ do { \
      ROUND_S_16WAY(2); \
      ROUND_S_16WAY(3); \
   } \
   H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
   H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
   H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
   H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
   H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
   H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
   H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
   H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
   H0 = mm512_xor3( V8, V0, H0 ); \
   H1 = mm512_xor3( V9, V1, H1 ); \
   H2 = mm512_xor3( VA, V2, H2 ); \
   H3 = mm512_xor3( VB, V3, H3 ); \
   H4 = mm512_xor3( VC, V4, H4 ); \
   H5 = mm512_xor3( VD, V5, H5 ); \
   H6 = mm512_xor3( VE, V6, H6 ); \
   H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)

#endif
@@ -122,14 +122,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
      B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
   }

   ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
   ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
   ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
   ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
   ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
   ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
   ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
   ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
   ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] );
   ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] );
   ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] );
   ctx->h[3] = mm512_xor3( ctx->h[3], v[3], v[11] );
   ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] );
   ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] );
   ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] );
   ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
}

int blake2b_8way_init( blake2b_8way_ctx *ctx )
@@ -17,7 +17,7 @@

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

ALIGN(128) typedef struct {
typedef struct ALIGN( 64 ) {
   __m512i b[16];   // input buffer
   __m512i h[8];    // chained state
   uint64_t t[2];   // total number of bytes
@@ -35,7 +35,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
#if defined(__AVX2__)

// state context
ALIGN(128) typedef struct {
typedef struct ALIGN( 64 ) {
   __m256i b[16];   // input buffer
   __m256i h[8];    // chained state
   uint64_t t[2];   // total number of bytes
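Both hunks move the alignment attribute to after the struct keyword and relax it from 128 to 64 bytes. With GCC and clang, an attribute written before typedef attaches to the declaration rather than the type, where it can be ignored or rejected; attached to the struct type it is carried by every object. A minimal sketch, assuming ALIGN(x) expands to __attribute__((aligned(x))):

#include <stdint.h>

// the attribute is part of the type: every instance is 64-byte aligned
typedef struct __attribute__((aligned(64))) {
   uint8_t buf[128];
} aligned_ctx_t;

_Static_assert( _Alignof(aligned_ctx_t) == 64, "alignment travels with the type" );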
@@ -4,7 +4,6 @@
#include <stdint.h>
#include "algo-gate-api.h"

//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#define BLAKE2S_4WAY
#endif
@@ -27,8 +26,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,

#elif defined (BLAKE2S_8WAY)

//#if defined(BLAKE2S_8WAY)

void blake2s_8way_hash( void *state, const void *input );
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
@@ -368,7 +368,7 @@ do { \
   ROUND8W( 9 );

   for( size_t i = 0; i < 8; ++i )
      S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
      S->h[i] = mm256_xor3( S->h[i], v[i], v[i + 8] );

#undef G8W
#undef ROUND8W
@@ -566,7 +566,7 @@ do { \
   ROUND16W( 9 );

   for( size_t i = 0; i < 8; ++i )
      S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
      S->h[i] = mm512_xor3( S->h[i], v[i], v[i + 8] );

#undef G16W
#undef ROUND16W
@@ -60,7 +60,7 @@ typedef struct __blake2s_nway_param
} blake2s_nway_param;
#pragma pack(pop)

ALIGN( 64 ) typedef struct __blake2s_4way_state
typedef struct ALIGN( 64 ) __blake2s_4way_state
{
   __m128i h[8];
   uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
@@ -80,7 +80,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,

#if defined(__AVX2__)

ALIGN( 64 ) typedef struct __blake2s_8way_state
typedef struct ALIGN( 64 ) __blake2s_8way_state
{
   __m256i h[8];
   uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
@@ -101,7 +101,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

ALIGN( 128 ) typedef struct __blake2s_16way_state
typedef struct ALIGN( 64 ) __blake2s_16way_state
{
   __m512i h[8];
   uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
@@ -293,10 +293,6 @@ static const sph_u64 CB[16] = {
   H5 = (state)->H[5]; \
   H6 = (state)->H[6]; \
   H7 = (state)->H[7]; \
   S0 = (state)->S[0]; \
   S1 = (state)->S[1]; \
   S2 = (state)->S[2]; \
   S3 = (state)->S[3]; \
   T0 = (state)->T0; \
   T1 = (state)->T1; \
} while (0)
@@ -310,10 +306,6 @@ static const sph_u64 CB[16] = {
   (state)->H[5] = H5; \
   (state)->H[6] = H6; \
   (state)->H[7] = H7; \
   (state)->S[0] = S0; \
   (state)->S[1] = S1; \
   (state)->S[2] = S2; \
   (state)->S[3] = S3; \
   (state)->T0 = T0; \
   (state)->T1 = T1; \
} while (0)
@@ -348,7 +340,6 @@ static const sph_u64 CB[16] = {

#define DECL_STATE64_8WAY \
   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
   __m512i S0, S1, S2, S3; \
   uint64_t T0, T1;

#define COMPRESS64_8WAY( buf ) do \
@@ -366,10 +357,10 @@ static const sph_u64 CB[16] = {
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
   V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \
   V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \
   VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \
   VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \
   V8 = m512_const1_64( CB0 ); \
   V9 = m512_const1_64( CB1 ); \
   VA = m512_const1_64( CB2 ); \
   VB = m512_const1_64( CB3 ); \
   VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
                          m512_const1_64( CB4 ) ); \
   VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
@@ -414,14 +405,14 @@ static const sph_u64 CB[16] = {
   ROUND_B_8WAY(3); \
   ROUND_B_8WAY(4); \
   ROUND_B_8WAY(5); \
   H0 = mm512_xor4( V8, V0, S0, H0 ); \
   H1 = mm512_xor4( V9, V1, S1, H1 ); \
   H2 = mm512_xor4( VA, V2, S2, H2 ); \
   H3 = mm512_xor4( VB, V3, S3, H3 ); \
   H4 = mm512_xor4( VC, V4, S0, H4 ); \
   H5 = mm512_xor4( VD, V5, S1, H5 ); \
   H6 = mm512_xor4( VE, V6, S2, H6 ); \
   H7 = mm512_xor4( VF, V7, S3, H7 ); \
   H0 = mm512_xor3( V8, V0, H0 ); \
   H1 = mm512_xor3( V9, V1, H1 ); \
   H2 = mm512_xor3( VA, V2, H2 ); \
   H3 = mm512_xor3( VB, V3, H3 ); \
   H4 = mm512_xor3( VC, V4, H4 ); \
   H5 = mm512_xor3( VD, V5, H5 ); \
   H6 = mm512_xor3( VE, V6, H6 ); \
   H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
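The deleted S0..S3 handling in the other hunks of this file explains this change too: the miner never sets BLAKE-512's salt, so S is identically zero, V8 = S0 ^ CB0 collapses to CB0, and the four-input finalization collapses to the three-input form. A scalar model of the simplification (names illustrative):

#include <stdint.h>

// salted finalization: h0' = v8 ^ v0 ^ s0 ^ h0
// with the salt word s0 fixed at 0 this is just a three-way XOR
static inline uint64_t finalize_h0( uint64_t v8, uint64_t v0, uint64_t h0 )
{
   return v8 ^ v0 ^ h0;   // == v8 ^ v0 ^ 0 ^ h0
}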
void blake512_8way_compress( blake_8way_big_context *sc )
@@ -440,10 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
   V5 = sc->H[5];
   V6 = sc->H[6];
   V7 = sc->H[7];
   V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );
   V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
   VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
   VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
   V8 = m512_const1_64( CB0 );
   V9 = m512_const1_64( CB1 );
   VA = m512_const1_64( CB2 );
   VB = m512_const1_64( CB3 );
   VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
                          m512_const1_64( CB4 ) );
   VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
@@ -492,19 +483,18 @@ void blake512_8way_compress( blake_8way_big_context *sc )
   ROUND_B_8WAY(4);
   ROUND_B_8WAY(5);

   sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );
   sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
   sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
   sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
   sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
   sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
   sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
   sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
   sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
   sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
   sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
   sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
   sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
   sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
   sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
   sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
}

void blake512_8way_init( blake_8way_big_context *sc )
{
   __m512i zero = m512_zero;
   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
@@ -514,11 +504,6 @@ void blake512_8way_init( blake_8way_big_context *sc )
   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );

   casti_m512i( sc->S, 0 ) = zero;
   casti_m512i( sc->S, 1 ) = zero;
   casti_m512i( sc->S, 2 ) = zero;
   casti_m512i( sc->S, 3 ) = zero;

   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
}
@@ -641,11 +626,6 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );

   casti_m512i( sc->S, 0 ) = m512_zero;
   casti_m512i( sc->S, 1 ) = m512_zero;
   casti_m512i( sc->S, 2 ) = m512_zero;
   casti_m512i( sc->S, 3 ) = m512_zero;

   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
@@ -740,7 +720,6 @@ blake512_8way_close(void *cc, void *dst)

#define DECL_STATE64_4WAY \
   __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
   __m256i S0, S1, S2, S3; \
   uint64_t T0, T1;

#define COMPRESS64_4WAY do \
@@ -758,10 +737,10 @@ blake512_8way_close(void *cc, void *dst)
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
   V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \
   V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \
   VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \
   VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \
   V8 = m256_const1_64( CB0 ); \
   V9 = m256_const1_64( CB1 ); \
   VA = m256_const1_64( CB2 ); \
   VB = m256_const1_64( CB3 ); \
   VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
                          m256_const1_64( CB4 ) ); \
   VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
@@ -804,14 +783,14 @@ blake512_8way_close(void *cc, void *dst)
   ROUND_B_4WAY(3); \
   ROUND_B_4WAY(4); \
   ROUND_B_4WAY(5); \
   H0 = mm256_xor4( V8, V0, S0, H0 ); \
   H1 = mm256_xor4( V9, V1, S1, H1 ); \
   H2 = mm256_xor4( VA, V2, S2, H2 ); \
   H3 = mm256_xor4( VB, V3, S3, H3 ); \
   H4 = mm256_xor4( VC, V4, S0, H4 ); \
   H5 = mm256_xor4( VD, V5, S1, H5 ); \
   H6 = mm256_xor4( VE, V6, S2, H6 ); \
   H7 = mm256_xor4( VF, V7, S3, H7 ); \
   H0 = mm256_xor3( V8, V0, H0 ); \
   H1 = mm256_xor3( V9, V1, H1 ); \
   H2 = mm256_xor3( VA, V2, H2 ); \
   H3 = mm256_xor3( VB, V3, H3 ); \
   H4 = mm256_xor3( VC, V4, H4 ); \
   H5 = mm256_xor3( VD, V5, H5 ); \
   H6 = mm256_xor3( VE, V6, H6 ); \
   H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)

@@ -831,10 +810,10 @@ void blake512_4way_compress( blake_4way_big_context *sc )
   V5 = sc->H[5];
   V6 = sc->H[6];
   V7 = sc->H[7];
   V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );
   V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
   VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
   VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
   V8 = m256_const1_64( CB0 );
   V9 = m256_const1_64( CB1 );
   VA = m256_const1_64( CB2 );
   VB = m256_const1_64( CB3 );
   VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
                          m256_const1_64( CB4 ) );
   VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
@@ -880,19 +859,18 @@ void blake512_4way_compress( blake_4way_big_context *sc )
   ROUND_B_4WAY(4);
   ROUND_B_4WAY(5);

   sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );
   sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
   sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
   sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
   sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
   sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
   sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
   sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
   sc->H[0] = mm256_xor3( V8, V0, sc->H[0] );
   sc->H[1] = mm256_xor3( V9, V1, sc->H[1] );
   sc->H[2] = mm256_xor3( VA, V2, sc->H[2] );
   sc->H[3] = mm256_xor3( VB, V3, sc->H[3] );
   sc->H[4] = mm256_xor3( VC, V4, sc->H[4] );
   sc->H[5] = mm256_xor3( VD, V5, sc->H[5] );
   sc->H[6] = mm256_xor3( VE, V6, sc->H[6] );
   sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
}

void blake512_4way_init( blake_4way_big_context *sc )
{
   __m256i zero = m256_zero;
   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
@@ -902,11 +880,6 @@ void blake512_4way_init( blake_4way_big_context *sc )
   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );

   casti_m256i( sc->S, 0 ) = zero;
   casti_m256i( sc->S, 1 ) = zero;
   casti_m256i( sc->S, 2 ) = zero;
   casti_m256i( sc->S, 3 ) = zero;

   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
}
@@ -1026,11 +999,6 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );

   casti_m256i( sc->S, 0 ) = m256_zero;
   casti_m256i( sc->S, 1 ) = m256_zero;
   casti_m256i( sc->S, 2 ) = m256_zero;
   casti_m256i( sc->S, 3 ) = m256_zero;

   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
@@ -323,7 +323,7 @@ int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )

int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
{
   blake2s_state S[1];
   blake2s_state S;

   /* Verify parameters */
   if ( NULL == in ) return -1;
@@ -334,15 +334,15 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen

   if( keylen > 0 )
   {
      if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
      if( blake2s_init_key( &S, outlen, key, keylen ) < 0 ) return -1;
   }
   else
   {
      if( blake2s_init( S, outlen ) < 0 ) return -1;
      if( blake2s_init( &S, outlen ) < 0 ) return -1;
   }

   blake2s_update( S, ( uint8_t * )in, inlen );
   blake2s_final( S, out, outlen );
   blake2s_update( &S, ( uint8_t * )in, inlen );
   blake2s_final( &S, out, outlen );
   return 0;
}
@@ -116,7 +116,7 @@ extern "C" {
   uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
} blake2s_param;

ALIGN( 64 ) typedef struct __blake2s_state
typedef struct ALIGN( 64 ) __blake2s_state
{
   uint32_t h[8];
   uint32_t t[2];
@@ -18,7 +18,7 @@
#endif

// state context
ALIGN(64) typedef struct {
typedef ALIGN(64) struct {
   uint8_t b[128];   // input buffer
   uint64_t h[8];    // chained state
   uint64_t t[2];    // total number of bytes
@@ -867,40 +867,35 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
   qt[30] = expand2s8( qt, M, H, 30 );
   qt[31] = expand2s8( qt, M, H, 31 );

   xl = _mm256_xor_si256(
           mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
           mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
   xh = _mm256_xor_si256( xl, _mm256_xor_si256(
           mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
           mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
   xl = mm256_xor3( mm256_xor3( qt[16], qt[17], qt[18] ),
                    mm256_xor3( qt[19], qt[20], qt[21] ),
                    _mm256_xor_si256( qt[22], qt[23] ) );

   xh = mm256_xor3( mm256_xor3( xl, qt[24], qt[25] ),
                    mm256_xor3( qt[26], qt[27], qt[28] ),
                    mm256_xor3( qt[29], qt[30], qt[31] ) );
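Regrouping the eight q-words into three-input XOR trees roughly halves the instruction count relative to the chained two-input form. A generic AVX-512 illustration of the same reduction shape (not the repo's code):

#include <immintrin.h>

// XOR of eight vectors: three ternary-logic ops plus one plain XOR,
// versus seven chained two-input XOR instructions
static inline __m512i xor8_512( const __m512i q[8] )
{
   __m512i a = _mm512_ternarylogic_epi64( q[0], q[1], q[2], 0x96 );
   __m512i b = _mm512_ternarylogic_epi64( q[3], q[4], q[5], 0x96 );
   return _mm512_ternarylogic_epi64( a, b,
                                     _mm512_xor_si512( q[6], q[7] ), 0x96 );
}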
#define DH1L( m, sl, sr, a, b, c ) \
   _mm256_add_epi32( \
      _mm256_xor_si256( M[m], \
         _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
                           _mm256_srli_epi32( qt[a], sr ) ) ), \
      _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
   _mm256_add_epi32( mm256_xor3( M[m], _mm256_slli_epi32( xh, sl ), \
                                 _mm256_srli_epi32( qt[a], sr ) ), \
                     mm256_xor3( xl, qt[b], qt[c] ) )

#define DH1R( m, sl, sr, a, b, c ) \
   _mm256_add_epi32( \
      _mm256_xor_si256( M[m], \
         _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
                           _mm256_slli_epi32( qt[a], sr ) ) ), \
      _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
   _mm256_add_epi32( mm256_xor3( M[m], _mm256_srli_epi32( xh, sl ), \
                                 _mm256_slli_epi32( qt[a], sr ) ), \
                     mm256_xor3( xl, qt[b], qt[c] ) )

#define DH2L( m, rl, sl, h, a, b, c ) \
   _mm256_add_epi32( _mm256_add_epi32( \
      mm256_rol_32( dH[h], rl ), \
      _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
      _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
                        _mm256_xor_si256( qt[b], qt[c] ) ) );
      mm256_rol_32( dH[h], rl ), \
      mm256_xor3( xh, qt[a], M[m] ) ), \
      mm256_xor3( _mm256_slli_epi32( xl, sl ), qt[b], qt[c] ) )

#define DH2R( m, rl, sr, h, a, b, c ) \
   _mm256_add_epi32( _mm256_add_epi32( \
      mm256_rol_32( dH[h], rl ), \
      _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
      _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
                        _mm256_xor_si256( qt[b], qt[c] ) ) );
      mm256_rol_32( dH[h], rl ), \
      mm256_xor3( xh, qt[a], M[m] ) ), \
      mm256_xor3( _mm256_srli_epi32( xl, sr ), qt[b], qt[c] ) )

   dH[ 0] = DH1L(  0,  5,  5, 16, 24,  0 );
   dH[ 1] = DH1R(  1,  7,  8, 17, 25,  1 );
@@ -924,88 +919,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
#undef DH2L
#undef DH2R

/*
   dH[ 0] = _mm256_add_epi32(
              _mm256_xor_si256( M[0],
                 _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
                                   _mm256_srli_epi32( qt[16], 5 ) ) ),
              _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
   dH[ 1] = _mm256_add_epi32(
              _mm256_xor_si256( M[1],
                 _mm256_xor_si256( _mm256_srli_epi32( xh, 7 ),
                                   _mm256_slli_epi32( qt[17], 8 ) ) ),
              _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
   dH[ 2] = _mm256_add_epi32(
              _mm256_xor_si256( M[2],
                 _mm256_xor_si256( _mm256_srli_epi32( xh, 5 ),
                                   _mm256_slli_epi32( qt[18], 5 ) ) ),
              _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
   dH[ 3] = _mm256_add_epi32(
              _mm256_xor_si256( M[3],
                 _mm256_xor_si256( _mm256_srli_epi32( xh, 1 ),
                                   _mm256_slli_epi32( qt[19], 5 ) ) ),
              _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
   dH[ 4] = _mm256_add_epi32(
              _mm256_xor_si256( M[4],
                 _mm256_xor_si256( _mm256_srli_epi32( xh, 3 ),
                                   _mm256_slli_epi32( qt[20], 0 ) ) ),
              _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
   dH[ 5] = _mm256_add_epi32(
              _mm256_xor_si256( M[5],
                 _mm256_xor_si256( _mm256_slli_epi32( xh, 6 ),
                                   _mm256_srli_epi32( qt[21], 6 ) ) ),
              _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
   dH[ 6] = _mm256_add_epi32(
              _mm256_xor_si256( M[6],
                 _mm256_xor_si256( _mm256_srli_epi32( xh, 4 ),
                                   _mm256_slli_epi32( qt[22], 6 ) ) ),
              _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
   dH[ 7] = _mm256_add_epi32(
              _mm256_xor_si256( M[7],
                 _mm256_xor_si256( _mm256_srli_epi32( xh, 11 ),
                                   _mm256_slli_epi32( qt[23], 2 ) ) ),
              _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
   dH[ 8] = _mm256_add_epi32( _mm256_add_epi32(
              mm256_rol_32( dH[4], 9 ),
              _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
              _mm256_xor_si256( _mm256_slli_epi32( xl, 8 ),
                                _mm256_xor_si256( qt[23], qt[ 8] ) ) );
   dH[ 9] = _mm256_add_epi32( _mm256_add_epi32(
              mm256_rol_32( dH[5], 10 ),
              _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
              _mm256_xor_si256( _mm256_srli_epi32( xl, 6 ),
                                _mm256_xor_si256( qt[16], qt[ 9] ) ) );
   dH[10] = _mm256_add_epi32( _mm256_add_epi32(
              mm256_rol_32( dH[6], 11 ),
              _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
              _mm256_xor_si256( _mm256_slli_epi32( xl, 6 ),
                                _mm256_xor_si256( qt[17], qt[10] ) ) );
   dH[11] = _mm256_add_epi32( _mm256_add_epi32(
              mm256_rol_32( dH[7], 12 ),
              _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
              _mm256_xor_si256( _mm256_slli_epi32( xl, 4 ),
                                _mm256_xor_si256( qt[18], qt[11] ) ) );
   dH[12] = _mm256_add_epi32( _mm256_add_epi32(
              mm256_rol_32( dH[0], 13 ),
              _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
              _mm256_xor_si256( _mm256_srli_epi32( xl, 3 ),
                                _mm256_xor_si256( qt[19], qt[12] ) ) );
   dH[13] = _mm256_add_epi32( _mm256_add_epi32(
              mm256_rol_32( dH[1], 14 ),
              _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
              _mm256_xor_si256( _mm256_srli_epi32( xl, 4 ),
                                _mm256_xor_si256( qt[20], qt[13] ) ) );
   dH[14] = _mm256_add_epi32( _mm256_add_epi32(
              mm256_rol_32( dH[2], 15 ),
              _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
              _mm256_xor_si256( _mm256_srli_epi32( xl, 7 ),
                                _mm256_xor_si256( qt[21], qt[14] ) ) );
   dH[15] = _mm256_add_epi32( _mm256_add_epi32(
              mm256_rol_32( dH[3], 16 ),
              _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
              _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
                                _mm256_xor_si256( qt[22], qt[15] ) ) );
*/
}

static const __m256i final_s8[16] =
@@ -1422,40 +1335,35 @@ void compress_small_16way( const __m512i *M, const __m512i H[16],
   qt[30] = expand2s16( qt, M, H, 30 );
   qt[31] = expand2s16( qt, M, H, 31 );

   xl = _mm512_xor_si512(
           mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
           mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
   xh = _mm512_xor_si512( xl, _mm512_xor_si512(
           mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
           mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
   xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
                    mm512_xor3( qt[19], qt[20], qt[21] ),
                    _mm512_xor_si512( qt[22], qt[23] ) );

   xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
                    mm512_xor3( qt[26], qt[27], qt[28] ),
                    mm512_xor3( qt[29], qt[30], qt[31] ) );

#define DH1L( m, sl, sr, a, b, c ) \
   _mm512_add_epi32( \
      _mm512_xor_si512( M[m], \
         _mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
                           _mm512_srli_epi32( qt[a], sr ) ) ), \
      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
   _mm512_add_epi32( mm512_xor3( M[m], _mm512_slli_epi32( xh, sl ), \
                                 _mm512_srli_epi32( qt[a], sr ) ), \
                     mm512_xor3( xl, qt[b], qt[c] ) )

#define DH1R( m, sl, sr, a, b, c ) \
   _mm512_add_epi32( \
      _mm512_xor_si512( M[m], \
         _mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \
                           _mm512_slli_epi32( qt[a], sr ) ) ), \
      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
   _mm512_add_epi32( mm512_xor3( M[m], _mm512_srli_epi32( xh, sl ), \
                                 _mm512_slli_epi32( qt[a], sr ) ), \
                     mm512_xor3( xl, qt[b], qt[c] ) )

#define DH2L( m, rl, sl, h, a, b, c ) \
   _mm512_add_epi32( _mm512_add_epi32( \
      mm512_rol_32( dH[h], rl ), \
      _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
      _mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
                        _mm512_xor_si512( qt[b], qt[c] ) ) );
      mm512_rol_32( dH[h], rl ), \
      mm512_xor3( xh, qt[a], M[m] ) ), \
      mm512_xor3( _mm512_slli_epi32( xl, sl ), qt[b], qt[c] ) )

#define DH2R( m, rl, sr, h, a, b, c ) \
   _mm512_add_epi32( _mm512_add_epi32( \
      mm512_rol_32( dH[h], rl ), \
      _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
      _mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
                        _mm512_xor_si512( qt[b], qt[c] ) ) );
      mm512_rol_32( dH[h], rl ), \
      mm512_xor3( xh, qt[a], M[m] ) ), \
      mm512_xor3( _mm512_srli_epi32( xl, sr ), qt[b], qt[c] ) )

   dH[ 0] = DH1L(  0,  5,  5, 16, 24,  0 );
   dH[ 1] = DH1R(  1,  7,  8, 17, 25,  1 );
@@ -1285,40 +1285,35 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
   qt[30] = expand2b8( qt, M, H, 30 );
   qt[31] = expand2b8( qt, M, H, 31 );

   xl = _mm512_xor_si512(
           mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
           mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
   xh = _mm512_xor_si512( xl, _mm512_xor_si512(
           mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
           mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
   xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
                    mm512_xor3( qt[19], qt[20], qt[21] ),
                    _mm512_xor_si512( qt[22], qt[23] ) );

   xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
                    mm512_xor3( qt[26], qt[27], qt[28] ),
                    mm512_xor3( qt[29], qt[30], qt[31] ) );

#define DH1L( m, sl, sr, a, b, c ) \
   _mm512_add_epi64( \
      _mm512_xor_si512( M[m], \
         _mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
                           _mm512_srli_epi64( qt[a], sr ) ) ), \
      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
   _mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \
                                 _mm512_srli_epi64( qt[a], sr ) ), \
                     mm512_xor3( xl, qt[b], qt[c] ) )

#define DH1R( m, sl, sr, a, b, c ) \
   _mm512_add_epi64( \
      _mm512_xor_si512( M[m], \
         _mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \
                           _mm512_slli_epi64( qt[a], sr ) ) ), \
      _mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
   _mm512_add_epi64( mm512_xor3( M[m], _mm512_srli_epi64( xh, sl ), \
                                 _mm512_slli_epi64( qt[a], sr ) ), \
                     mm512_xor3( xl, qt[b], qt[c] ) )

#define DH2L( m, rl, sl, h, a, b, c ) \
   _mm512_add_epi64( _mm512_add_epi64( \
      mm512_rol_64( dH[h], rl ), \
      _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
      _mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
                        _mm512_xor_si512( qt[b], qt[c] ) ) );

      mm512_rol_64( dH[h], rl ), \
      mm512_xor3( xh, qt[a], M[m] ) ), \
      mm512_xor3( _mm512_slli_epi64( xl, sl ), qt[b], qt[c] ) )

#define DH2R( m, rl, sr, h, a, b, c ) \
   _mm512_add_epi64( _mm512_add_epi64( \
      mm512_rol_64( dH[h], rl ), \
      _mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
      _mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \
                        _mm512_xor_si512( qt[b], qt[c] ) ) );
      mm512_rol_64( dH[h], rl ), \
      mm512_xor3( xh, qt[a], M[m] ) ), \
      mm512_xor3( _mm512_srli_epi64( xl, sr ), qt[b], qt[c] ) )

   dH[ 0] = DH1L(  0,  5,  5, 16, 24,  0 );
@@ -53,6 +53,20 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x000
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};

#define ECHO_SUBBYTES4(state, j) \
   state[0][j] = _mm_aesenc_si128(state[0][j], k1);\
   k1 = _mm_add_epi32(k1, M128(const1));\
   state[1][j] = _mm_aesenc_si128(state[1][j], k1);\
   k1 = _mm_add_epi32(k1, M128(const1));\
   state[2][j] = _mm_aesenc_si128(state[2][j], k1);\
   k1 = _mm_add_epi32(k1, M128(const1));\
   state[3][j] = _mm_aesenc_si128(state[3][j], k1);\
   k1 = _mm_add_epi32(k1, M128(const1));\
   state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \
   state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \
   state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \
   state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero )

#define ECHO_SUBBYTES(state, i, j) \
   state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
   k1 = _mm_add_epi32(k1, M128(const1));\
@@ -73,7 +87,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
   t1 = _mm_and_si128(t1, M128(lsbmask));\
   t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
   s2 = _mm_xor_si128(s2, t2);\
   state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
   state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
   state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
   state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
   state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
@@ -83,7 +97,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
   t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
   s2 = _mm_xor_si128(s2, t2);\
   state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
   state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
   state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
   state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
   state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
   s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
@@ -93,10 +107,29 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
   s2 = _mm_xor_si128(s2, t2);\
   state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
   state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
   state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
   state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
   state2[3][j] = _mm_xor_si128(state2[3][j], s2)

#define ECHO_ROUND_UNROLL2 \
   ECHO_SUBBYTES4(_state, 0);\
   ECHO_SUBBYTES4(_state, 1);\
   ECHO_SUBBYTES4(_state, 2);\
   ECHO_SUBBYTES4(_state, 3);\
   ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
   ECHO_SUBBYTES4(_state2, 0);\
   ECHO_SUBBYTES4(_state2, 1);\
   ECHO_SUBBYTES4(_state2, 2);\
   ECHO_SUBBYTES4(_state2, 3);\
   ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)

/*
#define ECHO_ROUND_UNROLL2 \
   ECHO_SUBBYTES(_state, 0, 0);\
   ECHO_SUBBYTES(_state, 1, 0);\
@@ -138,7 +171,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
   ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
*/

#define SAVESTATE(dst, src)\
@@ -13,12 +13,19 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

//#define mul2mask m512_const2_64( 0, 0x00001b00 )
//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
//_mm512_set4_epi32( 0x00001b00, 0, 0, 0 )

//#define lsbmask m512_const1_32( 0x01010101 )
#define ECHO_SUBBYTES4(state, j) \
   state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
   k1 = _mm512_add_epi32( k1, one ); \
   state[1][j] = _mm512_aesenc_epi128( state[1][j], k1 ); \
   k1 = _mm512_add_epi32( k1, one ); \
   state[2][j] = _mm512_aesenc_epi128( state[2][j], k1 ); \
   k1 = _mm512_add_epi32( k1, one ); \
   state[3][j] = _mm512_aesenc_epi128( state[3][j], k1 ); \
   k1 = _mm512_add_epi32( k1, one ); \
   state[0][j] = _mm512_aesenc_epi128( state[0][j], m512_zero ); \
   state[1][j] = _mm512_aesenc_epi128( state[1][j], m512_zero ); \
   state[2][j] = _mm512_aesenc_epi128( state[2][j], m512_zero ); \
   state[3][j] = _mm512_aesenc_epi128( state[3][j], m512_zero )
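This AVX512 variant of ECHO_SUBBYTES4 depends on VAES: _mm512_aesenc_epi128 applies one AES round to each of the four 128-bit lanes of a ZMM register, so four ECHO columns advance per instruction. A standalone illustration:

#include <immintrin.h>

// requires VAES + AVX512F: one AES round on four independent
// 128-bit states packed into a single 512-bit register
static inline __m512i aes_round_x4( __m512i states, __m512i round_keys )
{
   return _mm512_aesenc_epi128( states, round_keys );
}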
#define ECHO_SUBBYTES( state, i, j ) \
   state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
@@ -44,8 +51,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
   t1 = _mm512_and_si512( t1, lsbmask ); \
   t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
   s2 = _mm512_xor_si512( s2, t2 );\
   state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
                                        _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
   state2[ 0 ][ j ] = mm512_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
   state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
   state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
@@ -55,8 +61,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
   t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
   s2 = _mm512_xor_si512( s2, t2 ); \
   state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
   state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
                                        _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
   state2[ 1 ][ j ] = mm512_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
   state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
   s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
@@ -66,11 +71,29 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
   s2 = _mm512_xor_si512( s2, t2 ); \
   state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
   state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
                                        _mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
   state2[ 2 ][ j ] = mm512_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
   state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
} while(0)

#define ECHO_ROUND_UNROLL2 \
   ECHO_SUBBYTES4(_state, 0);\
   ECHO_SUBBYTES4(_state, 1);\
   ECHO_SUBBYTES4(_state, 2);\
   ECHO_SUBBYTES4(_state, 3);\
   ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
   ECHO_SUBBYTES4(_state2, 0);\
   ECHO_SUBBYTES4(_state2, 1);\
   ECHO_SUBBYTES4(_state2, 2);\
   ECHO_SUBBYTES4(_state2, 3);\
   ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)

/*
#define ECHO_ROUND_UNROLL2 \
   ECHO_SUBBYTES(_state, 0, 0);\
   ECHO_SUBBYTES(_state, 1, 0);\
@@ -112,6 +135,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
   ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
*/

#define SAVESTATE(dst, src)\
   dst[0][0] = src[0][0];\
@@ -405,6 +429,20 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,

#define lsbmask_2way m256_const1_32( 0x01010101 )

#define ECHO_SUBBYTES4_2WAY( state, j ) \
   state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \
   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
   state[1][j] = _mm256_aesenc_epi128( state[1][j], k1 ); \
   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
   state[2][j] = _mm256_aesenc_epi128( state[2][j], k1 ); \
   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
   state[3][j] = _mm256_aesenc_epi128( state[3][j], k1 ); \
   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
   state[0][j] = _mm256_aesenc_epi128( state[0][j], m256_zero ); \
   state[1][j] = _mm256_aesenc_epi128( state[1][j], m256_zero ); \
   state[2][j] = _mm256_aesenc_epi128( state[2][j], m256_zero ); \
   state[3][j] = _mm256_aesenc_epi128( state[3][j], m256_zero )

#define ECHO_SUBBYTES_2WAY( state, i, j ) \
   state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
   k1 = _mm256_add_epi32( k1, m256_one_128 ); \
@@ -456,6 +494,25 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
   state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
} while(0)

#define ECHO_ROUND_UNROLL2_2WAY \
   ECHO_SUBBYTES4_2WAY(_state, 0);\
   ECHO_SUBBYTES4_2WAY(_state, 1);\
   ECHO_SUBBYTES4_2WAY(_state, 2);\
   ECHO_SUBBYTES4_2WAY(_state, 3);\
   ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\
   ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\
   ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\
   ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\
   ECHO_SUBBYTES4_2WAY(_state2, 0);\
   ECHO_SUBBYTES4_2WAY(_state2, 1);\
   ECHO_SUBBYTES4_2WAY(_state2, 2);\
   ECHO_SUBBYTES4_2WAY(_state2, 3);\
   ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\
   ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
   ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
   ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)

/*
#define ECHO_ROUND_UNROLL2_2WAY \
   ECHO_SUBBYTES_2WAY(_state, 0, 0);\
   ECHO_SUBBYTES_2WAY(_state, 1, 0);\
@@ -497,6 +554,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
   ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
   ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
   ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
*/

#define SAVESTATE_2WAY(dst, src)\
   dst[0][0] = src[0][0];\
@@ -124,7 +124,16 @@ MYALIGN const unsigned int _IV512[] = {
   t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
   s7 = _mm_xor_si128(s7, t1)

#define PRESUPERMIX(t0, t1, t2, t3, t4)\
   t2 = t0;\
   t3 = _mm_add_epi8(t0, t0);\
   t4 = _mm_add_epi8(t3, t3);\
   t1 = _mm_srli_epi16(t0, 6);\
   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))

/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\
   s1 = x;\
   s2 = _mm_add_epi8(x, x);\
@@ -133,37 +142,59 @@ MYALIGN const unsigned int _IV512[] = {
   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
   s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
   x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/

#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\
#define SUBSTITUTE(r0, _t2 )\
   _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
   _t2 = _mm_aesenclast_si128( _t2, m128_zero )
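The slimmed-down SUBSTITUTE needs only one temporary because AESENCLAST with an all-zero round key performs exactly ShiftRows followed by SubBytes; shuffling with the inverse ShiftRows mask first cancels the row shift, leaving a pure S-box lookup. A standalone sketch (the inline mask is the standard inverse-ShiftRows permutation, standing in for the repo's _inv_shift_rows constant):

#include <immintrin.h>

// apply the AES S-box to all 16 bytes of x (requires AES-NI + SSSE3)
static inline __m128i aes_sbox_16( __m128i x )
{
   const __m128i inv_shift_rows = _mm_set_epi8(
       3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0 );
   x = _mm_shuffle_epi8( x, inv_shift_rows );             // pre-undo ShiftRows
   return _mm_aesenclast_si128( x, _mm_setzero_si128() ); // SubBytes only
}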
#define SUPERMIX(t0, t1, t2, t3, t4)\
   t2 = t0;\
   t3 = _mm_add_epi8(t0, t0);\
   t4 = _mm_add_epi8(t3, t3);\
   t1 = _mm_srli_epi16(t0, 6);\
   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
   t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
   t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
   t4 = _mm_xor_si128(t4, t1);\
   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
   t4 = _mm_xor_si128(t4, t1);\
   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
   t2 = mm128_xor3(t2, t3, t0 );\
   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
   t4 = mm128_xor3( t4, t1, t2 ); \
   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
   t4 = mm128_xor3( t4, t2, t1 ); \
   t0 = _mm_xor_si128(t0, t3);\
   t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));

/*
#define SUPERMIX(t0, t1, t2, t3, t4)\
   PRESUPERMIX(t0, t1, t2, t3, t4);\
   POSTSUPERMIX(t0, t1, t2, t3, t4)
*/
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
   t1 = t2;\
   t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\
   t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
   t4 = t1;\
   t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
   t4 = _mm_xor_si128(t4, t1);\
   t1 = t4;\
   t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\
   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
   t4 = _mm_xor_si128(t4, t1);\
   t1 = t2;\
   t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\
   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
   t4 = _mm_xor_si128(t4, t1);\
   t2 = _mm_xor_si128(t2, t3);\
   t2 = _mm_xor_si128(t2, t0);\
   t2 = mm128_xor3(t2, t3, t0 );\
   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
   t4 = _mm_xor_si128(t4, t2);\
   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
   t4 = _mm_xor_si128(t4, t2);\
   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
   t1 = t0;\
   t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\
   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
   t4 = _mm_xor_si128(t4, t1);\
   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
   t0 = _mm_xor_si128(t0, t3);\
@@ -171,59 +202,55 @@ MYALIGN const unsigned int _IV512[] = {
   t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
   t4 = _mm_xor_si128(t4, t0)

#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
   CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
   PACK_S0(r1c, r1a, _t0);\
   SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
   SUBSTITUTE(r1c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
   _t0 = _mm_shuffle_epi32(r1c, 0x39);\
   r2c = _mm_xor_si128(r2c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
   r2d = _mm_xor_si128(r2d, _t0);\
   UNPACK_S0(r1c, r1a, _t3);\
   SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
   SUBSTITUTE(r2c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
   _t0 = _mm_shuffle_epi32(r2c, 0x39);\
   r3c = _mm_xor_si128(r3c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
   r3d = _mm_xor_si128(r3d, _t0);\
   UNPACK_S0(r2c, r2a, _t3);\
   SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
   SUBSTITUTE(r3c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
   UNPACK_S0(r3c, r3a, _t3)

#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
   CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
   PACK_S0(r1c, r1a, _t0);\
   SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
   SUBSTITUTE( r1c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
   _t0 = _mm_shuffle_epi32(r1c, 0x39);\
   r2c = _mm_xor_si128(r2c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
   r2d = _mm_xor_si128(r2d, _t0);\
   UNPACK_S0(r1c, r1a, _t3);\
   SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
   SUBSTITUTE(r2c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
   _t0 = _mm_shuffle_epi32(r2c, 0x39);\
   r3c = _mm_xor_si128(r3c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
   r3d = _mm_xor_si128(r3d, _t0);\
   UNPACK_S0(r2c, r2a, _t3);\
   SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
   SUBSTITUTE( r3c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
   _t0 = _mm_shuffle_epi32(r3c, 0x39);\
   r4c = _mm_xor_si128(r4c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
   r4d = _mm_xor_si128(r4d, _t0);\
   UNPACK_S0(r3c, r3a, _t3);\
   SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\
   SUBSTITUTE( r4c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
   UNPACK_S0(r4c, r4a, _t3)
#define LOADCOLUMN(x, s, a)\
   block[0] = col[(base + a + 0) % s];\
   block[1] = col[(base + a + 1) % s];\
@@ -247,14 +274,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
      case 1:
         TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4],
                 ctx->state[5], ctx->state[ 6], ctx->state[8],
                 ctx->state[9], ctx->state[10], _t0, _t1, _t2 );
                 ctx->state[9], ctx->state[10], _t0, _t1, _t2 );

         SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7],
         SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7],
                        ctx->state[1], ctx->state[7], ctx->state[8],
                        ctx->state[6], ctx->state[0], ctx->state[6],
                        ctx->state[7], ctx->state[5], ctx->state[11],
                        ctx->state[5], ctx->state[6], ctx->state[4],
                        ctx->state[10] );
                        ctx->state[6], ctx->state[0], ctx->state[6],
                        ctx->state[7], ctx->state[5], ctx->state[11],
                        ctx->state[5], ctx->state[6], ctx->state[4],
                        ctx->state[10] );
         ctx->base++;
         pmsg += 4;
         uBlockCount--;
@@ -263,14 +290,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
      case 2:
         TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0],
                 ctx->state[ 1], ctx->state[2], ctx->state[4],
                 ctx->state[ 5], ctx->state[6], _t0, _t1, _t2);
                 ctx->state[ 5], ctx->state[6], _t0, _t1, _t2);

         SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3],
                        ctx->state[9], ctx->state[3], ctx->state[4],
                        ctx->state[2], ctx->state[8], ctx->state[2],
                        ctx->state[3], ctx->state[1], ctx->state[7],
                        ctx->state[1], ctx->state[2], ctx->state[0],
                        ctx->state[6]);
                        ctx->state[2], ctx->state[8], ctx->state[2],
                        ctx->state[3], ctx->state[1], ctx->state[7],
                        ctx->state[1], ctx->state[2], ctx->state[0],
                        ctx->state[6]);

         ctx->base = 0;
         pmsg += 4;
@@ -278,44 +305,42 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
         break;
   }
   while( uBlockCount > 0 )
   {
      TIX512( pmsg, ctx->state[ 7], ctx->state[2], ctx->state[8], ctx->state[9],
              ctx->state[10], ctx->state[0], ctx->state[1], ctx->state[2],
              _t0, _t1, _t2 );
      SUBROUND512_4( ctx->state[0], ctx->state[1], ctx->state[11],
                     ctx->state[5], ctx->state[11], ctx->state[0],
                     ctx->state[10], ctx->state[4], ctx->state[10],
                     ctx->state[11], ctx->state[9], ctx->state[3],
                     ctx->state[9], ctx->state[10], ctx->state[8],
                     ctx->state[2] );
      TIX512( pmsg, ctx->state[ 7],ctx->state[2],ctx->state[8],ctx->state[9],
              ctx->state[10],ctx->state[0],ctx->state[1],ctx->state[2],
              _t0, _t1, _t2 );
      SUBROUND512_4( ctx->state[0], ctx->state[1],ctx->state[11],ctx->state[5],
                     ctx->state[11],ctx->state[0],ctx->state[10],ctx->state[4],
                     ctx->state[10],ctx->state[11],ctx->state[9],ctx->state[3],
                     ctx->state[9],ctx->state[10],ctx->state[8],ctx->state[2] );

      ctx->base++;
      pmsg += 4;
      uBlockCount--;
      if( uBlockCount == 0 ) break;

      TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5],
              ctx->state[6], ctx->state[8], ctx->state[9], ctx->state[10],
              _t0, _t1, _t2 );
      TIX512( pmsg, ctx->state[3],ctx->state[10],ctx->state[4],ctx->state[5],
              ctx->state[6],ctx->state[8], ctx->state[9],ctx->state[10],
              _t0, _t1, _t2 );

      SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1],
                     ctx->state[7], ctx->state[8], ctx->state[6], ctx->state[0],
                     ctx->state[6], ctx->state[7], ctx->state[5], ctx->state[11],
                     ctx->state[5], ctx->state[6], ctx->state[4], ctx->state[10]);
      SUBROUND512_4( ctx->state[8],ctx->state[9],ctx->state[7],ctx->state[1],
                     ctx->state[7],ctx->state[8],ctx->state[6],ctx->state[0],
                     ctx->state[6],ctx->state[7],ctx->state[5],ctx->state[11],
                     ctx->state[5],ctx->state[6],ctx->state[4],ctx->state[10] );

      ctx->base++;
      pmsg += 4;
      uBlockCount--;
      if( uBlockCount == 0 ) break;

      TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[1],
              ctx->state[2], ctx->state[4], ctx->state[5], ctx->state[6],
              _t0, _t1, _t2);
      SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9],
                     ctx->state[3], ctx->state[4], ctx->state[2], ctx->state[8],
                     ctx->state[2], ctx->state[3], ctx->state[1], ctx->state[7],
                     ctx->state[1], ctx->state[2], ctx->state[0], ctx->state[6]);
      TIX512( pmsg, ctx->state[11],ctx->state[6],ctx->state[0],ctx->state[1],
              ctx->state[2], ctx->state[4],ctx->state[5],ctx->state[6],
              _t0, _t1, _t2);
      SUBROUND512_4( ctx->state[4],ctx->state[5],ctx->state[3],ctx->state[9],
                     ctx->state[3],ctx->state[4],ctx->state[2],ctx->state[8],
                     ctx->state[2],ctx->state[3],ctx->state[1],ctx->state[7],
                     ctx->state[1],ctx->state[2],ctx->state[0],ctx->state[6]);

      ctx->base = 0;
      pmsg += 4;
@@ -326,8 +351,8 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u

void Final512(hashState_fugue *ctx, BitSequence *hashval)
{
   unsigned int block[4] __attribute__ ((aligned (32)));
   unsigned int col[36] __attribute__ ((aligned (16)));
   unsigned int block[4] __attribute__ ((aligned (32)));
   unsigned int col[36] __attribute__ ((aligned (16)));
   unsigned int i, base;
   __m128i r0, _t0, _t1, _t2, _t3;

@@ -357,7 +382,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)

      // SMIX
      LOADCOLUMN(r0, 36, 0);
      SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
   }
@@ -375,7 +400,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)

      // SMIX
      LOADCOLUMN(r0, 36, 0);
      SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);

@@ -390,7 +415,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)

      // SMIX
      LOADCOLUMN(r0, 36, 0);
      SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);

@@ -405,7 +430,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)

      // SMIX
      LOADCOLUMN(r0, 36, 0);
      SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);

@@ -420,7 +445,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)

      // SMIX
      LOADCOLUMN(r0, 36, 0);
      SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
   }
@@ -67,11 +67,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm_xor_si128(j, j);\
j = _mm_cmpgt_epi8(j, i);\
j = _mm_cmpgt_epi8( m128_zero, i);\
i = _mm_add_epi8(i, i);\
j = _mm_and_si128(j, k);\
i = _mm_xor_si128(i, j);\
i = mm128_xorand(i, j, k );\
}

/**/
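What MUL2 computes is a doubling of each byte in GF(2^8) with the AES/Groestl
reduction polynomial. A scalar sketch for reference (not repo code):

/* Double x in GF(2^8) mod x^8 + x^4 + x^3 + x + 1. The vector macro above
   derives the conditional 0x1b mask from a signed compare against zero
   (all-ones exactly where a byte's top bit is set), instead of first
   zeroing j; the VL build then folds the and+xor pair into a single
   vpternlog via mm128_xorand. */
static inline unsigned char gf256_mul2( unsigned char x )
{
    return (unsigned char)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0 ) );
}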
@@ -93,6 +91,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */

#if defined(__AVX512VL__)

#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm_xor_si128(a0, a1);\
b0 = a2;\
a1 = _mm_xor_si128(a1, a2);\
b1 = a3;\
TEMP2 = _mm_xor_si128(a2, a3);\
b2 = a4;\
a3 = _mm_xor_si128(a3, a4);\
b3 = a5;\
a4 = _mm_xor_si128(a4, a5);\
b4 = a6;\
a5 = _mm_xor_si128(a5, a6);\
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm128_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm128_xor3( b1, a5, a7 );\
b2 = mm128_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b3 = mm128_xor3( b3, a7, a1 ); \
b1 = a1;\
b6 = mm128_xor3( b6, a4, TEMP2 ); \
b4 = mm128_xor3( b4, a0, TEMP2 ); \
b7 = mm128_xor3( b7, a5, a3 ); \
b5 = mm128_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(TEMP2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm_xor_si128(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm_xor_si128(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm_xor_si128(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm_xor_si128(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm_xor_si128(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm_xor_si128(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm_xor_si128(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm_xor_si128(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm_xor_si128(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm_xor_si128(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm_xor_si128(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm_xor_si128(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm_xor_si128(b0, a3);\
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/

#else

#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
@@ -189,6 +277,8 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/

#endif

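The mm128_xor3 helper used in the VL path above is presumably a thin wrapper
over the AVX512VL ternary-logic instruction; a minimal sketch of such a
definition (an assumption, the helper itself is not shown in this diff):

#include <immintrin.h>

/* imm8 0x96 is the truth table of a ^ b ^ c, so a 3-input XOR
   costs one vpternlog instead of two chained vpxor. */
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )

With it, each y_i accumulation in MixBytes becomes a single instruction.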
/* one round
* a0-a7 = input rows
@@ -58,11 +58,9 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm_xor_si128(j, j);\
j = _mm_cmpgt_epi8(j, i);\
j = _mm_cmpgt_epi8( m128_zero, i);\
i = _mm_add_epi8(i, i);\
j = _mm_and_si128(j, k);\
i = _mm_xor_si128(i, j);\
i = mm128_xorand(i, j, k );\
}

/* Yet another implementation of MixBytes.
@@ -82,6 +80,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */

#if defined(__AVX512VL__)

#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm_xor_si128(a0, a1);\
b0 = a2;\
a1 = _mm_xor_si128(a1, a2);\
b1 = a3;\
TEMP2 = _mm_xor_si128(a2, a3);\
b2 = a4;\
a3 = _mm_xor_si128(a3, a4);\
b3 = a5;\
a4 = _mm_xor_si128(a4, a5);\
b4 = a6;\
a5 = _mm_xor_si128(a5, a6);\
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm128_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm128_xor3( b1, a5, a7 );\
b2 = mm128_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b3 = mm128_xor3( b3, a7, a1 ); \
b1 = a1;\
b6 = mm128_xor3( b6, a4, TEMP2 ); \
b4 = mm128_xor3( b4, a0, TEMP2 ); \
b7 = mm128_xor3( b7, a5, a3 ); \
b5 = mm128_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(TEMP2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm_xor_si128(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm_xor_si128(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm_xor_si128(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm_xor_si128(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm_xor_si128(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm_xor_si128(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm_xor_si128(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm_xor_si128(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm_xor_si128(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm_xor_si128(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm_xor_si128(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm_xor_si128(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm_xor_si128(b0, a3);\
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/

#else

#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
@@ -178,6 +266,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/

#endif

/* one round
* i = round number
* a0-a7 = input rows
@@ -96,11 +96,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm512_xor_si512(j, j);\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
i = _mm512_add_epi8(i, i);\
j = _mm512_and_si512(j, k);\
i = _mm512_xor_si512(i, j);\
i = mm512_xorand( i, j, k );\
}

/* Yet another implementation of MixBytes.
@@ -120,6 +118,95 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */

#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
b0, b1, b2, b3, b4, b5, b6, b7) { \
/* t_i = a_i + a_{i+1} */\
b6 = a0; \
b7 = a1; \
a0 = _mm512_xor_si512( a0, a1 ); \
b0 = a2; \
a1 = _mm512_xor_si512( a1, a2 ); \
b1 = a3; \
TEMP2 = _mm512_xor_si512( a2, a3 ); \
b2 = a4; \
a3 = _mm512_xor_si512( a3, a4 ); \
b3 = a5; \
a4 = _mm512_xor_si512( a4, a5 );\
b4 = a6; \
a5 = _mm512_xor_si512( a5, a6 ); \
b5 = a7; \
a6 = _mm512_xor_si512( a6, a7 ); \
a7 = _mm512_xor_si512( a7, b6 ); \
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
TEMP0 = mm512_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP1 = mm512_xor3( b1, a5, a7 ); \
b2 = mm512_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0; \
b3 = mm512_xor3( b3, a7, a1 ); \
b1 = a1; \
b6 = mm512_xor3( b6, a4, TEMP2 ); \
b4 = mm512_xor3( b4, a0, TEMP2 ); \
b7 = mm512_xor3( b7, a5, a3 ); \
b5 = mm512_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm512_xor_si512( a0, a3 ); \
a1 = _mm512_xor_si512( a1, a4 ); \
a2 = _mm512_xor_si512( TEMP2, a5 ); \
a3 = _mm512_xor_si512( a3, a6 ); \
a4 = _mm512_xor_si512( a4, a7 ); \
a5 = _mm512_xor_si512( a5, b0 ); \
a6 = _mm512_xor_si512( a6, b1 ); \
a7 = _mm512_xor_si512( a7, TEMP2 ); \
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
MUL2( a0, b0, b1 ); \
a0 = _mm512_xor_si512( a0, TEMP0 ); \
MUL2( a1, b0, b1 ); \
a1 = _mm512_xor_si512( a1, TEMP1 ); \
MUL2( a2, b0, b1 ); \
a2 = _mm512_xor_si512( a2, b2 ); \
MUL2( a3, b0, b1 ); \
a3 = _mm512_xor_si512( a3, b3 ); \
MUL2( a4, b0, b1 ); \
a4 = _mm512_xor_si512( a4, b4 ); \
MUL2( a5, b0, b1 ); \
a5 = _mm512_xor_si512( a5, b5 ); \
MUL2( a6, b0, b1 ); \
a6 = _mm512_xor_si512( a6, b6 ); \
MUL2( a7, b0, b1 ); \
a7 = _mm512_xor_si512( a7, b7 ); \
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2( a0, b0, b1 ); \
b5 = _mm512_xor_si512( b5, a0 ); \
MUL2( a1, b0, b1 ); \
b6 = _mm512_xor_si512( b6, a1 ); \
MUL2( a2, b0, b1 ); \
b7 = _mm512_xor_si512( b7, a2 ); \
MUL2( a5, b0, b1 ); \
b2 = _mm512_xor_si512( b2, a5 ); \
MUL2( a6, b0, b1 ); \
b3 = _mm512_xor_si512( b3, a6 ); \
MUL2( a7, b0, b1 ); \
b4 = _mm512_xor_si512( b4, a7 ); \
MUL2( a3, b0, b1 ); \
MUL2( a4, b0, b1 ); \
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm512_xor_si512( b0, a3 ); \
b1 = _mm512_xor_si512( b1, a4 ); \
}/*MixBytes*/


#if 0
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
@@ -215,7 +302,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
b0 = _mm512_xor_si512(b0, a3);\
b1 = _mm512_xor_si512(b1, a4);\
}/*MixBytes*/

#endif

#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
@@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm512_xor_si512(j, j);\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
i = _mm512_add_epi8(i, i);\
j = _mm512_and_si512(j, k);\
i = _mm512_xor_si512(i, j);\
i = mm512_xorand( i, j, k );\
}

/**/
@@ -130,100 +128,90 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
b0, b1, b2, b3, b4, b5, b6, b7) { \
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm512_xor_si512(a0, a1);\
b0 = a2;\
a1 = _mm512_xor_si512(a1, a2);\
b1 = a3;\
a2 = _mm512_xor_si512(a2, a3);\
b2 = a4;\
a3 = _mm512_xor_si512(a3, a4);\
b3 = a5;\
a4 = _mm512_xor_si512(a4, a5);\
b4 = a6;\
a5 = _mm512_xor_si512(a5, a6);\
b5 = a7;\
a6 = _mm512_xor_si512(a6, a7);\
a7 = _mm512_xor_si512(a7, b6);\
b6 = a0; \
b7 = a1; \
a0 = _mm512_xor_si512( a0, a1 ); \
b0 = a2; \
a1 = _mm512_xor_si512( a1, a2 ); \
b1 = a3; \
TEMP2 = _mm512_xor_si512( a2, a3 ); \
b2 = a4; \
a3 = _mm512_xor_si512( a3, a4 ); \
b3 = a5; \
a4 = _mm512_xor_si512( a4, a5 );\
b4 = a6; \
a5 = _mm512_xor_si512( a5, a6 ); \
b5 = a7; \
a6 = _mm512_xor_si512( a6, a7 ); \
a7 = _mm512_xor_si512( a7, b6 ); \
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm512_xor_si512(b0, a4);\
b6 = _mm512_xor_si512(b6, a4);\
b1 = _mm512_xor_si512(b1, a5);\
b7 = _mm512_xor_si512(b7, a5);\
b2 = _mm512_xor_si512(b2, a6);\
b0 = _mm512_xor_si512(b0, a6);\
TEMP0 = mm512_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm512_xor_si512(b3, a7);\
b1 = _mm512_xor_si512(b1, a7);\
TEMP1 = b1;\
b4 = _mm512_xor_si512(b4, a0);\
b2 = _mm512_xor_si512(b2, a0);\
TEMP1 = mm512_xor3( b1, a5, a7 ); \
b2 = mm512_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm512_xor_si512(b5, a1);\
b3 = _mm512_xor_si512(b3, a1);\
b1 = a1;\
b6 = _mm512_xor_si512(b6, a2);\
b4 = _mm512_xor_si512(b4, a2);\
TEMP2 = a2;\
b7 = _mm512_xor_si512(b7, a3);\
b5 = _mm512_xor_si512(b5, a3);\
b0 = a0; \
b3 = mm512_xor3( b3, a7, a1 ); \
b1 = a1; \
b6 = mm512_xor3( b6, a4, TEMP2 ); \
b4 = mm512_xor3( b4, a0, TEMP2 ); \
b7 = mm512_xor3( b7, a5, a3 ); \
b5 = mm512_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm512_xor_si512(a0, a3);\
a1 = _mm512_xor_si512(a1, a4);\
a2 = _mm512_xor_si512(a2, a5);\
a3 = _mm512_xor_si512(a3, a6);\
a4 = _mm512_xor_si512(a4, a7);\
a5 = _mm512_xor_si512(a5, b0);\
a6 = _mm512_xor_si512(a6, b1);\
a7 = _mm512_xor_si512(a7, TEMP2);\
a0 = _mm512_xor_si512( a0, a3 ); \
a1 = _mm512_xor_si512( a1, a4 ); \
a2 = _mm512_xor_si512( TEMP2, a5 ); \
a3 = _mm512_xor_si512( a3, a6 ); \
a4 = _mm512_xor_si512( a4, a7 ); \
a5 = _mm512_xor_si512( a5, b0 ); \
a6 = _mm512_xor_si512( a6, b1 ); \
a7 = _mm512_xor_si512( a7, TEMP2 ); \
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm512_xor_si512(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm512_xor_si512(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm512_xor_si512(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm512_xor_si512(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm512_xor_si512(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm512_xor_si512(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm512_xor_si512(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm512_xor_si512(a7, b7);\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
MUL2( a0, b0, b1 ); \
a0 = _mm512_xor_si512( a0, TEMP0 ); \
MUL2( a1, b0, b1 ); \
a1 = _mm512_xor_si512( a1, TEMP1 ); \
MUL2( a2, b0, b1 ); \
a2 = _mm512_xor_si512( a2, b2 ); \
MUL2( a3, b0, b1 ); \
a3 = _mm512_xor_si512( a3, b3 ); \
MUL2( a4, b0, b1 ); \
a4 = _mm512_xor_si512( a4, b4 ); \
MUL2( a5, b0, b1 ); \
a5 = _mm512_xor_si512( a5, b5 ); \
MUL2( a6, b0, b1 ); \
a6 = _mm512_xor_si512( a6, b6 ); \
MUL2( a7, b0, b1 ); \
a7 = _mm512_xor_si512( a7, b7 ); \
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm512_xor_si512(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm512_xor_si512(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm512_xor_si512(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm512_xor_si512(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm512_xor_si512(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm512_xor_si512(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
MUL2( a0, b0, b1 ); \
b5 = _mm512_xor_si512( b5, a0 ); \
MUL2( a1, b0, b1 ); \
b6 = _mm512_xor_si512( b6, a1 ); \
MUL2( a2, b0, b1 ); \
b7 = _mm512_xor_si512( b7, a2 ); \
MUL2( a5, b0, b1 ); \
b2 = _mm512_xor_si512( b2, a5 ); \
MUL2( a6, b0, b1 ); \
b3 = _mm512_xor_si512( b3, a6 ); \
MUL2( a7, b0, b1 ); \
b4 = _mm512_xor_si512( b4, a7 ); \
MUL2( a3, b0, b1 ); \
MUL2( a4, b0, b1 ); \
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm512_xor_si512(b0, a3);\
b1 = _mm512_xor_si512(b1, a4);\
b0 = _mm512_xor_si512( b0, a3 ); \
b1 = _mm512_xor_si512( b1, a4 ); \
}/*MixBytes*/

/* one round
@@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY =
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2_2WAY(i, j, k){\
j = _mm256_xor_si256(j, j);\
j = _mm256_cmpgt_epi8(j, i );\
j = _mm256_cmpgt_epi8( m256_zero, i );\
i = _mm256_add_epi8(i, i);\
j = _mm256_and_si256(j, k);\
i = _mm256_xor_si256(i, j);\
i = mm256_xorand( i, j, k );\
}

#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
@@ -44,6 +44,7 @@ void myriad_8way_hash( void *output, const void *input )

rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(groestl512_4way_context) );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );

uint32_t hash0[20] __attribute__ ((aligned (64)));
@@ -58,8 +59,6 @@ void myriad_8way_hash( void *output, const void *input )
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );

#else

@@ -76,27 +75,27 @@ void myriad_8way_hash( void *output, const void *input )
hash4, hash5, hash6, hash7, input, 640 );

update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );

intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 512 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );

#endif

intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );

sha256_8way_update( &ctx.sha, vhash, 64 );
sha256_8way_close( &ctx.sha, output );
}
@@ -560,22 +560,14 @@ do { \
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
dm = mm512_negate_32( _mm512_or_si512( dm, \
_mm512_slli_epi64( dm, 32 ) ) ); \
m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
m512_const1_64( tp[0] ) ) ); \
m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
m512_const1_64( tp[1] ) ) ); \
m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
m512_const1_64( tp[2] ) ) ); \
m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
m512_const1_64( tp[3] ) ) ); \
m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
m512_const1_64( tp[4] ) ) ); \
m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
m512_const1_64( tp[5] ) ) ); \
m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
m512_const1_64( tp[6] ) ) ); \
m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
m512_const1_64( tp[7] ) ) ); \
m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \
m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \
m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \
m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \
m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \
tp += 8; \
db = _mm512_srli_epi64( db, 1 ); \
} \
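The rewritten loop leans on mm512_xorand; presumably (the helper itself is
not shown in this diff) it wraps AVX512's vpternlog, with imm8 0x78 being
the truth table of a ^ ( b & c ):

#define mm512_xorand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x78 )

Under that assumption, every and+xor pair in the old code becomes a single
instruction.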
@@ -585,20 +577,13 @@ do { \
do { \
__m512i t; \
t = a; \
a = _mm512_and_si512( a, c ); \
a = _mm512_xor_si512( a, d ); \
c = _mm512_xor_si512( c, b ); \
c = _mm512_xor_si512( c, a ); \
d = _mm512_or_si512( d, t ); \
d = _mm512_xor_si512( d, b ); \
a = mm512_xorand( d, a, c ); \
c = mm512_xor3( a, b, c ); \
b = mm512_xoror( b, d, t ); \
t = _mm512_xor_si512( t, c ); \
b = d; \
d = _mm512_or_si512( d, t ); \
d = _mm512_xor_si512( d, a ); \
a = _mm512_and_si512( a, b ); \
t = _mm512_xor_si512( t, a ); \
b = _mm512_xor_si512( b, d ); \
b = _mm512_xor_si512( b, t ); \
d = mm512_xoror( a, b, t ); \
t = mm512_xorand( t, a, b ); \
b = mm512_xor3( b, d, t ); \
a = c; \
c = b; \
b = d; \
@@ -609,14 +594,12 @@ do { \
do { \
a = mm512_rol_32( a, 13 ); \
c = mm512_rol_32( c, 3 ); \
b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
_mm512_slli_epi32( a, 3 ) ) ); \
b = mm512_xor3( a, b, c ); \
d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \
b = mm512_rol_32( b, 1 ); \
d = mm512_rol_32( d, 7 ); \
a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
_mm512_slli_epi32( b, 7 ) ) ); \
a = mm512_xor3( a, b, d ); \
c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \
a = mm512_rol_32( a, 5 ); \
c = mm512_rol_32( c, 22 ); \
} while (0)
@@ -522,50 +522,53 @@ do { \

// Haval-256 8 way 32 bit avx2

#if defined (__AVX512VL__)

// ( ~( a ^ b ) ) & c
#define mm256_andnotxor( a, b, c ) \
_mm256_ternarylogic_epi32( a, b, c, 0x82 )

#else

#define mm256_andnotxor( a, b, c ) \
_mm256_andnot_si256( _mm256_xor_si256( a, b ), c )

#endif
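The imm8 constants used with the ternary-logic intrinsics (0x82 above, and
0x96, 0x78, 0x1e elsewhere in this commit) are simply 8-bit truth tables:
bit (a<<2 | b<<1 | c) of the immediate is the function's value at (a,b,c).
A small self-contained C program (not repo code) that derives them:

#include <stdio.h>

/* Build the vpternlog imm8 for an arbitrary 3-input boolean function. */
static unsigned char tl_imm8( int (*f)(int, int, int) )
{
    unsigned char imm = 0;
    for ( int a = 0; a < 2; a++ )
    for ( int b = 0; b < 2; b++ )
    for ( int c = 0; c < 2; c++ )
        if ( f( a, b, c ) )
            imm |= 1 << ( (a << 2) | (b << 1) | c );
    return imm;
}

static int andnotxor( int a, int b, int c ) { return ( ~( a ^ b ) & c ) & 1; }

int main()
{
    printf( "0x%02x\n", tl_imm8( andnotxor ) );   /* prints 0x82 */
    return 0;
}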
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( x0, \
_mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
_mm256_and_si256( x3, x6 ) ) ) ) \
mm256_xor3( x0, mm256_andxor( x1, x0, x4 ), \
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
_mm256_and_si256( x3, x6 ) ) ) \

#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x2, \
_mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
_mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
_mm256_xor_si256( x6, x0 ) ) ) ), \
_mm256_xor_si256( \
_mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
_mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
mm256_xor3( mm256_andxor( x2, _mm256_andnot_si256( x3, x1 ), \
mm256_xor3( _mm256_and_si256( x4, x5 ), x6, x0 ) ), \
mm256_andxor( x4, x1, x5 ), \
mm256_xorand( x0, x3, x5 ) ) \

#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x3, \
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
_mm256_xor_si256( x6, x0 ) ) ), \
_mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ) ), x0 ) )
mm256_xor3( x0, \
_mm256_and_si256( x3, \
mm256_xor3( _mm256_and_si256( x1, x2 ), x6, x0 ) ), \
_mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ) ) )

#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_xor_si256( \
_mm256_and_si256( x3, \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
_mm256_or_si256( x4, x6 ) ), x5 ) ), \
_mm256_and_si256( x4, \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
_mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
_mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )

mm256_xor3( \
mm256_andxor( x3, x5, \
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
_mm256_or_si256( x4, x6 ) ) ), \
_mm256_and_si256( x4, \
mm256_xor3( x0, _mm256_andnot_si256( x2, x5 ), \
_mm256_xor_si256( x1, x6 ) ) ), \
mm256_xorand( x0, x2, x6 ) )
#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x0, \
mm256_not( _mm256_xor_si256( \
_mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ) ), \
_mm256_and_si256( x3, x6 ) ) )
mm256_andnotxor( mm256_and3( x1, x2, x3 ), x5, x0 ), \
mm256_xor3( _mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ), \
_mm256_and_si256( x3, x6 ) ) )

#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x1, x0, x3, x5, x6, x2, x4)
@@ -51,15 +51,15 @@ extern "C"{
do { \
__m512i cc = _mm512_set1_epi64( c ); \
x3 = mm512_not( x3 ); \
x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \
tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \
x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \
x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \
x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \
x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \
x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \
x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \
x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \
x0 = mm512_xorandnot( x0, x2, cc ); \
tmp = mm512_xorand( cc, x0, x1 ); \
x0 = mm512_xorand( x0, x2, x3 ); \
x3 = mm512_xorandnot( x3, x1, x2 ); \
x1 = mm512_xorand( x1, x0, x2 ); \
x2 = mm512_xorandnot( x2, x3, x0 ); \
x0 = mm512_xoror( x0, x1, x3 ); \
x3 = mm512_xorand( x3, x1, x2 ); \
x1 = mm512_xorand( x1, tmp, x0 ); \
x2 = _mm512_xor_si512( x2, tmp ); \
} while (0)
@@ -67,11 +67,11 @@ do { \
do { \
x4 = _mm512_xor_si512( x4, x1 ); \
x5 = _mm512_xor_si512( x5, x2 ); \
x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \
x6 = mm512_xor3( x6, x3, x0 ); \
x7 = _mm512_xor_si512( x7, x0 ); \
x0 = _mm512_xor_si512( x0, x5 ); \
x1 = _mm512_xor_si512( x1, x6 ); \
x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \
x2 = mm512_xor3( x2, x7, x4 ); \
x3 = _mm512_xor_si512( x3, x4 ); \
} while (0)
@@ -318,12 +318,12 @@ static const sph_u64 C[] = {
#define Wz_8W(x, c, n) \
do { \
__m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \
x ## h = _mm512_or_si512( _mm512_and_si512( \
_mm512_srli_epi64(x ## h, (n)), (c)), t ); \
x ## h = mm512_orand( t, _mm512_srli_epi64( x ## h, (n) ), (c) ); \
t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \
x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \
x ## l = mm512_orand( t, (x ## l >> (n)), (c) ); \
} while (0)


#define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
#define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
#define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
@@ -76,6 +76,9 @@ static const uint64_t RC[] = {
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))

#include "keccak-macros.c"

@@ -238,6 +241,8 @@ keccak512_8way_close(void *cc, void *dst)
#undef NOT64
#undef ROL64
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND

#endif // AVX512

@@ -255,6 +260,8 @@ keccak512_8way_close(void *cc, void *dst)
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))

#include "keccak-macros.c"

@@ -419,5 +426,7 @@ keccak512_4way_close(void *cc, void *dst)
#undef NOT64
#undef ROL64
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND

#endif // AVX2
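On the AVX512 side XOROR and XORAND map to single-instruction helpers; a
sketch of the assumed definitions (the imm8 values are the truth tables of
a ^ ( b | c ) and a ^ ( b & c ) respectively, derivable with the tl_imm8
program shown earlier):

#define mm512_xoror( a, b, c )  _mm512_ternarylogic_epi64( a, b, c, 0x1e )
#define mm512_xorand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x78 )

The AVX2 path keeps the explicit two-instruction forms shown above.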
@@ -110,20 +110,34 @@
#ifdef KHI_XO
#undef KHI_XO
#endif

#define KHI_XO(d, a, b, c) do { \
XOROR(d, a, b, c); \
} while (0)

/*
#define KHI_XO(d, a, b, c) do { \
DECL64(kt); \
OR64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
*/

#ifdef KHI_XA
#undef KHI_XA
#endif

#define KHI_XA(d, a, b, c) do { \
XORAND(d, a, b, c); \
} while (0)

/*
#define KHI_XA(d, a, b, c) do { \
DECL64(kt); \
AND64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
*/

#ifdef KHI
#undef KHI
@@ -97,6 +97,21 @@ do { \
MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT4W(*x, *(x+4), c0, c1);

#define SUBCRUMB4W(a0,a1,a2,a3,t)\
t = a0;\
a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512(a2,a3);\
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0);\
a1 = _mm512_or_si512(a1,a3);\
a3 = _mm512_xor_si512(a3,a2);\
t = _mm512_xor_si512(t,a1);\
a2 = _mm512_and_si512(a2,a1);\
a1 = mm512_xnor(a1,a0);\
a0 = t;

/*
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
t = _mm512_load_si512(&a0);\
a0 = _mm512_or_si512(a0,a1);\
@@ -115,7 +130,25 @@ do { \
a2 = _mm512_and_si512(a2,a1);\
a1 = _mm512_xor_si512(a1,a0);\
a0 = _mm512_load_si512(&t);
*/

#define MIXWORD4W(a,b,t1,t2)\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,2);\
t2 = _mm512_srli_epi32(a,30);\
a = mm512_xoror( b, t1, t2 ); \
t1 = _mm512_slli_epi32(b,14);\
t2 = _mm512_srli_epi32(b,18);\
b = _mm512_or_si512(t1,t2);\
b = mm512_xoror( a, t1, t2 ); \
t1 = _mm512_slli_epi32(a,10);\
t2 = _mm512_srli_epi32(a,22);\
a = mm512_xoror( b, t1, t2 ); \
t1 = _mm512_slli_epi32(b,1);\
t2 = _mm512_srli_epi32(b,31);\
b = _mm512_or_si512(t1,t2);

/*
#define MIXWORD4W(a,b,t1,t2)\
b = _mm512_xor_si512(a,b);\
t1 = _mm512_slli_epi32(a,2);\
@@ -133,6 +166,7 @@ do { \
t1 = _mm512_slli_epi32(b,1);\
t2 = _mm512_srli_epi32(b,31);\
b = _mm512_or_si512(t1,t2);
*/

#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm512_shuffle_epi32(a1,147);\
@@ -248,17 +282,10 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
__m512i tmp[2];
__m512i x[8];

t0 = chainv[0];
t1 = chainv[1];

t0 = _mm512_xor_si512( t0, chainv[2] );
t1 = _mm512_xor_si512( t1, chainv[3] );
t0 = _mm512_xor_si512( t0, chainv[4] );
t1 = _mm512_xor_si512( t1, chainv[5] );
t0 = _mm512_xor_si512( t0, chainv[6] );
t1 = _mm512_xor_si512( t1, chainv[7] );
t0 = _mm512_xor_si512( t0, chainv[8] );
t1 = _mm512_xor_si512( t1, chainv[9] );
t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] );
t0 = mm512_xor3( t0, chainv[6], chainv[8] );
t1 = mm512_xor3( t1, chainv[7], chainv[9] );

MULT24W( t0, t1 );

@@ -319,8 +346,8 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );

MULT24W( chainv[0], chainv[1] );
chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
chainv[0] = mm512_xor3( chainv[0], t0, msg0 );
chainv[1] = mm512_xor3( chainv[1], t1, msg1 );

MULT24W( msg0, msg1 );
chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
@@ -398,19 +425,11 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )

/*---- blank round with m=0 ----*/
rnd512_4way( state, zero );

t[0] = chainv[0];
t[1] = chainv[1];

t[0] = _mm512_xor_si512( t[0], chainv[2] );
t[1] = _mm512_xor_si512( t[1], chainv[3] );
t[0] = _mm512_xor_si512( t[0], chainv[4] );
t[1] = _mm512_xor_si512( t[1], chainv[5] );
t[0] = _mm512_xor_si512( t[0], chainv[6] );
t[1] = _mm512_xor_si512( t[1], chainv[7] );
t[0] = _mm512_xor_si512( t[0], chainv[8] );
t[1] = _mm512_xor_si512( t[1], chainv[9] );

t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] );
t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] );
t[0] = mm512_xor3( t[0], chainv[6], chainv[8] );
t[1] = mm512_xor3( t[1], chainv[7], chainv[9] );
t[0] = _mm512_shuffle_epi32( t[0], 27 );
t[1] = _mm512_shuffle_epi32( t[1], 27 );

@@ -676,8 +695,6 @@ do { \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)

// confirm pointer arithmetic
// ok but use array indexes
#define STEP_PART(x,c0,c1,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
@@ -688,23 +705,23 @@ do { \
ADD_CONSTANT(*x, *(x+4), c0, c1);

#define SUBCRUMB(a0,a1,a2,a3,t)\
t = _mm256_load_si256(&a0);\
t = a0;\
a0 = _mm256_or_si256(a0,a1);\
a2 = _mm256_xor_si256(a2,a3);\
a1 = _mm256_andnot_si256(a1, m256_neg1 );\
a1 = mm256_not( a1 );\
a0 = _mm256_xor_si256(a0,a3);\
a3 = _mm256_and_si256(a3,t);\
a1 = _mm256_xor_si256(a1,a3);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a0);\
a0 = _mm256_andnot_si256(a0, m256_neg1 );\
a0 = mm256_not( a0 );\
a2 = _mm256_xor_si256(a2,a1);\
a1 = _mm256_or_si256(a1,a3);\
t = _mm256_xor_si256(t,a1);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a1);\
a1 = _mm256_xor_si256(a1,a0);\
a0 = _mm256_load_si256(&t);\
a0 = t;\

#define MIXWORD(a,b,t1,t2)\
b = _mm256_xor_si256(a,b);\
@@ -312,10 +312,26 @@ do { \
BUPDATE1_8W( 7, 1 ); \
} while (0)

#if defined(__AVX512VL__)

#define GAMMA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )

#define THETA_8W(n0, n1, n2, n4) \
( g ## n0 = mm256_xor3( a ## n0, a ## n1, a ## n4 ) )

#else

#define GAMMA_8W(n0, n1, n2, n4) \
(g ## n0 = _mm256_xor_si256( a ## n0, \
_mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) )

#define THETA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
a ## n4 ) ) )

#endif

#define PI_ALL_8W do { \
a0 = g0; \
a1 = mm256_rol_32( g7, 1 ); \
@@ -336,9 +352,6 @@ do { \
a16 = mm256_rol_32( g10, 8 ); \
} while (0)

#define THETA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
a ## n4 ) ) )

#define SIGMA_ALL_8W do { \
a0 = _mm256_xor_si256( g0, m256_one_32 ); \
@@ -127,10 +127,8 @@ void quark_8way_hash( void *state, const void *input )

rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

if ( ( vh_mask & 0x0f ) != 0x0f )
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
if ( ( vh_mask & 0xf0 ) != 0xf0 )
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );

rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

@@ -139,22 +137,14 @@ void quark_8way_hash( void *state, const void *input )
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 512 );

if ( hash0[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
if ( hash1[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
if ( hash2[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
if ( hash3[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
if ( hash4[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
if ( hash5[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
if ( hash6[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
if ( hash7[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );

intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
512 );
@@ -39,17 +39,10 @@
void
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
{
#if defined(HMAC_SPH_SHA)
sph_sha256_context ctx;
sph_sha256_init( &ctx );
sph_sha256( &ctx, in, len );
sph_sha256_close( &ctx, digest );
#else
SHA256_CTX ctx;
SHA256_Init( &ctx );
SHA256_Update( &ctx, in, len );
SHA256_Final( digest, &ctx );
#endif
}

/**
@@ -79,51 +72,29 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{

#if defined(HMAC_SPH_SHA)
sph_sha256_init( &ctx->ictx );
sph_sha256( &ctx->ictx, K, Klen );
sph_sha256_close( &ctx->ictx, khash );
#else
SHA256_Init( &ctx->ictx );
SHA256_Update( &ctx->ictx, K, Klen );
SHA256_Final( khash, &ctx->ictx );
#endif
K = khash;
Klen = 32;

K = khash;
Klen = 32;
}

/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
#if defined(HMAC_SPH_SHA)
sph_sha256_init( &ctx->ictx );
#else
SHA256_Init( &ctx->ictx );
#endif

for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36;

memset( pad + Klen, 0x36, 64 - Klen );
#if defined(HMAC_SPH_SHA)
sph_sha256( &ctx->ictx, pad, 64 );
#else
SHA256_Update( &ctx->ictx, pad, 64 );
#endif

/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
#if defined(HMAC_SPH_SHA)
sph_sha256_init( &ctx->octx );
#else
SHA256_Init( &ctx->octx );
#endif

for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c;

memset( pad + Klen, 0x5c, 64 - Klen );
#if defined(HMAC_SPH_SHA)
sph_sha256( &ctx->octx, pad, 64 );
#else
SHA256_Update( &ctx->octx, pad, 64 );
#endif
}

/* Add bytes to the HMAC-SHA256 operation. */
@@ -131,11 +102,7 @@ void
HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
{
/* Feed data to the inner SHA256 operation. */
#if defined(HMAC_SPH_SHA)
sph_sha256( &ctx->ictx, in, len );
#else
SHA256_Update( &ctx->ictx, in, len );
#endif
}

/* Finish an HMAC-SHA256 operation. */
@@ -144,20 +111,9 @@ HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
{
unsigned char ihash[32];

#if defined(HMAC_SPH_SHA)
sph_sha256_close( &ctx->ictx, ihash );
sph_sha256( &ctx->octx, ihash, 32 );
sph_sha256_close( &ctx->octx, digest );
#else
/* Finish the inner SHA256 operation. */
SHA256_Final( ihash, &ctx->ictx );

/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update( &ctx->octx, ihash, 32 );

/* Finish the outer SHA256 operation. */
SHA256_Final( digest, &ctx->octx );
#endif
}

/**
@@ -29,24 +29,14 @@
#ifndef HMAC_SHA256_H__
#define HMAC_SHA256_H__

//#define HMAC_SSL_SHA 1
#define HMAC_SPH_SHA 1

#include <sys/types.h>
#include <stdint.h>
#include "sph_sha2.h"
#include <openssl/sha.h>


typedef struct HMAC_SHA256Context
{
#if defined(HMAC_SPH_SHA)
sph_sha256_context ictx;
sph_sha256_context octx;
#else
SHA256_CTX ictx;
SHA256_CTX octx;
#endif
} HMAC_SHA256_CTX;

void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
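With the header now committed to the sph backend unconditionally, the
context API itself is unchanged. A usage sketch of the declared interface
(key, keylen, msg, msglen are placeholder names, not repo code):

HMAC_SHA256_CTX ctx;
unsigned char mac[32];
HMAC_SHA256_Init( &ctx, key, keylen );     /* key of any length        */
HMAC_SHA256_Update( &ctx, msg, msglen );   /* may be called repeatedly */
HMAC_SHA256_Final( mac, &ctx );            /* 32-byte tag              */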
@@ -59,6 +59,8 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
const __m128i *state_in );

#endif // SSE2

@@ -77,6 +79,8 @@ void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
void sha256_8way_full( void *dst, const void *data, size_t len );
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
const __m256i *state_in );

#endif // AVX2

@@ -95,6 +99,12 @@ void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
void sha256_16way_full( void *dst, const void *data, size_t len );
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
const __m512i *state_in );
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid );

#endif // AVX512
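The prehash/final pair added above splits the compression function so that
rounds which depend only on constant message words can be hoisted out of a
scanning loop. A usage sketch under stated assumptions (the function below
is hypothetical, not repo code; only the declarations above are assumed):

#include "sha256-hash-4way.h"

/* Hash many 16-lane blocks that share their first message words: the
   first 3 rounds depend only on that shared prefix, so compute them once
   and replay them via state_mid for every iteration. */
void sha256_16way_scan_sketch( __m512i *hash, __m512i *block,
                               const __m512i *iv, int iterations )
{
    __m512i mid[8];
    sha256_16way_prehash_3rounds( mid, block, iv );   /* rounds 0-2, once */
    for ( int i = 0; i < iterations; i++ )
    {
        /* per-iteration data (e.g. nonce lanes) would be written into
           words of block[] not consumed by the prehash */
        sha256_16way_final_rounds( hash, block, iv, mid );
    }
}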
@@ -195,8 +195,28 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
hash[i] = swab32(hash[i]);
}

extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
#if defined (__SHA__)

#include "algo/sha/sph_sha2.h"

void sha256d(unsigned char *hash, const unsigned char *data, int len)
{
sph_sha256_context ctx __attribute__ ((aligned (64)));

sph_sha256_init( &ctx );
sph_sha256( &ctx, data, len );
sph_sha256_close( &ctx, hash );

sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, hash );
}

#else

void sha256d(unsigned char *hash, const unsigned char *data, int len)
{

uint32_t S[16], T[16];
int i, r;

@@ -220,6 +240,8 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
be32enc((uint32_t *)hash + i, T[i]);
}

#endif
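Either path computes the same double SHA-256; only the backend differs.
A minimal usage sketch (placeholder data, not repo code):

unsigned char header[80] = {0};   /* e.g. an 80-byte block header      */
unsigned char hash[32];
sha256d( hash, header, 80 );      /* hash = SHA256( SHA256( header ) ) */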
static inline void sha256d_preextend(uint32_t *W)
{
W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
algo/sha/sha256-hash-2way-ni.c (new file, 345 lines)
@@ -0,0 +1,345 @@
/* Intel SHA extensions using C intrinsics */
/* Written and placed in the public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */

// A stripped down version with byte swapping removed.

#if defined(__SHA__)

#include "sha256-hash-opt.h"

void sha256_ni2way_transform( uint32_t *out_X, uint32_t *out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y;
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
__m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;
// Load initial values
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);

TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH

// Save current hash
ABEF_SAVE_X = STATE0_X;
ABEF_SAVE_Y = STATE0_Y;
CDGH_SAVE_X = STATE1_X;
CDGH_SAVE_Y = STATE1_Y;

// Rounds 0-3
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);

// Rounds 4-7
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);

// Rounds 8-11
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);

// Rounds 12-15
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);

// Rounds 16-19
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);

// Rounds 20-23
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);

// Rounds 24-27
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);

// Rounds 28-31
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);

// Rounds 32-35
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 36-39
|
||||
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 40-43
|
||||
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 44-47
|
||||
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 48-51
|
||||
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 52-55
|
||||
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 56-59
|
||||
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 60-63
|
||||
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Add values back to state
|
||||
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
|
||||
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
|
||||
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
|
||||
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
|
||||
|
||||
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
|
||||
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
|
||||
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
|
||||
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
|
||||
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
|
||||
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
|
||||
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
|
||||
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
|
||||
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
|
||||
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
|
||||
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
|
||||
}
|
||||
|
||||
#endif
|
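The 2-way interleave above exists to hide instruction latency: the X-lane and Y-lane sha256rnds2 dependency chains are independent, so one lane's rounds execute while the other's are still in flight. A minimal calling sketch, assuming the sha256_ni2way_transform prototype declared in sha256-hash-opt.h further down this page; hash_two_blocks and SHA256_IV_WORDS are illustrative names, not part of the source:

#include <stdint.h>
#include "sha256-hash-opt.h"

// SHA-256 initial state. The stripped-down transforms do no byte
// swapping internally, so the caller supplies words in the order the
// SHA extensions consume.
static const uint32_t SHA256_IV_WORDS[8] = {
   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};

// Hash two independent 64-byte blocks in one interleaved pass.
void hash_two_blocks( uint32_t out_X[8], uint32_t out_Y[8],
                      const void *block_X, const void *block_Y )
{
   sha256_ni2way_transform( out_X, out_Y, block_X, block_Y,
                            SHA256_IV_WORDS, SHA256_IV_WORDS );
}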
@@ -74,9 +74,20 @@ static const uint32_t K256[64] =

#define CHs(X, Y, Z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )

/*
#define MAJs(X, Y, Z) \
_mm_or_si128( _mm_and_si128( X, Y ), \
_mm_and_si128( _mm_or_si128( X, Y ), Z ) )
*/
/*
#define MAJs(X, Y, Z) \
_mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \
_mm_xor_si128( Y, Z ) ) )
*/

#define MAJs(X, Y, Z) \
_mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \
Y_xor_Z ) )
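The live MAJs above relies on the identity MAJ(X,Y,Z) = Y ^ ((X ^ Y) & (Y ^ Z)), plus the observation that the round rotation turns this round's X ^ Y into the next round's Y ^ Z, so only one fresh XOR is needed per round (the step macro's Y_xor_Z = X_xor_Y performs the hand-off). A scalar self-check of the identity:

#include <assert.h>
#include <stdint.h>

int main(void)
{
   // Exhaustive check over single bits; bitwise ops make the identity
   // carry over to full 32-bit (and vector) lanes.
   for ( uint32_t x = 0; x < 2; x++ )
   for ( uint32_t y = 0; y < 2; y++ )
   for ( uint32_t z = 0; z < 2; z++ )
   {
      uint32_t ref = (x & y) | ((x | y) & z);   // textbook MAJ
      uint32_t opt = y ^ ((x ^ y) & (y ^ z));   // optimized form
      assert( ref == opt );
   }
   return 0;
}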
#define BSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
@@ -94,6 +105,7 @@ static const uint32_t K256[64] =
_mm_xor_si128( _mm_xor_si128( \
mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )

/*
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
@@ -122,9 +134,9 @@ do { \
H = _mm_add_epi32( T1, T2 ); \
D = _mm_add_epi32( D, T1 ); \
} while (0)
*/

/*
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i T1, T2; \
@@ -132,16 +144,98 @@ do { \
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
K, W[i] ) ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm_add_epi32( D, T1 ); \
H = _mm_add_epi32( T1, T2 ); \
} while (0)
*/

void sha256_4way_transform( __m128i *state_out, const __m128i *data,
const __m128i *state_in )
{
__m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
__m128i W[16];

memcpy_128( W, data, 16 );

A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];
Y_xor_Z = _mm_xor_si128( B, C );

SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );

for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 );
W[10] = SHA2s_MEXP( 8, 3, 11, 10 );
W[11] = SHA2s_MEXP( 9, 4, 12, 11 );
W[12] = SHA2s_MEXP( 10, 5, 13, 12 );
W[13] = SHA2s_MEXP( 11, 6, 14, 13 );
W[14] = SHA2s_MEXP( 12, 7, 15, 14 );
W[15] = SHA2s_MEXP( 13, 8, 0, 15 );

SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}

state_out[0] = _mm_add_epi32( state_in[0], A );
state_out[1] = _mm_add_epi32( state_in[1], B );
state_out[2] = _mm_add_epi32( state_in[2], C );
state_out[3] = _mm_add_epi32( state_in[3], D );
state_out[4] = _mm_add_epi32( state_in[4], E );
state_out[5] = _mm_add_epi32( state_in[5], F );
state_out[6] = _mm_add_epi32( state_in[6], G );
state_out[7] = _mm_add_epi32( state_in[7], H );
}

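SHA2s_MEXP is the FIPS 180-4 message schedule W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] over a rolling 16-entry window; reducing the indices mod 16 at t = 16 gives the sources 14, 9, 1, 0 for W[0], and so on down the list. A scalar reference of one expansion pass:

#include <stdint.h>

static uint32_t ror32( uint32_t x, int n ) { return (x >> n) | (x << (32 - n)); }
static uint32_t ssg0( uint32_t x ) { return ror32(x,7) ^ ror32(x,18) ^ (x >> 3); }
static uint32_t ssg1( uint32_t x ) { return ror32(x,17) ^ ror32(x,19) ^ (x >> 10); }

// Expand one 16-round window in place, exactly as the vector loop does.
static void mexp_16rounds( uint32_t W[16] )
{
   for ( int i = 0; i < 16; i++ )
      W[i] = ssg1( W[ (i+14) & 15 ] ) + W[ (i+9) & 15 ]
           + ssg0( W[ (i+ 1) & 15 ] ) + W[i];
}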
static void
sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
{
register __m128i A, B, C, D, E, F, G, H;
register __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
__m128i W[16];

mm128_block_bswap_32( W, in );
@@ -170,6 +264,8 @@ sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
H = m128_const1_64( 0x5BE0CD195BE0CD19 );
}

Y_xor_Z = _mm_xor_si128( B, C );

SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
@@ -321,10 +417,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;

sc->buf[ pad >> 2 ] =
mm128_bswap_32( m128_const1_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm128_bswap_32( m128_const1_32( low ) );
sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) );
sha256_4way_round( sc, sc->buf, sc->val );

mm128_block_bswap_32( dst, sc->val );
@@ -342,12 +436,39 @@ void sha256_4way_full( void *dst, const void *data, size_t len )

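Only the construction of the padding constants changed in the close functions: the scalar value is byte swapped once before being splatted, rather than splatting and then byte swapping a whole vector. The length arithmetic itself is untouched:

// 64-bit message length in bits, assembled from 32-bit byte counters:
// shifting left by 3 multiplies by 8, and the three bits shifted out
// of the low counter carry into the high word.
high = (sc->count_high << 3) | (low >> 29);
low  = low << 3;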
// SHA-256 8 way

#if defined(__AVX512VL__)

#define CHx(X, Y, Z) \
_mm256_ternarylogic_epi32( X, Y, Z, 0xca )

#define MAJx(X, Y, Z) \
_mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )

#define BSG2_0x(x) \
mm256_xor3( mm256_ror_32(x, 2), mm256_ror_32(x, 13), mm256_ror_32(x, 22) )

#define BSG2_1x(x) \
mm256_xor3( mm256_ror_32(x, 6), mm256_ror_32(x, 11), mm256_ror_32(x, 25) )

#define SSG2_0x(x) \
mm256_xor3( mm256_ror_32(x, 7), mm256_ror_32(x, 18), _mm256_srli_epi32(x, 3) )

#define SSG2_1x(x) \
mm256_xor3( mm256_ror_32(x, 17), mm256_ror_32(x, 19), _mm256_srli_epi32(x, 10) )

#else // AVX2

#define CHx(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )

#define MAJx(X, Y, Z) \
_mm256_or_si256( _mm256_and_si256( X, Y ), \
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
_mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \
_mm256_xor_si256( Y, Z ) ) )
/*
#define MAJx(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
*/
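The ternary-logic immediates are simply the 8-bit truth tables of the boolean functions, indexed by the input triple (X,Y,Z). A small self-contained derivation; the XOR3 row assumes mm256_xor3/mm512_xor3 map to the same instruction with imm8 0x96, which this diff does not show:

#include <stdio.h>

int main(void)
{
   unsigned ch = 0, maj = 0, xor3 = 0;
   for ( unsigned i = 0; i < 8; i++ )
   {
      unsigned x = (i >> 2) & 1, y = (i >> 1) & 1, z = i & 1;
      ch   |= (((x & y) | (~x & z)) & 1) << i;            // CH
      maj  |= (((x & y) | (x & z) | (y & z)) & 1) << i;   // MAJ
      xor3 |= ((x ^ y ^ z) & 1) << i;                     // 3-input XOR
   }
   printf( "CH=0x%02x MAJ=0x%02x XOR3=0x%02x\n", ch, maj, xor3 );
   return 0;   // prints CH=0xca MAJ=0xe8 XOR3=0x96
}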
#define BSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
@@ -365,6 +486,8 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) )

#endif // AVX512 else AVX2

#define SHA2x_MEXP( a, b, c, d ) \
mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] );

@@ -379,8 +502,89 @@ do { \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)

void sha256_8way_transform( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H;
__m256i W[16];

memcpy_256( W, data, 16 );

A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];

SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );

for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x_MEXP( 13, 8, 0, 15 );

SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}

state_out[0] = _mm256_add_epi32( state_in[0], A );
state_out[1] = _mm256_add_epi32( state_in[1], B );
state_out[2] = _mm256_add_epi32( state_in[2], C );
state_out[3] = _mm256_add_epi32( state_in[3], D );
state_out[4] = _mm256_add_epi32( state_in[4], E );
state_out[5] = _mm256_add_epi32( state_in[5], F );
state_out[6] = _mm256_add_epi32( state_in[6], G );
state_out[7] = _mm256_add_epi32( state_in[7], H );
}

static void
sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] )
{
register __m256i A, B, C, D, E, F, G, H;
__m256i W[16];
@@ -566,10 +770,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;

sc->buf[ pad >> 2 ] =
mm256_bswap_32( m256_const1_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm256_bswap_32( m256_const1_32( low ) );
sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) );

sha256_8way_round( sc, sc->buf, sc->val );

@@ -589,27 +791,22 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
// SHA-256 16 way

#define CHx16(X, Y, Z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
_mm512_ternarylogic_epi32( X, Y, Z, 0xca )

#define MAJx16(X, Y, Z) \
_mm512_or_si512( _mm512_and_si512( X, Y ), \
_mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
_mm512_ternarylogic_epi32( X, Y, Z, 0xe8 )

#define BSG2_0x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 2), mm512_ror_32(x, 13) ), mm512_ror_32( x, 22) )
mm512_xor3( mm512_ror_32(x, 2), mm512_ror_32(x, 13), mm512_ror_32(x, 22) )

#define BSG2_1x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 6), mm512_ror_32(x, 11) ), mm512_ror_32( x, 25) )
mm512_xor3( mm512_ror_32(x, 6), mm512_ror_32(x, 11), mm512_ror_32(x, 25) )

#define SSG2_0x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 7), mm512_ror_32(x, 18) ), _mm512_srli_epi32(x, 3) )
mm512_xor3( mm512_ror_32(x, 7), mm512_ror_32(x, 18), _mm512_srli_epi32(x, 3) )

#define SSG2_1x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 17), mm512_ror_32(x, 19) ), _mm512_srli_epi32(x, 10) )
mm512_xor3( mm512_ror_32(x, 17), mm512_ror_32(x, 19), _mm512_srli_epi32(x, 10) )

#define SHA2x16_MEXP( a, b, c, d ) \
mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] );
@@ -625,10 +822,216 @@ do { \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)

// Transform one 16-lane, 64-byte message block and update the state.
// The calling function is responsible for initializing the state, setting
// the correct byte order, counting bits, and padding the final block.
// This is faster for multiple passes of sha256 (sha256d/t/q) because it
// eliminates redundant byte swapping.
//
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16];

memcpy_512( W, data, 16 );

A = state_in[0];
B = state_in[1];
C = state_in[2];
D = state_in[3];
E = state_in[4];
F = state_in[5];
G = state_in[6];
H = state_in[7];

SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );

for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x16_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x16_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x16_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x16_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x16_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x16_MEXP( 13, 8, 0, 15 );

SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}

state_out[0] = _mm512_add_epi32( state_in[0], A );
state_out[1] = _mm512_add_epi32( state_in[1], B );
state_out[2] = _mm512_add_epi32( state_in[2], C );
state_out[3] = _mm512_add_epi32( state_in[3], D );
state_out[4] = _mm512_add_epi32( state_in[4], E );
state_out[5] = _mm512_add_epi32( state_in[5], F );
state_out[6] = _mm512_add_epi32( state_in[6], G );
state_out[7] = _mm512_add_epi32( state_in[7], H );
}

// Aggressive prehashing
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H;

A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in + 1 );
C = _mm512_load_si512( state_in + 2 );
D = _mm512_load_si512( state_in + 3 );
E = _mm512_load_si512( state_in + 4 );
F = _mm512_load_si512( state_in + 5 );
G = _mm512_load_si512( state_in + 6 );
H = _mm512_load_si512( state_in + 7 );

SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );

_mm512_store_si512( state_mid , A );
_mm512_store_si512( state_mid + 1, B );
_mm512_store_si512( state_mid + 2, C );
_mm512_store_si512( state_mid + 3, D );
_mm512_store_si512( state_mid + 4, E );
_mm512_store_si512( state_mid + 5, F );
_mm512_store_si512( state_mid + 6, G );
_mm512_store_si512( state_mid + 7, H );
}

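The prehash is sound because SHA-256 round t consumes only message word W[t], and in the second 64-byte block of an 80-byte header the nonce sits in word 3 (pdata[19]). Rounds 0-2 therefore never see the nonce and can be hashed once per job, as the sha256t scanhash further down pairs them:

// Once per job: three nonce-independent rounds of the second block.
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );

// Once per nonce batch: only rounds 3..63 remain.
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );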
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid )
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16];

memcpy_512( W, data, 16 );

A = _mm512_load_si512( state_mid );
B = _mm512_load_si512( state_mid + 1 );
C = _mm512_load_si512( state_mid + 2 );
D = _mm512_load_si512( state_mid + 3 );
E = _mm512_load_si512( state_mid + 4 );
F = _mm512_load_si512( state_mid + 5 );
G = _mm512_load_si512( state_mid + 6 );
H = _mm512_load_si512( state_mid + 7 );

// SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );

for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x16_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x16_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x16_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x16_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x16_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x16_MEXP( 13, 8, 0, 15 );

SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}

A = _mm512_add_epi32( A, _mm512_load_si512( state_in ) );
B = _mm512_add_epi32( B, _mm512_load_si512( state_in + 1 ) );
C = _mm512_add_epi32( C, _mm512_load_si512( state_in + 2 ) );
D = _mm512_add_epi32( D, _mm512_load_si512( state_in + 3 ) );
E = _mm512_add_epi32( E, _mm512_load_si512( state_in + 4 ) );
F = _mm512_add_epi32( F, _mm512_load_si512( state_in + 5 ) );
G = _mm512_add_epi32( G, _mm512_load_si512( state_in + 6 ) );
H = _mm512_add_epi32( H, _mm512_load_si512( state_in + 7 ) );

_mm512_store_si512( state_out , A );
_mm512_store_si512( state_out + 1, B );
_mm512_store_si512( state_out + 2, C );
_mm512_store_si512( state_out + 3, D );
_mm512_store_si512( state_out + 4, E );
_mm512_store_si512( state_out + 5, F );
_mm512_store_si512( state_out + 6, G );
_mm512_store_si512( state_out + 7, H );
}

static void
sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] )
{
register __m512i A, B, C, D, E, F, G, H;
__m512i W[16];

mm512_block_bswap_32( W , in );
@@ -657,6 +1060,7 @@ sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] )
H = m512_const1_64( 0x5BE0CD195BE0CD19 );
}

SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
@@ -800,10 +1204,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;

sc->buf[ pad >> 2 ] =
mm512_bswap_32( m512_const1_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm512_bswap_32( m512_const1_32( low ) );
sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) );

sha256_16way_round( sc, sc->buf, sc->val );

@@ -3,23 +3,24 @@
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */

// A drop-in replacement for the function of the same name in sph_sha2.c,
// stripped down, with byte swapping removed.

#if defined(__SHA__)

#include "simd-utils.h"
#include "sha256-hash-opt.h"

static void sha2_round( const uint8_t input[], uint32_t state[8] )
void sha256_opt_transform( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i MSG, TMP;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;

// Load initial values
TMP = _mm_load_si128((__m128i*) &state[0]);
STATE1 = _mm_load_si128((__m128i*) &state[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_load_si128((__m128i*) &state_in[0]);
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
@@ -31,8 +32,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
CDGH_SAVE = STATE1;

// Rounds 0-3
MSG = _mm_load_si128((const __m128i*) (input+0));
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
// TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
@@ -40,7 +41,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )

// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
@@ -49,7 +50,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )

// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
@@ -58,7 +59,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )

// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
@@ -192,9 +193,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF

// Save state
_mm_store_si128((__m128i*) &state[0], STATE0);
_mm_store_si128((__m128i*) &state[4], STATE1);
_mm_store_si128((__m128i*) &state_out[0], STATE0);
_mm_store_si128((__m128i*) &state_out[4], STATE1);
}

#endif
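With the _mm_shuffle_epi8 swaps commented out, sha256_opt_transform expects message words already in the byte order the SHA extensions consume; the miner performs that swap once when it prepares the work data. A sketch of what a caller holding a raw little-endian block would do; transform_le_block is an illustrative name and __builtin_bswap32 is the GCC/Clang builtin:

#include <stdint.h>
#include "sha256-hash-opt.h"

void transform_le_block( uint32_t state[8], const uint32_t le_block[16] )
{
   uint32_t be_block[16];
   // One up-front swap replaces the four per-call shuffles removed above.
   for ( int i = 0; i < 16; i++ )
      be_block[i] = __builtin_bswap32( le_block[i] );
   sha256_opt_transform( state, be_block, state );
}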
algo/sha/sha256-hash-opt.h (new file, 18 lines)
@@ -0,0 +1,18 @@
#ifndef SHA2_HASH_OPT_H__
#define SHA2_HASH_OPT_H__ 1

#include <stddef.h>
#include "simd-utils.h"

#if defined(__SHA__)

void sha256_opt_transform( uint32_t *state_out, const void *input,
const uint32_t *state_in );

// 2-way with interleaved instructions
void sha256_ni2way_transform( uint32_t *out_X, uint32_t *out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );

#endif
#endif
algo/sha/sha256d-4way.c (new file, 252 lines)
@@ -0,0 +1,252 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha-hash-4way.h"

#if defined(SHA256D_16WAY)

int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );

for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] );

*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

// initialize state
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m512_const1_64( 0x510E527F510E527F );
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );

// hash first 64 bytes of data
sha256_16way_transform( midstate, vdata, initstate );

do
{
// 1. final 16 bytes of data, with padding
memcpy_512( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_transform( hash32, block, midstate );

// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
sha256_16way_transform( hash32, block, initstate );

// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );

for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}

#endif

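The loop above is sha256d over an 80-byte header restated as fixed padded blocks. A scalar sketch of the same structure, assuming a generic sha256_transform( state, block ) that compresses one 64-byte block in place (an illustrative primitive, not the repo's API):

#include <stdint.h>
#include <string.h>

// Assumed primitive: one 64-byte compression, state updated in place.
void sha256_transform( uint32_t state[8], const uint32_t block[16] );

void sha256d_80( uint32_t hash[8], const uint32_t data[20],
                 const uint32_t iv[8] )
{
   uint32_t mid[8], block[16];

   memcpy( mid, iv, 32 );
   sha256_transform( mid, data );     // block 1: first 64 bytes, nonce-free

   memcpy( block, data + 16, 16 );    // last 16 bytes, includes the nonce
   block[ 4] = 0x80000000;            // padding bit
   memset( block + 5, 0, 40 );
   block[15] = 80*8;                  // message length in bits
   memcpy( hash, mid, 32 );
   sha256_transform( hash, block );   // first SHA-256 done

   memcpy( block, hash, 32 );         // second pass: 32-byte message
   block[ 8] = 0x80000000;
   memset( block + 9, 0, 24 );
   block[15] = 32*8;
   memcpy( hash, iv, 32 );
   sha256_transform( hash, block );   // sha256d result
}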
#if defined(SHA256D_8WAY)

int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );

for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );

*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

// initialize state
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m256_const1_64( 0x510E527F510E527F );
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );

// hash first 64 bytes of data
sha256_8way_transform( midstate, vdata, initstate );

do
{
// 1. final 16 bytes of data, with padding
memcpy_256( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform( hash32, block, midstate );

// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
sha256_8way_transform( hash32, block, initstate );

// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );

for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}

#endif

#if defined(SHA256D_4WAY)

int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i block[16] __attribute__ ((aligned (64)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m128i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );

for ( int i = 0; i < 19; i++ )
vdata[i] = m128_const1_32( pdata[i] );

*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );

// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m128_const1_64( 0x510E527F510E527F );
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );

// hash first 64 bytes of data
sha256_4way_transform( midstate, vdata, initstate );

do
{
// 1. final 16 bytes of data, with padding
memcpy_128( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_128( block + 5, 10 );
block[15] = m128_const1_32( 80*8 ); // bit count
sha256_4way_transform( hash32, block, midstate );

// 2. 32 byte hash from 1.
memcpy_128( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
sha256_4way_transform( hash32, block, initstate );

// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );

for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}

#endif

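All three widths share the same candidate filter. After the final byte swap the hash is compared as a 256-bit number whose most significant 32 bits live in word 7, so a single compare against ptarget[7] disproves almost every lane before the full check runs:

// hash <= target (as 256-bit integers) implies hash[7] <= target[7],
// so hash32_d7[lane] > targ32_d7 proves the lane cannot be a solution;
// extr_lane_*() and valid_hash() only run on the rare survivors.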
@@ -7,133 +7,173 @@

#if defined(SHA256T_16WAY)

static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64)));

void sha256t_16way_hash( void* output, const void* input )
{
uint32_t vhash[8*16] __attribute__ ((aligned (64)));
sha256_16way_context ctx;
memcpy( &ctx, &sha256_ctx16, sizeof ctx );

sha256_16way_update( &ctx, input + (64<<4), 16 );
sha256_16way_close( &ctx, vhash );

sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, vhash );

sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, output );
}

int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t hash32[8*16] __attribute__ ((aligned (32)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = &(hash32[7<<4]);
__m512i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );

for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] );

mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
sha256_16way_init( &sha256_ctx16 );
sha256_16way_update( &sha256_ctx16, vdata, 64 );

// initialize state
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m512_const1_64( 0x510E527F510E527F );
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );

// hash first 64 byte block of data
sha256_16way_transform( midstate, vdata, initstate );

// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );

do
{
pdata[19] = n;
sha256t_16way_hash( hash32, vdata );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
n += 16;
// 1. final 16 bytes of data, with padding
memcpy_512( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
// sha256_16way_transform( hash32, block, midstate );

// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
sha256_16way_transform( hash32, block, initstate );

// 3. 32 byte hash from 2.
memcpy_512( block, hash32, 8 );
sha256_16way_transform( hash32, block, initstate );

// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );

for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}

#endif

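Per nonce batch, sha256t therefore costs three transforms, and only the first benefits from the prehash; passes two and three restart from initstate on padded 32-byte inputs, and pass three reuses block[8..15] exactly as pass two left them, which is why only the first 8 words are recopied. Schematically:

// h1 = sha256_16way_final_rounds( header block 2, midstate, midstate2 )
// h2 = sha256_16way_transform( pad32(h1), initstate )
// h3 = sha256_16way_transform( pad32(h2), initstate )  // padding kept from h2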
#if defined(SHA256T_8WAY)
|
||||
|
||||
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256t_8way_hash( void* output, const void* input )
|
||||
{
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
sha256_8way_context ctx;
|
||||
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
|
||||
|
||||
sha256_8way_update( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );

sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );

sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}

int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash32[8*8] __attribute__ ((aligned (32)));
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = &(hash32[7<<3]);
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );

mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
sha256_8way_init( &sha256_ctx8 );
sha256_8way_update( &sha256_ctx8, vdata, 64 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );

*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

// initialize state
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m256_const1_64( 0x510E527F510E527F );
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );

// hash first 64 bytes of data
sha256_8way_transform( midstate, vdata, initstate );
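// The first 64 bytes of the header never change while scanning nonces, so
// this transform runs once per job instead of once per nonce.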

do
{
pdata[19] = n;
sha256t_8way_hash( hash32, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
n += 8;
// 1. final 16 bytes of data, with padding
memcpy_256( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform( hash32, block, midstate );

// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
sha256_8way_transform( hash32, block, initstate );

// 3. 32 byte hash from 2.
memcpy_256( block, hash32, 8 );
sha256_8way_transform( hash32, block, initstate );

// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );

for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
@@ -144,82 +184,84 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,

#if defined(SHA256T_4WAY)

static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));

void sha256t_4way_hash( void* output, const void* input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
sha256_4way_context ctx;
memcpy( &ctx, &sha256_ctx4, sizeof ctx );

sha256_4way_update( &ctx, input + (64<<2), 16 );
sha256_4way_close( &ctx, vhash );

sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );

sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
}

int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
__m128i block[16] __attribute__ ((aligned (64)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m128i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );

const uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
for ( int i = 0; i < 19; i++ )
vdata[i] = m128_const1_32( pdata[i] );

mm128_bswap32_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way_update( &sha256_ctx4, vdata, 64 );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );

for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m128_const1_64( 0x510E527F510E527F );
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );

// hash first 64 bytes of data
sha256_4way_transform( midstate, vdata, initstate );

do
{
const uint32_t mask = masks[m];
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
pdata[19] = n;
// 1. final 16 bytes of data, with padding
memcpy_128( block, vdata + 16, 4 );
block[ 4] = last_byte;
memset_zero_128( block + 5, 10 );
block[15] = m128_const1_32( 80*8 ); // bit count
sha256_4way_transform( hash32, block, midstate );

sha256t_4way_hash( hash, vdata );
// 2. 32 byte hash from 1.
memcpy_128( block, hash32, 8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
sha256_4way_transform( hash32, block, initstate );

for ( int lane = 0; lane < 4; lane++ )
if ( !( hash7[ lane ] & mask ) )
// 3. 32 byte hash from 2.
memcpy_128( block, hash32, 8 );
sha256_4way_transform( hash32, block, initstate );

// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );

for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 4;
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
@@ -5,17 +5,13 @@ bool register_sha256t_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256t_16way;
gate->hash = (void*)&sha256t_16way_hash;
#elif defined(__SHA__)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t;
gate->hash = (void*)&sha256t_hash;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256t_8way;
gate->hash = (void*)&sha256t_8way_hash;
#else
gate->scanhash = (void*)&scanhash_sha256t_4way;
gate->hash = (void*)&sha256t_4way_hash;
#endif
return true;
}

@@ -17,7 +17,6 @@ bool register_sha256q_algo( algo_gate_t* gate );

#if defined(SHA256T_16WAY)

void sha256t_16way_hash( void *output, const void *input );
int scanhash_sha256t_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_16way_hash( void *output, const void *input );
@@ -27,7 +26,6 @@ int scanhash_sha256q_16way( struct work *work, uint32_t max_nonce,

#if defined(SHA256T_8WAY)

void sha256t_8way_hash( void *output, const void *input );
int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_8way_hash( void *output, const void *input );
@@ -37,7 +35,6 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,

#if defined(SHA256T_4WAY)

void sha256t_4way_hash( void *output, const void *input );
int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_4way_hash( void *output, const void *input );
@@ -45,10 +42,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

#if defined(__SHA__)

int sha256t_hash( void *output, const void *input );
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

#endif

int sha256q_hash( void *output, const void *input );
int scanhash_sha256q( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
@@ -3,10 +3,14 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/sha/sph_sha2.h"
//#include "algo/sha/sph_sha2.h"
#include "sha256-hash-opt.h"

#if defined(__SHA__)

// Only used on CPUs with SHA

/*
static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64)));

void sha256t_midstate( const void* input )
@@ -37,12 +41,21 @@ int sha256t_hash( void* output, const void* input )

return 1;
}
*/

/*
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t block[16] __attribute__ ((aligned (64)));
uint32_t hash32[8] __attribute__ ((aligned (32)));
uint32_t initstate[8] __attribute__ ((aligned (32)));
uint32_t midstate[8] __attribute__ ((aligned (32)));

// uint32_t edata[20] __attribute__((aligned(64)));
// uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -50,24 +63,148 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
__m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );

mm128_bswap32_80( edata, pdata );
sha256t_midstate( edata );
// mm128_bswap32_80( edata, pdata );
// sha256t_midstate( edata );

// initialize state
initstate[0] = 0x6A09E667;
initstate[1] = 0xBB67AE85;
initstate[2] = 0x3C6EF372;
initstate[3] = 0xA54FF53A;
initstate[4] = 0x510E527F;
initstate[5] = 0x9B05688C;
initstate[6] = 0x1F83D9AB;
initstate[7] = 0x5BE0CD19;

// hash first 64 bytes of data
sha256_opt_transform( midstate, pdata, initstate );

do
{
edata[19] = n;
if ( likely( sha256t_hash( hash, edata ) ) )
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash, mythr );
}
// 1. final 16 bytes of data, with padding
memcpy( block, pdata + 16, 16 );
block[ 4] = 0x80000000;
memset( block + 5, 0, 40 );
block[15] = 80*8; // bit count
sha256_opt_transform( hash32, block, midstate );

// 2. 32 byte hash from 1.
memcpy( block, hash32, 32 );
block[ 8] = 0x80000000;
memset( block + 9, 0, 24 );
block[15] = 32*8; // bit count
sha256_opt_transform( hash32, block, initstate );

// 3. 32 byte hash from 2.
memcpy( block, hash32, 32 );
sha256_opt_transform( hash32, block, initstate );

// byte swap final hash for testing
casti_m128i( hash32, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 );
casti_m128i( hash32, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 );

if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
submit_solution( work, hash32, mythr );
n++;
} while ( n < last_nonce && !work_restart[thr_id].restart );
pdata[19] = n;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );

*hashes_done = n - first_nonce;
return 0;
}
*/

int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t initstate[8] __attribute__ ((aligned (32)));
uint32_t midstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 1;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
__m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
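// shuf_bswap32 reverses the byte order within each 32 bit lane when used
// with _mm_shuffle_epi8, converting the big endian digest words for the
// little endian target comparison.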

// initialize state
initstate[0] = 0x6A09E667;
initstate[1] = 0xBB67AE85;
initstate[2] = 0x3C6EF372;
initstate[3] = 0xA54FF53A;
initstate[4] = 0x510E527F;
initstate[5] = 0x9B05688C;
initstate[6] = 0x1F83D9AB;
initstate[7] = 0x5BE0CD19;

// hash first 64 bytes of data
sha256_opt_transform( midstate, pdata, initstate );

do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate );
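// sha256_ni2way_transform hashes two independent blocks with interleaved
// SHA-NI instructions; working two nonce streams side by side helps hide
// the multi-cycle latency of sha256rnds2 that a single dependent chain
// would expose.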

// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );

// 3. 32 byte hash from 2.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );

// byte swap final hash for testing
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );

if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
}
if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );

pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}

#endif

@@ -96,74 +96,22 @@ static const uint64_t K512[80] =
// SHA-512 8 way 64 bit

#define CH8W(X, Y, Z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
_mm512_ternarylogic_epi64( X, Y, Z, 0xca )

#define MAJ8W(X, Y, Z) \
_mm512_or_si512( _mm512_and_si512( X, Y ), \
_mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
_mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )
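// The vpternlog immediate is the 8 entry truth table of the boolean
// function, indexed by (x<<2)|(y<<1)|z: CH(x,y,z) = (x&y)|(~x&z) gives
// 0xca, MAJ gives 0xe8, and a three way XOR gives 0x96, so each macro
// collapses to a single instruction.
//
// Minimal sketch (not in the source) showing how such an immediate can be
// derived for any three input boolean function:
//
//    static inline uint8_t tl_imm8( int (*f)( int, int, int ) )
//    {
//       uint8_t imm = 0;
//       for ( int i = 0; i < 8; i++ )
//          if ( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) )
//             imm |= (uint8_t)1 << i;
//       return imm;   // CH -> 0xca, MAJ -> 0xe8, XOR3 -> 0x96
//    }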

#define BSG8W_5_0(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) )

#define BSG8W_5_1(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) )

#define SSG8W_5_0(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) )
mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) )

#define SSG8W_5_1(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )

static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
{
__m512i w0a, w1a, w0b, w1b;
w0a = mm512_ror_64( w0, 1 );
w1a = mm512_ror_64( w1,19 );
w0b = mm512_ror_64( w0, 8 );
w1b = mm512_ror_64( w1,61 );
w0a = _mm512_xor_si512( w0a, w0b );
w1a = _mm512_xor_si512( w1a, w1b );
w0b = _mm512_srli_epi64( w0, 7 );
w1b = _mm512_srli_epi64( w1, 6 );
w0a = _mm512_xor_si512( w0a, w0b );
w1a = _mm512_xor_si512( w1a, w1b );
return _mm512_add_epi64( w0a, w1a );
}

#define SSG8W_512x2_0( w0, w1, i ) do \
{ \
__m512i X0a, X1a, X0b, X1b; \
X0a = mm512_ror_64( W[i-15], 1 ); \
X1a = mm512_ror_64( W[i-14], 1 ); \
X0b = mm512_ror_64( W[i-15], 8 ); \
X1b = mm512_ror_64( W[i-14], 8 ); \
X0a = _mm512_xor_si512( X0a, X0b ); \
X1a = _mm512_xor_si512( X1a, X1b ); \
X0b = _mm512_srli_epi64( W[i-15], 7 ); \
X1b = _mm512_srli_epi64( W[i-14], 7 ); \
w0 = _mm512_xor_si512( X0a, X0b ); \
w1 = _mm512_xor_si512( X1a, X1b ); \
} while(0)

#define SSG8W_512x2_1( w0, w1, i ) do \
{ \
__m512i X0a, X1a, X0b, X1b; \
X0a = mm512_ror_64( W[i-2],19 ); \
X1a = mm512_ror_64( W[i-1],19 ); \
X0b = mm512_ror_64( W[i-2],61 ); \
X1b = mm512_ror_64( W[i-1],61 ); \
X0a = _mm512_xor_si512( X0a, X0b ); \
X1a = _mm512_xor_si512( X1a, X1b ); \
X0b = _mm512_srli_epi64( W[i-2], 6 ); \
X1b = _mm512_srli_epi64( W[i-1], 6 ); \
w0 = _mm512_xor_si512( X0a, X0b ); \
w1 = _mm512_xor_si512( X1a, X1b ); \
} while(0)
mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) )

#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
@@ -187,8 +135,8 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
mm512_block_bswap_64( W+8, in+8 );

for ( i = 16; i < 80; i++ )
W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
_mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
W[i] = mm512_add4_64( SSG8W_5_0( W[i-15] ), SSG8W_5_1( W[i-2] ),
W[ i- 7 ], W[ i-16 ] );

if ( ctx->initialized )
{
@@ -319,14 +267,20 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )

// SHA-512 4 way 64 bit

/*

#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )

/*
#define MAJ(X, Y, Z) \
_mm256_or_si256( _mm256_and_si256( X, Y ), \
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
*/

#define MAJ(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )

#define BSG5_0(x) \
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
_mm256_xor_si256( mm256_ror_64( x, 5 ), x ), 6 ), x ), 28 )
@@ -334,7 +288,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
#define BSG5_1(x) \
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
*/

/*
#define BSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
@@ -402,7 +356,7 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
w1 = _mm256_xor_si256( X1a, X1b ); \
} while(0)
*/

/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
@@ -431,7 +385,7 @@ do { \
H = _mm256_add_epi64( T1, T2 ); \
D = _mm256_add_epi64( D, T1 ); \
} while (0)

*/
/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
@@ -445,7 +399,7 @@ do { \
} while (0)
*/

/*

#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i T1, T2; \
@@ -453,16 +407,17 @@ do { \
T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
K, W[i] ) ); \
T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
*/

static void
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
{
int i;
register __m256i A, B, C, D, E, F, G, H;
register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
__m256i W[80];

mm256_block_bswap_64( W , in );
@@ -495,6 +450,8 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
H = m256_const1_64( 0x5BE0CD19137E2179 );
}

Y_xor_Z = _mm256_xor_si256( B, C );

for ( i = 0; i < 80; i += 8 )
{
SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
@@ -40,8 +40,8 @@
#endif

#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X)))

//#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X)))
#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) )
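// Equivalent to the classic MAJ: if X == Y the result is X, otherwise Z.
// Caching X^Y lets the next round reuse it as its Y^Z, since the round
// rotation makes this round's (X,Y) the next round's (Y,Z), saving one
// XOR per round.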
#define ROTR SPH_ROTR32

#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
@@ -73,7 +73,194 @@ static const sph_u32 H256[8] = {

#if defined(__SHA__)

#include "sha256-hash-opt.c"
#include "simd-utils.h"

static void sha2_round( const uint8_t input[], uint32_t state[8] )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;

// Load initial values
TMP = _mm_load_si128((__m128i*) &state[0]);
STATE1 = _mm_load_si128((__m128i*) &state[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
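// The SHA extensions keep the eight state words in two registers laid out
// as (A,B,E,F) and (C,D,G,H); the shuffles above convert from the linear
// A..H order used by the caller.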

// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;

// Rounds 0-3
MSG = _mm_load_si128((const __m128i*) (input+0));
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
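// Each _mm_sha256rnds2_epu32 performs two rounds using the two constant
// plus message words in the low 64 bits of MSG; the 0x0E shuffle moves the
// upper two words down for the next two rounds.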

// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);

TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF

// Save state
_mm_store_si128((__m128i*) &state[0], STATE0);
_mm_store_si128((__m128i*) &state[4], STATE1);
}

#else // no SHA

@@ -132,6 +319,7 @@ static const sph_u32 K[64] = {
t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
+ K[pcount + (pc)] + W[(pc) & 0x0F]); \
t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
Y_xor_Z = X_xor_Y; \
d = SPH_T32(d + t1); \
h = SPH_T32(t1 + t2); \
} while (0)
@@ -142,7 +330,7 @@ static const sph_u32 K[64] = {
SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)

#define SHA2_ROUND_BODY(in, r) do { \
sph_u32 A, B, C, D, E, F, G, H; \
sph_u32 A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; \
sph_u32 W[16]; \
unsigned pcount; \
\
@@ -155,6 +343,7 @@ static const sph_u32 K[64] = {
G = (r)[6]; \
H = (r)[7]; \
pcount = 0; \
Y_xor_Z = B ^ C; \
SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \
SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \
SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \
@@ -202,7 +391,7 @@ static const sph_u32 K[64] = {
#else // large footprint (default)

#define SHA2_ROUND_BODY(in, r) do { \
sph_u32 A, B, C, D, E, F, G, H, T1, T2; \
sph_u32 A, B, C, D, E, F, G, H, T1, T2, X_xor_Y, Y_xor_Z; \
sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
\
@@ -214,388 +403,453 @@ static const sph_u32 K[64] = {
F = (r)[5]; \
G = (r)[6]; \
H = (r)[7]; \
Y_xor_Z = B ^ C; \
W00 = in(0); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x428A2F98) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = in(1); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x71374491) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = in(2); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0xB5C0FBCF) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = in(3); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0xE9B5DBA5) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = in(4); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x3956C25B) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = in(5); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x59F111F1) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = in(6); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x923F82A4) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = in(7); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0xAB1C5ED5) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = in(8); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0xD807AA98) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = in(9); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x12835B01) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = in(10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x243185BE) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = in(11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x550C7DC3) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = in(12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x72BE5D74) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = in(13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x80DEB1FE) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = in(14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x9BDC06A7) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = in(15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0xC19BF174) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0xE49B69C1) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0xEFBE4786) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x0FC19DC6) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x240CA1CC) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x2DE92C6F) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x4A7484AA) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x5CB0A9DC) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x76F988DA) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x983E5152) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0xA831C66D) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0xB00327C8) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0xBF597FC7) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0xC6E00BF3) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0xD5A79147) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x06CA6351) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x14292967) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x27B70A85) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x2E1B2138) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x4D2C6DFC) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x53380D13) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x650A7354) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x766A0ABB) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x81C2C92E) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x92722C85) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0xA2BFE8A1) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0xA81A664B) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0xC24B8B70) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0xC76C51A3) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0xD192E819) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0xD6990624) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0xF40E3585) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x106AA070) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x19A4C116) + W00); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x1E376C08) + W01); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x2748774C) + W02); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x34B0BCB5) + W03); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x391C0CB3) + W04); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0x4ED8AA4A) + W05); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0x5B9CCA4F) + W06); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0x682E6FF3) + W07); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ SPH_C32(0x748F82EE) + W08); \
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
Y_xor_Z = X_xor_Y; \
D = SPH_T32(D + T1); \
H = SPH_T32(T1 + T2); \
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ SPH_C32(0x78A5636F) + W09); \
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
Y_xor_Z = X_xor_Y; \
C = SPH_T32(C + T1); \
G = SPH_T32(T1 + T2); \
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ SPH_C32(0x84C87814) + W10); \
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
Y_xor_Z = X_xor_Y; \
B = SPH_T32(B + T1); \
F = SPH_T32(T1 + T2); \
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ SPH_C32(0x8CC70208) + W11); \
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
Y_xor_Z = X_xor_Y; \
A = SPH_T32(A + T1); \
E = SPH_T32(T1 + T2); \
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ SPH_C32(0x90BEFFFA) + W12); \
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
Y_xor_Z = X_xor_Y; \
H = SPH_T32(H + T1); \
D = SPH_T32(T1 + T2); \
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ SPH_C32(0xA4506CEB) + W13); \
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
Y_xor_Z = X_xor_Y; \
G = SPH_T32(G + T1); \
C = SPH_T32(T1 + T2); \
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ SPH_C32(0xBEF9A3F7) + W14); \
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
Y_xor_Z = X_xor_Y; \
F = SPH_T32(F + T1); \
B = SPH_T32(T1 + T2); \
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ SPH_C32(0xC67178F2) + W15); \
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
Y_xor_Z = X_xor_Y; \
E = SPH_T32(E + T1); \
A = SPH_T32(T1 + T2); \
(r)[0] = SPH_T32((r)[0] + A); \
@@ -38,7 +38,8 @@
#if SPH_64

#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
//#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
#define MAJ( X, Y, Z ) ( Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ) )

#define ROTR64 SPH_ROTR64

@@ -310,12 +310,13 @@ do { \

#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \
xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \
_mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \
) ), _mm256_set1_epi32(3UL) ) ) ) ); \
xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
_mm256_set1_epi32(5UL) ) ), \
_mm256_set1_epi32(3UL) ) ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0)

#define PERM_STEP_0_8 do { \
@@ -23,6 +23,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
register __m512i K0, K1, K2, K3, K4, K5, K6, K7;
__m512i *M = (__m512i*)msg;
__m512i *H = (__m512i*)ctx->h;
const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
ctx->count1, ctx->count0 );
int r;

P0 = H[0];
@@ -62,16 +64,16 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
_mm512_aesenc_epi128( K0, m512_zero ) ) );

if ( r == 0 )
K0 = _mm512_xor_si512( K0, _mm512_set4_epi32(
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
K0 = _mm512_xor_si512( K0,
_mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
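// Mask 0x8888 selects the top 32 bit element of each 128 bit lane, so the
// masked XOR with all-ones recreates ~count3 in place, replacing the
// per-round _mm512_set4_epi32 broadcast with one instruction.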
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( K0,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
|
||||
|
||||
if ( r == 1 )
|
||||
K1 = _mm512_xor_si512( K1, _mm512_set4_epi32(
|
||||
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
|
||||
K1 = _mm512_xor_si512( K1, mm512_ror128_32(
|
||||
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( K1,
|
||||
@@ -96,8 +98,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
|
||||
|
||||
if ( r == 2 )
|
||||
K7 = _mm512_xor_si512( K7, _mm512_set4_epi32(
|
||||
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
|
||||
K7 = _mm512_xor_si512( K7, mm512_swap128_64(
|
||||
_mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
P1 = _mm512_xor_si512( P1, X );
|
||||
|
@@ -101,15 +101,6 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
// round
|
||||
|
||||
// working proof of concept
|
||||
/*
|
||||
__m512i K = m512_const1_128( m[0] );
|
||||
__m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
|
||||
X = _mm512_aesenc_epi128( X, m512_zero );
|
||||
k00 = _mm512_castsi512_si128( K );
|
||||
x = _mm512_castsi512_si128( X );
|
||||
*/
|
||||
|
||||
k00 = m[0];
|
||||
x = _mm_xor_si128( p1, k00 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
|
@@ -747,11 +747,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )

static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) };


// static const m512_v16 code[] = { c1_16(185), c1_16(233),
// c1_16(185), c1_16(233) };


S0l = _mm512_xor_si512( S[0], M[0] );
S0h = _mm512_xor_si512( S[1], M[1] );
S1l = _mm512_xor_si512( S[2], M[2] );
@@ -764,11 +759,16 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )

// targeted, local macros don't need a unique name
#define S(i) S##i

#define F_0( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xca )
#define F_1( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xe8 )

/*
#define F_0(B, C, D) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D )
#define F_1(B, C, D) \
_mm512_or_si512( _mm512_and_si512( D, C ),\
_mm512_and_si512( _mm512_or_si512( D,C ), B ) )
*/

#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
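
The commented-out originals show what the immediates encode: F_0 is the select function B ? C : D and F_1 is majority. The byte is just the 8 row truth table of the function, indexed by the three input bits with the first operand as the high bit. A hedged sketch of how such a byte can be derived (tl_imm8 and the helpers are illustrative, not part of the source):

// Bit n of the immediate is f(a,b,c) where n = a<<2 | b<<1 | c.
#include <stdint.h>

static uint8_t tl_imm8( int (*f)( int a, int b, int c ) )
{
   uint8_t imm = 0;
   for ( int n = 0; n < 8; n++ )
      imm |= ( f( (n >> 2) & 1, (n >> 1) & 1, n & 1 ) & 1 ) << n;
   return imm;
}

static int sel( int b, int c, int d ) { return b ? c : d; }               // -> 0xca
static int maj( int b, int c, int d ) { return (b & c) | (d & (b | c)); } // -> 0xe8
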
@@ -6,10 +6,6 @@

#define PRINT_SOME 0

/* JDD all occurrences of macro X in this file renamed to XX
* due to name conflict
*/

int SupportedLength(int hashbitlen) {
if (hashbitlen <= 0 || hashbitlen > 512)
return 0;
@@ -309,22 +309,16 @@ static const uint64_t IV512[] = {
sc->bcount = bcount; \
} while (0)

// AVX2 all scalar vars are now vectors representing 4 nonces in parallel


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
do { \
k8 = _mm512_xor_si512( _mm512_xor_si512( \
_mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \
_mm512_xor_si512( k2, k3 ) ), \
_mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \
_mm512_xor_si512( k6, k7 ) ) ), \
m512_const1_64( 0x1BD11BDAA9FC1A22) ); \
k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), mm512_xor3( k3, k4, k5 ), \
mm512_xor3( k6, k7, m512_const1_64( 0x1BD11BDAA9FC1A22) ));\
t2 = t0 ^ t1; \
} while (0)


#define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
do { \
w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \
@@ -340,7 +334,6 @@ do { \
m512_const1_64( s ) ) ); \
} while (0)


#define TFBIG_MIX_8WAY(x0, x1, rc) \
do { \
x0 = _mm512_add_epi64( x0, x1 ); \
@@ -44,8 +44,8 @@ int verthash_info_init(verthash_info_t* info, const char* file_name)
if ( opt_data_file || !opt_verify )
{
if ( opt_data_file )
applog( LOG_ERR,
"Verthash data file not found or invalid: %s", info->fileName );
applog( LOG_ERR, "Verthash data file not found or invalid: %s",
info->fileName );
else
{
applog( LOG_ERR,
@@ -134,76 +134,117 @@ static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
return (a ^ b) * 0x1000193;
}

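For context, fnv1a above is the FNV-1a mixing step with the 32 bit FNV prime 0x1000193; Verthash chains it, seeded with the FNV offset basis 0x811c9dc5, to walk the data file. An illustrative usage sketch (fnv1a_fold is a hypothetical helper, not in the source):

// Chain fnv1a over a block of words, as the accumulator updates below do.
#include <stdint.h>
#include <stddef.h>

static inline uint32_t fnv1a( const uint32_t a, const uint32_t b )
{  return ( a ^ b ) * 0x1000193;  }

static uint32_t fnv1a_fold( uint32_t acc, const uint32_t *w, size_t n )
{
   for ( size_t i = 0; i < n; i++ )
      acc = fnv1a( acc, w[i] );   // seed acc with 0x811c9dc5
   return acc;
}
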
void verthash_hash( const unsigned char* blob_bytes,
const size_t blob_size,
const unsigned char(*input)[VH_HEADER_SIZE],
unsigned char(*output)[VH_HASH_OUT_SIZE] )
#if 0
static void rotate_indexes( uint32_t *p )
{
unsigned char p1[ VH_HASH_OUT_SIZE ] __attribute__ ((aligned (64)));
unsigned char p0[ VH_N_SUBSET ] __attribute__ ((aligned (64)));
uint32_t seek_indexes[VH_N_INDEXES] __attribute__ ((aligned (64)));
uint32_t* p0_index = (uint32_t*)p0;

verthash_sha3_512_final_8( p0, ( (uint64_t*)input )[ 9 ] );

for ( size_t x = 0; x < VH_N_ROT; ++x )
{
memcpy( seek_indexes + x * (VH_N_SUBSET / sizeof(uint32_t)),
p0, VH_N_SUBSET);

#if defined(__AVX2__)

for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m256i); y += 8)
{
casti_m256i( p0_index, y ) = mm256_rol_32(
casti_m256i( p0_index, y ), 1 );
casti_m256i( p0_index, y+1 ) = mm256_rol_32(
casti_m256i( p0_index, y+1 ), 1 );
casti_m256i( p0_index, y+2 ) = mm256_rol_32(
casti_m256i( p0_index, y+2 ), 1 );
casti_m256i( p0_index, y+3 ) = mm256_rol_32(
casti_m256i( p0_index, y+3 ), 1 );
casti_m256i( p0_index, y+4 ) = mm256_rol_32(
casti_m256i( p0_index, y+4 ), 1 );
casti_m256i( p0_index, y+5 ) = mm256_rol_32(
casti_m256i( p0_index, y+5 ), 1 );
casti_m256i( p0_index, y+6 ) = mm256_rol_32(
casti_m256i( p0_index, y+6 ), 1 );
casti_m256i( p0_index, y+7 ) = mm256_rol_32(
casti_m256i( p0_index, y+7 ), 1 );
}
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 )
{
__m256i *px = (__m256i*)p + x;

px[0] = mm256_rol_32( px[0], 1 );
px[1] = mm256_rol_32( px[1], 1 );
px[2] = mm256_rol_32( px[2], 1 );
px[3] = mm256_rol_32( px[3], 1 );
px[4] = mm256_rol_32( px[4], 1 );
px[5] = mm256_rol_32( px[5], 1 );
px[6] = mm256_rol_32( px[6], 1 );
px[7] = mm256_rol_32( px[7], 1 );
}

#else

for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m128i); y += 8)
{
casti_m128i( p0_index, y ) = mm128_rol_32(
casti_m128i( p0_index, y ), 1 );
casti_m128i( p0_index, y+1 ) = mm128_rol_32(
casti_m128i( p0_index, y+1 ), 1 );
casti_m128i( p0_index, y+2 ) = mm128_rol_32(
casti_m128i( p0_index, y+2 ), 1 );
casti_m128i( p0_index, y+3 ) = mm128_rol_32(
casti_m128i( p0_index, y+3 ), 1 );
casti_m128i( p0_index, y+4 ) = mm128_rol_32(
casti_m128i( p0_index, y+4 ), 1 );
casti_m128i( p0_index, y+5 ) = mm128_rol_32(
casti_m128i( p0_index, y+5 ), 1 );
casti_m128i( p0_index, y+6 ) = mm128_rol_32(
casti_m128i( p0_index, y+6 ), 1 );
casti_m128i( p0_index, y+7 ) = mm128_rol_32(
casti_m128i( p0_index, y+7 ), 1 );
}

for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 )
{
__m128i *px = (__m128i*)p0_index + x;

px[0] = mm128_rol_32( px[0], 1 );
px[1] = mm128_rol_32( px[1], 1 );
px[2] = mm128_rol_32( px[2], 1 );
px[3] = mm128_rol_32( px[3], 1 );
px[4] = mm128_rol_32( px[4], 1 );
px[5] = mm128_rol_32( px[5], 1 );
px[6] = mm128_rol_32( px[6], 1 );
px[7] = mm128_rol_32( px[7], 1 );
}

#endif
/*
for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x )
p[x] = ( p[x] << 1 ) | ( p[x] >> 31 );
*/
}
#endif

}
static inline uint32_t rotl32( uint32_t a, size_t r )
{
return ( a << r ) | ( a >> (32-r) );
}

sha3( &input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE );

uint32_t* p1_32 = (uint32_t*)p1;
uint32_t* blob_bytes_32 = (uint32_t*)blob_bytes;
uint32_t value_accumulator = 0x811c9dc5;
// Vectorized and targeted version of fnv1a
#if defined (__AVX2__)

#define MULXOR \
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );

#elif defined(__SSE41__)

#define MULXOR \
casti_m128i( hash, 0 ) = _mm_mullo_epi32( _mm_xor_si128( \
casti_m128i( hash, 0 ), casti_m128i( blob_off, 0 ) ), k ); \
casti_m128i( hash, 1 ) = _mm_mullo_epi32( _mm_xor_si128( \
casti_m128i( hash, 1 ), casti_m128i( blob_off, 1 ) ), k );

#else

#define MULXOR \
for ( size_t j = 0; j < VH_HASH_OUT_SIZE / sizeof(uint32_t); j++ ) \
hash[j] = fnv1a( hash[j], blob_off[j] ); \

#endif

#define UPDATE_ACCUMULATOR \
accumulator = fnv1a( accumulator, blob_off[0] ); \
accumulator = fnv1a( accumulator, blob_off[1] ); \
accumulator = fnv1a( accumulator, blob_off[2] ); \
accumulator = fnv1a( accumulator, blob_off[3] ); \
accumulator = fnv1a( accumulator, blob_off[4] ); \
accumulator = fnv1a( accumulator, blob_off[5] ); \
accumulator = fnv1a( accumulator, blob_off[6] ); \
accumulator = fnv1a( accumulator, blob_off[7] )


// first pass no rotate
#define ROUND_0 \
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
{ \
const uint32_t *blob_off = blob + \
( ( fnv1a( subset[i], accumulator ) % mdiv ) \
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
UPDATE_ACCUMULATOR; \
MULXOR; \
}

// subsequent passes rotate by r on demand, no need for mass rotate
#define ROUND_r( r ) \
for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \
{ \
const uint32_t *blob_off = blob + \
( ( fnv1a( rotl32( subset[i], r ), accumulator ) % mdiv ) \
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \
UPDATE_ACCUMULATOR; \
MULXOR; \
}

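The ROUND_r comment above is the key optimization: the old rotate_indexes pass rotated every element of the subset array by 1 bit per pass, while the new code rotates each index only where it is consumed, reading rotl32( subset[i], r ) from the unmodified array. A scalar sketch of the equivalence (illustrative only):

// r successive rotates by 1 equal a single rotate by r, so after pass r
// the value the old code would have stored is exactly rotl32( x, r ),
// computed on the fly with no extra memory traffic. Valid for 1 <= r <= 31.
#include <stdint.h>

static inline uint32_t rotl32_by( uint32_t a, unsigned r )
{  return ( a << r ) | ( a >> ( 32 - r ) );  }
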
void verthash_hash( const void *blob_bytes, const size_t blob_size,
const void *input, void *output )
{
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64)));
uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64)));
const uint32_t *blob = (const uint32_t*)blob_bytes;
uint32_t accumulator = 0x811c9dc5;
const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
/ VH_BYTE_ALIGNMENT ) + 1;
#if defined (__AVX2__)
@@ -211,40 +252,15 @@ void verthash_hash( const unsigned char* blob_bytes,
#elif defined(__SSE41__)
const __m128i k = _mm_set1_epi32( 0x1000193 );
#endif

sha3( input, VH_HEADER_SIZE, hash, VH_HASH_OUT_SIZE );
verthash_sha3_512_final_8( subset, ( (uint64_t*)input )[ 9 ] );

for ( size_t i = 0; i < VH_N_INDEXES; i++ )
{
const uint32_t offset =
( fnv1a( seek_indexes[i], value_accumulator) % mdiv )
* ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) );
const uint32_t *blob_off = blob_bytes_32 + offset;
ROUND_0;
for ( size_t r = 1; r < VH_N_ROT; ++r )
ROUND_r( r );

// update value accumulator for next seek index
value_accumulator = fnv1a( value_accumulator, blob_off[0] );
value_accumulator = fnv1a( value_accumulator, blob_off[1] );
value_accumulator = fnv1a( value_accumulator, blob_off[2] );
value_accumulator = fnv1a( value_accumulator, blob_off[3] );
value_accumulator = fnv1a( value_accumulator, blob_off[4] );
value_accumulator = fnv1a( value_accumulator, blob_off[5] );
value_accumulator = fnv1a( value_accumulator, blob_off[6] );
value_accumulator = fnv1a( value_accumulator, blob_off[7] );

#if defined (__AVX2__)
*(__m256i*)p1_32 = _mm256_mullo_epi32( _mm256_xor_si256(
*(__m256i*)p1_32, *(__m256i*)blob_off ), k );
#elif defined(__SSE41__)
casti_m128i( p1_32, 0 ) = _mm_mullo_epi32( _mm_xor_si128(
casti_m128i( p1_32, 0 ), casti_m128i( blob_off, 0 ) ), k );
casti_m128i( p1_32, 1 ) = _mm_mullo_epi32( _mm_xor_si128(
casti_m128i( p1_32, 1 ), casti_m128i( blob_off, 1 ) ), k );
#else
for ( size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++ )
p1_32[i2] = fnv1a( p1_32[i2], blob_off[i2] );
#endif

}

memcpy( output, p1, VH_HASH_OUT_SIZE );
memcpy( output, hash, VH_HASH_OUT_SIZE );
}

//-----------------------------------------------------------------------------
@@ -47,10 +47,8 @@ void verthash_info_free(verthash_info_t* info);
//! Generate verthash data file and save it to specified location.
int verthash_generate_data_file(const char* output_file_name);

void verthash_hash(const unsigned char* blob_bytes,
const size_t blob_size,
const unsigned char(*input)[VH_HEADER_SIZE],
unsigned char(*output)[VH_HASH_OUT_SIZE]);
void verthash_hash( const void *blob_bytes, const size_t blob_size,
const void *input, void *output );

void verthash_sha3_512_prehash_72( const void *input );
void verthash_sha3_512_final_8( void *hash, const uint64_t nonce );
@@ -62,7 +62,7 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
__m256i vhashB[ 10 ] __attribute__ ((aligned (64)));

sha3_4way_ctx_t ctx;
__m256i vnonce = _mm256_set1_epi64x( nonce );
const __m256i vnonce = _mm256_set1_epi64x( nonce );

memcpy( &ctx, &sha3_mid_ctxA, sizeof ctx );
sha3_4way_update( &ctx, &vnonce, 8 );
@@ -88,14 +88,13 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
#endif
}


int scanhash_verthash( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 1;
uint32_t n = first_nonce;
@@ -109,8 +108,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
{
edata[19] = n;
verthash_hash( verthashInfo.data, verthashInfo.dataSize,
(const unsigned char (*)[80]) edata,
(unsigned char (*)[32]) hash );
edata, hash );
if ( valid_hash( hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n );
@@ -123,17 +121,16 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
return 0;
}

const char *default_verthash_data_file = "verthash.dat";
static const char *default_verthash_data_file = "verthash.dat";

bool register_verthash_algo( algo_gate_t* gate )
{

opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = AVX2_OPT;

char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file;
const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file;

int vhLoadResult = verthash_info_init( &verthashInfo, verthash_data_file );
if (vhLoadResult == 0) // No Error
@@ -160,7 +157,8 @@ bool register_verthash_algo( algo_gate_t* gate )
// Handle Verthash error codes
if ( vhLoadResult == 1 )
{
applog( LOG_ERR, "Verthash data file not found: %s", verthash_data_file );
applog( LOG_ERR, "Verthash data file not found: %s",
verthash_data_file );
if ( !opt_data_file )
applog( LOG_NOTICE, "Add '--verify' to create verthash.dat");
}
@@ -373,6 +373,45 @@ static inline void salsa20(salsa20_blk_t *restrict B,
#define INTEGERIFY (uint32_t)X.d[0]
#endif

// AVX512 ternary logic optimization
#if defined(__AVX512VL__)

#define XOR_X_XOR_X( in1, in2 ) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 );

#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 );

#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \
SALSA20(out)

#else

#define XOR_X_XOR_X( in1, in2 ) \
XOR_X( in1 ) \
XOR_X( in2 )

#define XOR_X_2_XOR_X( in1, in2, in3 ) \
XOR_X_2( in1, in2 ) \
XOR_X( in3 )

#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
XOR_X(in1) \
XOR_X(in2) \
SALSA20( out )

#endif

/**
* Apply the Salsa20 core to the block provided in X ^ in.
*/
@@ -406,11 +445,15 @@ static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1,
{
DECL_X

XOR_X_2(Bin1[1], Bin2[1])
XOR_X(Bin1[0])
XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] )
// XOR_X_2(Bin1[1], Bin2[1])
// XOR_X(Bin1[0])
SALSA20_XOR_MEM(Bin2[0], Bout[0])
XOR_X(Bin1[1])
SALSA20_XOR_MEM(Bin2[1], Bout[1])

// Factor out the XOR from salsa20 to do a xor3
XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] )
// XOR_X(Bin1[1])
// SALSA20_XOR_MEM(Bin2[1], Bout[1])

return INTEGERIFY;
}
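
The commented-out pairs document the transformation: the XOR that used to happen as a separate step before the Salsa20 core is hoisted so that, on AVX512VL, it fuses with the preceding XOR into a single 3 input xor per vector (ternary logic immediate 0x96). Roughly, in a scalar model (illustrative only; the real SALSA20/XOR_X macros operate on the X0..X3 registers):

// Before: two dependent xors, then the core.
//    x ^= in1;  x ^= in2;  salsa20( x );
// After: one 3 input xor feeding the core.
#include <stdint.h>

static void xor3_block( uint32_t *x, const uint32_t *in1,
                        const uint32_t *in2, int n )
{
   for ( int i = 0; i < n; i++ )
      x[i] = x[i] ^ in1[i] ^ in2[i];   // one vpternlogd 0x96 per vector
}
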
@@ -745,13 +788,15 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
i = 0;
r--;
do {
XOR_X(Bin1[i])
XOR_X(Bin2[i])
XOR_X_XOR_X( Bin1[i], Bin2[i] )
// XOR_X(Bin1[i])
// XOR_X(Bin2[i])
PWXFORM
WRITE_X(Bout[i])

XOR_X(Bin1[i + 1])
XOR_X(Bin2[i + 1])
XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] )
// XOR_X(Bin1[i + 1])
// XOR_X(Bin2[i + 1])
PWXFORM

if (unlikely(i >= r))
@@ -35,7 +35,6 @@
#include "miner.h"
#include "simd-utils.h"
#include "algo/sha/sph_sha2.h"
#include <openssl/sha.h>

#ifdef __cplusplus
extern "C" {
@@ -63,7 +63,7 @@ mv cpuminer cpuminer-avx
# Westmere SSE4.2 AES
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl
CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe

configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.16.2.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.16.2'
PACKAGE_STRING='cpuminer-opt 3.16.2'
PACKAGE_VERSION='3.17.1'
PACKAGE_STRING='cpuminer-opt 3.17.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.16.2 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems.

Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.16.2:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";;
esac
cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.16.2
cpuminer-opt configure 3.17.1
generated by GNU Autoconf 2.69

Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

It was created by cpuminer-opt $as_me 3.16.2, which was
It was created by cpuminer-opt $as_me 3.17.1, which was
generated by GNU Autoconf 2.69. Invocation command line was

$ $0 $@
@@ -2993,7 +2993,7 @@ fi

# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.16.2'
VERSION='3.17.1'


cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.16.2, which was
This file was extended by cpuminer-opt $as_me 3.17.1, which was
generated by GNU Autoconf 2.69. Invocation command line was

CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.16.2
cpuminer-opt config.status 3.17.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.16.2])
AC_INIT([cpuminer-opt], [3.17.1])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

cpu-miner.c
@@ -447,8 +447,10 @@ static bool work_decode( const json_t *val, struct work *work )

if ( !allow_mininginfo )
net_diff = algo_gate.calc_network_diff( work );
else
net_diff = hash_to_diff( work->target );

work->targetdiff = hash_to_diff( work->target );
work->targetdiff = net_diff;
stratum_diff = last_targetdiff = work->targetdiff;
work->sharediff = 0;
algo_gate.decode_extra_data( work, &net_blocks );
@@ -482,13 +484,17 @@ static bool get_mininginfo( CURL *curl, struct work *work )
// "networkhashps": 56475980
if ( res )
{
// net_diff is a global that is set from the work hash target by
// both getwork and GBT. Don't overwrite it; define a local to shadow
// the global.
double net_diff = 0.;
json_t *key = json_object_get( res, "difficulty" );
if ( key )
{
if ( json_is_object( key ) )
key = json_object_get( key, "proof-of-work" );
if ( json_is_real( key ) )
net_diff = work->targetdiff = json_real_value( key );
net_diff = json_real_value( key );
}

key = json_object_get( res, "networkhashps" );
@@ -908,7 +914,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
}
for ( i = 0; i < ARRAY_SIZE( work->target ); i++ )
work->target[7 - i] = be32dec( target + i );

net_diff = work->targetdiff = hash_to_diff( work->target );

tmp = json_object_get( val, "workid" );
if ( tmp )
{
@@ -1047,6 +1054,8 @@ void report_summary_log( bool force )
applog( LOG_NOTICE,"CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz",
tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6 );
if ( curr_temp > hi_temp ) hi_temp = curr_temp;
if ( ( opt_max_temp > 0.0 ) && ( curr_temp > opt_max_temp ) )
restart_threads();
prev_temp = curr_temp;
}
}
@@ -1145,7 +1154,7 @@ void report_summary_log( bool force )
if ( mismatch )
{
if ( mismatch != 1 )
applog(LOG_WARNING,"Share count mismatch: %d, stats may be incorrect", mismatch );
applog(LOG_WARNING,"Share count mismatch: %d, stats may be inaccurate", mismatch );
else
applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" );
}
@@ -1165,7 +1174,8 @@ static int share_result( int result, struct work *work,
char bres[48];
bool solved = false;
bool stale = false;
char *acol = NULL, *bcol = NULL, *scol = NULL, *rcol = NULL;
char *acol, *bcol, *scol, *rcol;
acol = bcol = scol = rcol = "\0";

pthread_mutex_lock( &stats_lock );

@@ -1207,7 +1217,7 @@ static int share_result( int result, struct work *work,
sprintf( sres, "S%d", stale_share_count );
sprintf( rres, "R%d", rejected_share_count );
if unlikely( ( my_stats.net_diff > 0. )
&& ( my_stats.share_diff >= net_diff ) )
&& ( my_stats.share_diff >= my_stats.net_diff ) )
{
solved = true;
solved_block_count++;
@@ -2085,10 +2095,10 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
sctx->block_height, net_diff, g_work->job_id );
else if ( !opt_quiet )
{
unsigned char *xnonce2str = abin2hex( g_work->xnonce2,
g_work->xnonce2_len );
applog( LOG_INFO, "Extranonce2 %s, Block %d, Net Diff %.5g",
xnonce2str, sctx->block_height, net_diff );
unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
g_work->xnonce2_len );
applog( LOG_INFO, "Extranonce2 %s, Block %d, Job %s",
xnonce2str, sctx->block_height, g_work->job_id );
free( xnonce2str );
}

@@ -2171,11 +2181,11 @@ static void *miner_thread( void *userdata )
/* Set worker threads to nice 19 and then preferentially to SCHED_IDLE
* and if that fails, then SCHED_BATCH. No need for this to be an
* error if it fails */
if (!opt_benchmark && opt_priority == 0)
if ( !opt_priority )
{
setpriority(PRIO_PROCESS, 0, 19);
if ( !thr_id && !opt_quiet )
applog(LOG_INFO, "Miner thread priority %d (nice 19)", opt_priority );
if ( !thr_id && opt_debug )
applog(LOG_INFO, "Default miner thread priority %d (nice 19)", opt_priority );
drop_policy();
}
else
@@ -2192,9 +2202,12 @@ static void *miner_thread( void *userdata )
case 4: prio = -10; break;
case 5: prio = -15;
}
if ( !( thr_id || opt_quiet ) )
applog( LOG_INFO, "Miner thread priority %d (nice %d)",
if ( !thr_id )
{
applog( LOG_INFO, "User set miner thread priority %d (nice %d)",
opt_priority, prio );
applog( LOG_WARNING, "High priority mining threads may cause system instability");
}
#endif
setpriority(PRIO_PROCESS, 0, prio);
if ( opt_priority == 0 )
@@ -2439,7 +2452,7 @@ static void *miner_thread( void *userdata )
char hr_units[2] = {0,0};
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
#if ((defined(_WIN64) || defined(__WINDOWS__)) || defined(_WIN32))
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
#else
float lo_freq = 0., hi_freq = 0.;
@@ -2739,10 +2752,10 @@ static void *stratum_thread(void *userdata )
stratum.url = strdup( rpc_url );
applog(LOG_BLUE, "Connection changed to %s", short_url);
}
else // if ( !opt_quiet )
else
applog(LOG_WARNING, "Stratum connection reset");
// reset stats queue as well
s_get_ptr = s_put_ptr = 0;
if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0;
}

while ( !stratum.curl )
@@ -2789,13 +2802,15 @@ static void *stratum_thread(void *userdata )
else
{
applog(LOG_WARNING, "Stratum connection interrupted");
stratum_disconnect( &stratum );
// stratum_disconnect( &stratum );
stratum_need_reset = true;
}
}
else
{
applog(LOG_ERR, "Stratum connection timeout");
stratum_disconnect( &stratum );
stratum_need_reset = true;
// stratum_disconnect( &stratum );
}

} // loop
@@ -2843,7 +2858,6 @@ static bool cpu_capability( bool display_only )
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
bool use_aes;
bool use_sse2;
bool use_sse42;
bool use_avx2;
bool use_avx512;
bool use_sha;
@@ -2917,13 +2931,14 @@ static bool cpu_capability( bool display_only )
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
if ( algo_has_avx512 ) printf( " AVX512" );
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
if ( algo_has_avx512 ) printf( " AVX512" );
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ||
algo_has_vaes256 ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
}
printf("\n");

@@ -2959,13 +2974,12 @@ static bool cpu_capability( bool display_only )
// Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes
&& ( use_avx512 || algo_has_vaes256 );
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
use_vaes = cpu_has_vaes && sw_has_vaes && ( algo_has_vaes
|| algo_has_vaes256 );
use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 ||
use_sha || use_vaes );

// Display best options
@@ -2975,7 +2989,6 @@ static bool cpu_capability( bool display_only )
{
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_sse42 ) printf( " SSE4.2" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );
@@ -3394,8 +3407,6 @@ void parse_arg(int key, char *arg )
v = atoi(arg);
if (v < 0 || v > 5) /* sanity check */
show_usage_and_exit(1);
// option is deprecated, show warning
applog( LOG_WARNING, "High priority mining threads may cause system instability");
opt_priority = v;
break;
case 'N': // N parameter for various scrypt algos

miner.h
@@ -307,6 +307,7 @@ extern json_t *json_rpc_call( CURL *curl, const char *url, const char *userpass,
extern void cbin2hex(char *out, const char *in, size_t len);
void bin2hex( char *s, const unsigned char *p, size_t len );
char *abin2hex( const unsigned char *p, size_t len );
char *bebin2hex( const unsigned char *p, size_t len );
bool hex2bin( unsigned char *p, const char *hexstr, size_t len );
bool jobj_binary( const json_t *obj, const char *key, void *buf,
size_t buflen );
@@ -237,6 +237,25 @@ static inline void memset_128( __m128i *dst, const __m128i a, const int n )
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }

#if defined(__AVX512VL__)

// a ^ b ^ c
#define mm128_xor3( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x96 )

// a ^ ( b & c )
#define mm128_xorand( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x78 )

#else

#define mm128_xor3( a, b, c ) \
_mm_xor_si128( a, _mm_xor_si128( b, c ) )

#define mm128_xorand( a, b, c ) \
_mm_xor_si128( a, _mm_and_si128( b, c ) )

#endif

//
// Bit rotations
@@ -136,9 +136,84 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_add4_8( a, b, c, d ) \
_mm256_add_epi8( _mm256_add_epi8( a, b ), _mm256_add_epi8( c, d ) )

#if defined(__AVX512VL__)

// AVX512 has ternary logic that supports any 3 input boolean expression.

// a ^ b ^ c
#define mm256_xor3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x96 )

// legacy convenience only
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( a, mm256_xor3( b, c, d ) )

// a & b & c
#define mm256_and3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x80 )

// a | b | c
#define mm256_or3( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xfe )

// a ^ ( b & c )
#define mm256_xorand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x78 )

// a & ( b ^ c )
#define mm256_andxor( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x60 )

// a ^ ( b | c )
#define mm256_xoror( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x1e )

// a ^ ( ~b & c )
#define mm256_xorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xd2 )

// a | ( b & c )
#define mm256_orand( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0xf8 )

// ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) \
_mm256_ternarylogic_epi64( a, b, b, 0x81 )

#else

#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )

#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )

#define mm256_and3( a, b, c ) \
_mm256_and_si256( a, _mm256_and_si256( b, c ) )

#define mm256_or3( a, b, c ) \
_mm256_or_si256( a, _mm256_or_si256( b, c ) )

#define mm256_xorand( a, b, c ) \
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )

#define mm256_andxor( a, b, c ) \
_mm256_and_si256( a, _mm256_xor_si256( b, c ))

#define mm256_xoror( a, b, c ) \
_mm256_xor_si256( a, _mm256_or_si256( b, c ) )

#define mm256_xorandnot( a, b, c ) \
_mm256_xor_si256( a, _mm256_andnot_si256( b, c ) )

#define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) )

#define mm256_xnor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) )

#endif

//
// Bit rotations.
//
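
Since both branches above must compute the same results, a quick self-check comparing the AVX512VL ternary-logic form against the plain AVX2 fallback is cheap insurance when porting. A minimal sketch (illustrative only; requires AVX512F+VL, e.g. compile with -mavx512vl):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

// mm256_xor3 via ternary logic must equal the two instruction fallback.
static void check_xor3( __m256i a, __m256i b, __m256i c )
{
   __m256i t = _mm256_ternarylogic_epi64( a, b, c, 0x96 );
   __m256i f = _mm256_xor_si256( a, _mm256_xor_si256( b, c ) );
   uint64_t tb[4], fb[4];
   _mm256_storeu_si256( (__m256i*)tb, t );
   _mm256_storeu_si256( (__m256i*)fb, f );
   assert( memcmp( tb, fb, sizeof tb ) == 0 );
}
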
@@ -200,15 +275,17 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )

//
// Rotate elements across all lanes.
//
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )

// Rotate 256 bit vector by one 64 bit element
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
#if defined(__AVX512VL__)

#if defined(__AVX512F__) && defined(__AVX512VL__)
static inline __m256i mm256_swap_128( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 2 ); }

static inline __m256i mm256_ror_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 1 ); }

static inline __m256i mm256_rol_1x64( const __m256i v )
{ return _mm256_alignr_epi64( v, v, 3 ); }

static inline __m256i mm256_ror_1x32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); }
@@ -218,6 +295,13 @@ static inline __m256i mm256_rol_1x32( const __m256i v )

#else // AVX2

// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )

// Rotate 256 bit vector by one 64 bit element
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )

// Rotate 256 bit vector by one 32 bit element.
#define mm256_ror_1x32( v ) \
_mm256_permutevar8x32_epi32( v, \
@@ -229,6 +313,7 @@ static inline __m256i mm256_rol_1x32( const __m256i v )
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 )


#endif // AVX512 else AVX2

//
@@ -61,7 +61,7 @@
//
// Additionally, permutations using smaller vectors can be more efficient
// if the permutation doesn't cross lane boundaries, typically 128 bits,
// and the smnaller vector can use an imm comtrol.
// and the smaller vector can use an imm control.
//
// If the permutation doesn't cross lane boundaries a shuffle instruction
// can be used with imm control instead of permute.
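
For example, rotating the four 32 bit elements within each 128 bit lane needs no cross-lane permute, so the imm-controlled shuffle is available, while rotating all eight elements of the 256 bit vector needs a vector-indexed permute. A sketch (function names are illustrative):

#include <immintrin.h>

// In-lane: rotate 32 bit elements right by one within each 128 bit lane,
// imm control only (AVX2).
static inline __m256i ror128_1x32( const __m256i v )
{  return _mm256_shuffle_epi32( v, 0x39 );  }

// Cross-lane: rotate all eight 32 bit elements right by one; requires a
// permute with a vector index (AVX2).
static inline __m256i ror256_1x32( const __m256i v )
{
   const __m256i idx = _mm256_setr_epi32( 1, 2, 3, 4, 5, 6, 7, 0 );
   return _mm256_permutevar8x32_epi32( v, idx );
}
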
@@ -107,7 +107,7 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
return v.m512i;
}

// Equivalent of set1, broadcast lo element all elements.
// Equivalent of set1, broadcast lo element to all elements.
static inline __m512i m512_const1_256( const __m256i v )
{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }

@@ -166,7 +166,9 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
// Basic operations without SIMD equivalent

// ~x
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
// #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
static inline __m512i mm512_not( const __m512i x )
{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); }

// -x
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
@@ -221,11 +223,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_add4_8( a, b, c, d ) \
_mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) )

#define mm512_xor4( a, b, c, d ) \
_mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) )


//
// Ternary logic uses an 8 bit truth table to define any 3 input logical
// operation using any combination of AND, OR, XOR, NOT.

// a ^ b ^ c
#define mm512_xor3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x96 )

// legacy convenience only
#define mm512_xor4( a, b, c, d ) \
_mm512_xor_si512( a, mm512_xor3( b, c, d ) )

// a & b & c
#define mm512_and3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x80 )

// a | b | c
#define mm512_or3( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xfe )

// a ^ ( b & c )
#define mm512_xorand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x78 )

// a & ( b ^ c )
#define mm512_andxor( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x60 )

// a ^ ( b | c )
#define mm512_xoror( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0x1e )

// a ^ ( ~b & c ) [ xor( a, andnot( b, c ) ) ]
#define mm512_xorandnot( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )

// a | ( b & c )
#define mm512_orand( a, b, c ) \
_mm512_ternarylogic_epi64( a, b, c, 0xf8 )

// Some 2 input operations that don't have their own instruction mnemonic.

// ~( a | b )
#define mm512_nor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x01 )

// ~( a ^ b ), same as (~a) ^ b
#define mm512_xnor( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x81 )

// ~( a & b )
#define mm512_nand( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0x3f )


// Bit rotations.

// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
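
The two-input forms work by passing b twice: only the truth-table rows where the last two inputs agree are ever selected, so the immediate just has to be right on that subspace. A worked derivation, assuming Intel's stated bit-index convention (first operand is the high bit of the index, as in the tl_imm8 sketch earlier); note that on this convention ~(a & b) is 0x3f, which is used for mm512_nand above:

// Index = a<<2 | b<<1 | c. With the call pattern ( a, b, b ) only
// indices 0, 3, 4 and 7 are reachable:
//   idx0 (0,0,0)  idx3 (0,1,1)  idx4 (1,0,0)  idx7 (1,1,1)
// nor : 1 only at idx0                    -> 0x01
// xnor: 1 at idx0 and idx7                -> 0x81
// nand: 1 at idx0, idx3, idx4; 0 at idx7  -> 0x3f  ( full table ~(a & b) )
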
util.c
@@ -795,6 +795,15 @@ char *abin2hex(const unsigned char *p, size_t len)
return s;
}

char *bebin2hex(const unsigned char *p, size_t len)
{
char *s = (char*) malloc((len * 2) + 1);
if (!s) return NULL;
for ( size_t i = 0, j = len - 1; i < len; i++, j-- )
sprintf( s + ( i*2 ), "%02x", (unsigned int) p[ j ] );
return s;
}

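bebin2hex mirrors abin2hex but emits the bytes in reverse order, which is what the Extranonce2 log change in v3.17.0 uses to print the value most significant byte first. A quick illustrative check:

#include <stdio.h>
#include <stdlib.h>

extern char *bebin2hex( const unsigned char *p, size_t len );

int main(void)
{
   unsigned char x2[3] = { 0x01, 0x02, 0x03 };
   char *s = bebin2hex( x2, sizeof x2 );  // "030201"; abin2hex gives "010203"
   if ( s ) { puts( s ); free( s ); }
   return 0;
}
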
bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
{
char hex_byte[3];
@@ -1789,10 +1798,14 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p
if ( !stratum_handle_method( sctx, sret ) )
applog( LOG_WARNING, "Stratum answer id is not correct!" );
}
res_val = json_object_get( extra, "result" );
if (opt_debug && (!res_val || json_is_false(res_val)))
applog(LOG_DEBUG, "Method extranonce.subscribe is not supported");
json_decref( extra );
else
{
res_val = json_object_get( extra, "result" );
if ( opt_debug && ( !res_val || json_is_false( res_val ) ) )
applog( LOG_DEBUG,
"Method extranonce.subscribe is not supported" );
}
json_decref( extra );
}
free(sret);
}
@@ -107,7 +107,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
# Westmere SSE4.2 AES
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS
CFLAGS="-O3 -march=westmere -maes -Wall" ./configure $CONFIGURE_ARGS
#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe