Jay D Dee
2023-09-21 12:34:06 -04:00
parent d6b5750362
commit be88afc349
113 changed files with 3349 additions and 2920 deletions


@@ -163,8 +163,6 @@ cpuminer_SOURCES = \
algo/sha/sph_sha2big.c \
algo/sha/sha256-hash-4way.c \
algo/sha/sha512-hash-4way.c \
algo/sha/sha256-hash-opt.c \
algo/sha/sha256-hash-2way-ni.c \
algo/sha/hmac-sha256-hash.c \
algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \
@@ -172,7 +170,6 @@ cpuminer_SOURCES = \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \
algo/sha/sha256q-4way.c \
algo/sha/sha256q.c \
algo/sha/sha512256d-4way.c \
@@ -294,10 +291,10 @@ disable_flags =
if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
if ARCH_x86
cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S asm/aesb-x86.S
cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S
endif
if ARCH_x86_64
cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S asm/aesb-x64.S
cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S
endif
if ARCH_ARM
cpuminer_SOURCES += asm/sha2-arm.S asm/scrypt-arm.S


@@ -65,6 +65,11 @@ If not what makes it happen or not happen?
Change Log
----------
v3.23.2
sha256dt, sha256t & sha256d: +10% on CPUs with SHA extensions, small improvement with AVX2.
Other small improvements and code cleanup.
v3.23.1
#349: Fix sha256t low difficulty shares and low effective hash rate.
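For context on the "+10% with SHA extensions" line above: the gain applies on CPUs that advertise the SHA extensions. A minimal detection sketch (illustrative only — the miner has its own CPUID feature-detection code):

#include <cpuid.h>
#include <stdbool.h>

// SHA extensions are reported in CPUID leaf 7, sub-leaf 0, EBX bit 29.
static bool cpu_has_sha( void )
{
   unsigned int eax, ebx, ecx, edx;
   if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) )
      return false;
   return ( ebx >> 29 ) & 1;
}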


@@ -248,7 +248,7 @@ int null_hash()
return 0;
};
void init_algo_gate( algo_gate_t* gate )
static void init_algo_gate( algo_gate_t* gate )
{
gate->miner_thread_init = (void*)&return_true;
gate->scanhash = (void*)&scanhash_generic;
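The gate initialized here is a table of function pointers holding safe generic defaults; each algorithm's register function then overwrites only the entries it needs, which is why init_algo_gate can become static. A sketch of that pattern ("myalgo" and its functions are hypothetical, but the shape matches the register_*_algo functions later in this diff):

// Called once at startup via the algo gate dispatcher.
bool register_myalgo_algo( algo_gate_t* gate )
{
   gate->scanhash = (void*)&scanhash_myalgo;   // replaces scanhash_generic
   gate->hash     = (void*)&myalgo_hash;       // algorithm-specific hash
   return true;
}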


@@ -269,7 +269,7 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
// OpenSSL sha256 deprecated
void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
bool std_le_work_decode( struct work *work );
bool std_be_work_decode( struct work *work );


@@ -77,7 +77,7 @@ bool register_argon2_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_argon2;
gate->hash = (void*)&argon2hash;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 65536.0;
return true;
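gen_merkle_root builds the block's merkle root from the transaction hashes; the change here (repeated in the groestl and keccak gates below) swaps the deprecated OpenSSL-backed helper for the internal SHA-256 one. As a rough sketch of the computation such a helper performs — pairwise double SHA-256 up the tree; illustrative, not this codebase's implementation, and sha256d() stands for any double-SHA-256 helper:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

void sha256d( uint8_t *hash, const uint8_t *data, size_t len );   // assumed

// tree[] holds n 32-byte leaf hashes and is reduced level by level;
// tree[0] ends up as the merkle root.
static void merkle_root_sketch( uint8_t tree[][32], int n )
{
   uint8_t pair[64];
   for ( ; n > 1; n = (n + 1) / 2 )
      for ( int i = 0; i < n; i += 2 )
      {
         memcpy( pair,      tree[i],                   32 );
         memcpy( pair + 32, tree[ i+1 < n ? i+1 : i ], 32 );  // odd leaf duplicated
         sha256d( tree[i/2], pair, 64 );
      }
}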


@@ -15,7 +15,7 @@
#include <string.h>
#include <stdio.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#include "sph-blake2s.h"
static const uint32_t blake2s_IV[8] =


@@ -42,7 +42,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for BLAKE-224.


@@ -31,7 +31,7 @@
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#include "sph_blake2b.h"
// Little-endian byte access.


@@ -41,8 +41,6 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#define SPH_SIZE_bmw256 256
@@ -57,7 +55,7 @@ typedef struct {
__m128i buf[64];
__m128i H[16];
size_t ptr;
sph_u32 bit_count; // assume bit_count fits in 32 bits
uint32_t bit_count; // assume bit_count fits in 32 bits
} bmw_4way_small_context;
typedef bmw_4way_small_context bmw256_4way_context;
@@ -144,7 +142,7 @@ typedef struct {
__m256i buf[16];
__m256i H[16];
size_t ptr;
sph_u64 bit_count;
uint64_t bit_count;
} bmw_4way_big_context __attribute__((aligned(128)));
typedef bmw_4way_big_context bmw512_4way_context;


@@ -109,7 +109,7 @@ static const uint32_t IV256[] = {
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \
_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
H[ ( (j)+7 ) & 0xF ] )
@@ -485,7 +485,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
sc->bit_count += (sph_u32)len << 3;
sc->bit_count += (uint32_t)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;


@@ -45,15 +45,15 @@ extern "C"{
#define LPAR (
static const sph_u64 IV512[] = {
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
static const uint64_t IV512[] = {
0x8081828384858687, 0x88898A8B8C8D8E8F,
0x9091929394959697, 0x98999A9B9C9D9E9F,
0xA0A1A2A3A4A5A6A7, 0xA8A9AAABACADAEAF,
0xB0B1B2B3B4B5B6B7, 0xB8B9BABBBCBDBEBF,
0xC0C1C2C3C4C5C6C7, 0xC8C9CACBCCCDCECF,
0xD0D1D2D3D4D5D6D7, 0xD8D9DADBDCDDDEDF,
0xE0E1E2E3E4E5E6E7, 0xE8E9EAEBECEDEEEF,
0xF0F1F2F3F4F5F6F7, 0xF8F9FAFBFCFDFEFF
};
#if defined(__SSE2__)
@@ -894,7 +894,7 @@ static const __m256i final_b[16] =
};
static void
bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv )
{
sc->H[ 0] = _mm256_set1_epi64x( 0x8081828384858687 );
sc->H[ 1] = _mm256_set1_epi64x( 0x88898A8B8C8D8E8F );
@@ -926,7 +926,7 @@ bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len )
size_t ptr;
const int buf_size = 128; // bytes of one lane, compatible with len
sc->bit_count += (sph_u64)len << 3;
sc->bit_count += (uint64_t)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
@@ -1377,7 +1377,7 @@ static const __m512i final_b8[16] =
void bmw512_8way_init( bmw512_8way_context *ctx )
//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
//bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv )
{
ctx->H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
ctx->H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );


@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for BMW-224.


@@ -9,7 +9,6 @@
#include <immintrin.h>
#endif
#include "cubehash_sse2.h"
#include "algo/sha/sha3-defs.h"
#include <stdbool.h>
#include <unistd.h>
#include <memory.h>


@@ -3,7 +3,7 @@
#include "compat.h"
#include <stdint.h>
#include "algo/sha/sha3-defs.h"
#include "compat/sha3-defs.h"
#define OPTIMIZE_SSE2


@@ -42,7 +42,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for CubeHash-224.


@@ -22,7 +22,7 @@
#endif
#include "algo/sha/sha3_common.h"
#include "compat/sha3_common.h"
#include <emmintrin.h>


@@ -73,7 +73,7 @@ extern "C"{
#endif
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
#include "compat/aes_helper.c"
#if SPH_ECHO_64


@@ -43,7 +43,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for ECHO-224.


@@ -20,7 +20,7 @@
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif
#include "algo/sha/sha3_common.h"
#include "compat/sha3_common.h"
#include "simd-utils.h"


@@ -2,7 +2,7 @@
#define SPH_FUGUE_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{


@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for GOST-256.


@@ -20,8 +20,8 @@
#define LENGTH (512)
#include "brg_endian.h"
#define NEED_UINT_64T
#include "algo/sha/brg_types.h"
//#define NEED_UINT_64T
#include "compat/brg_types.h"
/* some sizes (number of bytes) */
#define ROWS (8)


@@ -34,8 +34,7 @@ typedef crypto_uint64 u64;
//#define LENGTH (512)
#include "brg_endian.h"
#define NEED_UINT_64T
#include "algo/sha/brg_types.h"
#include "compat/brg_types.h"
#ifdef IACA_TRACE
#include IACA_MARKS


@@ -17,7 +17,7 @@ bool register_dmd_gr_algo( algo_gate_t *gate )
bool register_groestl_algo( algo_gate_t* gate )
{
register_dmd_gr_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
return true;
};


@@ -22,10 +22,6 @@
#define LENGTH (256)
//#include "brg_endian.h"
//#define NEED_UINT_64T
//#include "algo/sha/brg_types.h"
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)


@@ -4,7 +4,7 @@
#include <stdint.h>
#include <string.h>
#include "aes_ni/hash-groestl.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#if defined(__VAES__)
#include "groestl512-hash-4way.h"
#endif


@@ -40,7 +40,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if !defined(__AES__)
/**

File diff suppressed because it is too large.


@@ -36,44 +36,64 @@
#define HAMSI_4WAY_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if defined (__AVX2__)
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{
#endif
#define SPH_SIZE_hamsi512 512
// Hamsi-512 4x64
// Partial is only scalar but needs pointer ref for hamsi-helper
// deprecate partial_len
typedef struct {
typedef struct
{
__m256i h[8];
__m256i buf[1];
size_t partial_len;
sph_u32 count_high, count_low;
uint32_t count_high, count_low;
} hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
size_t len );
//#define hamsi512_4way hamsi512_4way_update
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
#define hamsi512_4x64_context hamsi512_4way_context
#define hamsi512_4x64_init hamsi512_4way_init
#define hamsi512_4x64_update hamsi512_4way_update
#define hamsi512_4x64_close hamsi512_4way_close
// Hamsi-512 8x32
typedef struct
{
__m256i h[16];
__m256i buf[2];
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_8x32_big_context;
typedef hamsi_8x32_big_context hamsi512_8x32_context;
void hamsi512_8x32_init( hamsi512_8x32_context *sc );
void hamsi512_8x32_update( hamsi512_8x32_context *sc, const void *data,
size_t len );
void hamsi512_8x32_close( hamsi512_8x32_context *sc, void *dst );
void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
size_t len );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Hamsi-512 8x64
typedef struct {
__m512i h[8];
__m512i buf[1];
size_t partial_len;
sph_u32 count_high, count_low;
uint32_t count_high, count_low;
} hamsi_8way_big_context;
typedef hamsi_8way_big_context hamsi512_8way_context;
void hamsi512_8way_init( hamsi512_8way_context *sc );
@@ -81,15 +101,29 @@ void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
size_t len );
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
#define hamsi512_8x64_context hamsi512_8way_context
#define hamsi512_8x64_init hamsi512_8way_init
#define hamsi512_8x64_update hamsi512_8way_update
#define hamsi512_8x64_close hamsi512_8way_close
#endif
#ifdef __cplusplus
}
#endif
#endif
// Hamsi-512 16x32
typedef struct
{
__m512i h[16];
__m512i buf[2];
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_16x32_big_context;
typedef hamsi_16x32_big_context hamsi512_16x32_context;
void hamsi512_16x32_init( hamsi512_16x32_context *sc );
void hamsi512_16x32_update( hamsi512_16x32_context *sc, const void *data,
size_t len );
void hamsi512_16way_close( hamsi512_16x32_context *sc, void *dst );
void hamsi512_16x32_full( hamsi512_16x32_context *sc, void *dst,
const void *data, size_t len );
#endif // AVX512
#endif
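The new 8x32 and 16x32 variants follow the same init/update/close shape as the existing 4x64 interface. A usage sketch under that assumption (buffer sizes follow from 80-byte inputs; the caller must supply data already interleaved into SIMD lanes):

// Hash four independent 80-byte inputs with the 4x64 interface above.
hamsi512_4way_context ctx;    // a.k.a. hamsi512_4x64_context
__m256i vdata[10];            // 80 bytes per lane, 4 lanes, 64-bit interleaved
__m256i vhash[8];             // 64-byte digest per lane
// ... fill vdata with the 4-way interleaved input ...
hamsi512_4way_init( &ctx );
hamsi512_4way_update( &ctx, vdata, 80 );
hamsi512_4way_close( &ctx, vhash );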


@@ -36,7 +36,7 @@
#define SPH_HAMSI_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{


@@ -48,7 +48,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
while ( len > 0 )
{
unsigned clen;
sph_u32 clow, clow2;
uint32_t clow, clow2;
clen = 128U - current;
if ( clen > len )
@@ -67,7 +67,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
current = 0;
}
clow = sc->count_low;
clow2 = SPH_T32(clow + clen);
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high ++;
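Dropping SPH_T32() here is safe because uint32_t arithmetic already wraps modulo 2^32, so the explicit truncation was a no-op; that same wrap is what makes the carry test above correct. The two 32-bit words act as one 64-bit bit counter — a minimal standalone illustration:

#include <stdint.h>
#include <stddef.h>

// Unsigned 32-bit addition wraps, so overflow shows up as the sum being
// smaller than an addend — the count_low/count_high idiom used above.
static void add_bits( uint32_t *lo, uint32_t *hi, size_t len )
{
   uint32_t lo2 = *lo + ( (uint32_t)len << 3 );   // bits processed
   if ( lo2 < *lo )
      (*hi)++;                                    // carry into the high word
   *lo = lo2;
}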


@@ -292,7 +292,9 @@ static const unsigned MP5[32] = {
2, 23, 16, 22, 4, 1, 25, 15
};
static const sph_u32 RK2[32] = {
#define SPH_C32(x) (x)
static const uint32_t RK2[32] = {
SPH_C32(0x452821E6), SPH_C32(0x38D01377),
SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
@@ -311,7 +313,7 @@ static const sph_u32 RK2[32] = {
SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5)
};
static const sph_u32 RK3[32] = {
static const uint32_t RK3[32] = {
SPH_C32(0x9C30D539), SPH_C32(0x2AF26013),
SPH_C32(0xC5D1B023), SPH_C32(0x286085F0),
SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF),
@@ -330,7 +332,7 @@ static const sph_u32 RK3[32] = {
SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C)
};
static const sph_u32 RK4[32] = {
static const uint32_t RK4[32] = {
SPH_C32(0x7A325381), SPH_C32(0x28958677),
SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF),
SPH_C32(0xC4BFE81B), SPH_C32(0x66282193),
@@ -349,7 +351,7 @@ static const sph_u32 RK4[32] = {
SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4)
};
static const sph_u32 RK5[32] = {
static const uint32_t RK5[32] = {
SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98),
SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176),
SPH_C32(0x66CA593E), SPH_C32(0x82430E88),
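The local identity definition of SPH_C32 lets these constant tables compile verbatim after the sph_types.h dependency is dropped; that header defined the macro (roughly) as ((sph_u32)(x ## U)), which on any C99 target yields the same 32-bit value as the bare literal:

#include <stdint.h>
#define SPH_C32(x) (x)                        // the shim added above
static const uint32_t rk_demo[2] = {
   SPH_C32(0x452821E6), SPH_C32(0x38D01377)   // expand to plain literals
};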


@@ -68,7 +68,6 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#define SPH_SIZE_haval256_5 256
@@ -77,7 +76,7 @@ typedef struct {
__m128i buf[32];
__m128i s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
sph_u32 count_high, count_low;
uint32_t count_high, count_low;
} haval_4way_context;
typedef haval_4way_context haval256_5_4way_context;


@@ -66,7 +66,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for HAVAL-128/3.


@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for JH-224.


@@ -2,7 +2,6 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)


@@ -9,7 +9,7 @@ int hard_coded_eb = 1;
bool register_keccak_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 128.0;
#if defined (KECCAK_8WAY)
gate->scanhash = (void*)&scanhash_keccak_8way;


@@ -1,45 +1,6 @@
/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
/**
* Keccak interface. This is the interface for Keccak with the
* recommended parameters for SHA-3, with output lengths 224, 256,
* 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_keccak.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef KECCAK_HASH_4WAY_H__
#define KECCAK_HASH_4WAY_H__
#ifdef __cplusplus
extern "C"{
#endif
#ifdef __AVX2__
#include <stddef.h>
@@ -100,8 +61,4 @@ void keccak512_4way_addbits_and_close(
#endif
#ifdef __cplusplus
}
#endif
#endif


@@ -2,7 +2,6 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)


@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for Keccak-224.


@@ -23,7 +23,6 @@
#define LANE_H
#include <string.h>
//#include "algo/sha/sha3-defs.h"
#include <stdint.h>
typedef unsigned char BitSequence;


@@ -7,8 +7,10 @@
#include "simd-utils.h"
#define uint32 uint32_t
/* initial values of chaining variables */
static const uint32 IV[40] __attribute((aligned(64))) = {
static const uint32_t IV[40] __attribute((aligned(64))) = {
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
@@ -22,7 +24,7 @@ static const uint32 IV[40] __attribute((aligned(64))) = {
};
/* Round Constants */
static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,


@@ -23,7 +23,7 @@
#if defined(__AVX2__)
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
//#include "algo/sha/sha3-defs.h"
#include "simd-utils.h"
/* The length of digests*/
@@ -54,7 +54,7 @@
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
uint32 buffer[8*4];
uint32_t buffer[8*4];
__m512i chainv[10]; /* Chaining values */
int hashbitlen;
int rembytes;
@@ -82,7 +82,7 @@ int luffa512_4way_update_close( luffa_4way_context *state, void *output,
#endif
typedef struct {
uint32 buffer[8*2];
uint32_t buffer[8*2];
__m256i chainv[10]; /* Chaining values */
int hashbitlen;
int rembytes;


@@ -22,7 +22,7 @@
*/
#include <emmintrin.h>
#include "algo/sha/sha3-defs.h"
#include "compat/sha3-defs.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256


@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for Luffa-224.


@@ -21,9 +21,8 @@
#define LYRA2_H_
#include <stdint.h>
#include "algo/sha/sha3-defs.h"
//typedef unsigned char byte;
typedef unsigned char byte;
//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)


@@ -4,7 +4,6 @@
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/bmw/sph_bmw.h"


@@ -4,7 +4,6 @@
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/cubehash_sse2.h"
//#include "lyra2.h"


@@ -58,7 +58,7 @@
#define SPH_PANAMA_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for PANAMA.


@@ -21,7 +21,7 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"


@@ -3,7 +3,8 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#include "algo/sha/sha512-hash.h"
#include "ripemd-hash-4way.h"
#define LBRY_INPUT_SIZE 112


@@ -2,7 +2,6 @@
#define RIPEMD_HASH_4WAY_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if defined(__SSE4_2__)


@@ -57,7 +57,7 @@
#define SPH_RIPEMD_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for RIPEMD.


@@ -31,7 +31,6 @@
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#include <mm_malloc.h>
#include "malloc-huge.h"


@@ -36,7 +36,7 @@
#include <sys/types.h>
#include <stdint.h>
#include "simd-utils.h"
#include "sha-hash-4way.h"
#include "sha256-hash.h"
typedef struct _hmac_sha256_4way_context
{


@@ -1,168 +0,0 @@
/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */
/**
* SHA-224, SHA-256, SHA-384 and SHA-512 interface.
*
* SHA-256 has been published in FIPS 180-2, now amended with a change
* notice to include SHA-224 as well (which is a simple variation on
* SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS
* standards can be found at:
* http://csrc.nist.gov/publications/fips/
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_sha2.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SHA2_HASH_4WAY_H__
#define SHA2_HASH_4WAY_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#if defined(__SSE2__)
// SHA-256 4 way
typedef struct {
__m128i buf[64>>2];
__m128i val[8];
uint32_t count_high, count_low;
} sha256_4way_context __attribute__ ((aligned (64)));
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
const __m128i *W, const __m128i *state_in );
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const uint32_t *target );
#endif // SSE2
#if defined (__AVX2__)
// SHA-256 8 way
typedef struct {
__m256i buf[64>>2];
__m256i val[8];
uint32_t count_high, count_low;
} sha256_8way_context __attribute__ ((aligned (128)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
void sha256_8way_full( void *dst, const void *data, size_t len );
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
const __m256i *W, const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-256 16 way
typedef struct {
__m512i buf[64>>2];
__m512i val[8];
uint32_t count_high, count_low;
} sha256_16way_context __attribute__ ((aligned (128)));
void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
void sha256_16way_full( void *dst, const void *data, size_t len );
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
const __m512i *W, const __m512i *state_in );
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target );
#endif // AVX512
#if defined (__AVX2__)
// SHA-512 4 way
typedef struct {
__m256i buf[128>>3];
__m256i val[8];
uint64_t count;
bool initialized;
} sha512_4way_context __attribute__ ((aligned (128)));
void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way_update( sha512_4way_context *sc, const void *data,
size_t len );
void sha512_4way_close( sha512_4way_context *sc, void *dst );
void sha512_4way_full( void *dst, const void *data, size_t len );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
bool initialized;
} sha512_8way_context __attribute__ ((aligned (128)));
void sha512_8way_init( sha512_8way_context *sc);
void sha512_8way_update( sha512_8way_context *sc, const void *data,
size_t len );
void sha512_8way_close( sha512_8way_context *sc, void *dst );
void sha512_8way_full( void *dst, const void *data, size_t len );
#endif // AVX512
#endif // SHA256_4WAY_H__
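Judging by the include edits elsewhere in this commit (ripemd, lbry, scrypt, hmac), the declarations from this deleted header now live in two split headers, and consumers were updated accordingly:

#include "algo/sha/sha256-hash.h"   // 4/8/16-way SHA-256 declarations
#include "algo/sha/sha512-hash.h"   // 4/8-way SHA-512 declarations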


@@ -1,689 +0,0 @@
/* Intel SHA extensions using C intrinsics */
/* Written and placed in public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */
// A stripped down version with byte swapping removed.
#if defined(__SHA__)
#include "sha256-hash.h"
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y;
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
__m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y;
// Load initial values
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
// Save current hash
ABEF_SAVE_X = STATE0_X;
ABEF_SAVE_Y = STATE0_Y;
CDGH_SAVE_X = STATE1_X;
CDGH_SAVE_Y = STATE1_Y;
// Rounds 0-3
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 4-7
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 8-11
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 12-15
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 16-19
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 20-23
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 24-27
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 28-31
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 32-35
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 36-39
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 40-43
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 44-47
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 48-51
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 52-55
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 56-59
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 60-63
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Add values back to state
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK;
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
__m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;
// Load initial values
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
// Save current hash
ABEF_SAVE_X = STATE0_X;
ABEF_SAVE_Y = STATE0_Y;
CDGH_SAVE_X = STATE1_X;
CDGH_SAVE_Y = STATE1_Y;
// Rounds 0-3
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK );
TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK );
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 4-7
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK );
TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK );
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 8-11
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK );
TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK );
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 12-15
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK );
TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK );
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 16-19
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 20-23
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 24-27
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 28-31
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 32-35
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 36-39
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 40-43
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 44-47
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 48-51
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 52-55
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 56-59
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 60-63
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Add values back to state
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
#endif
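// Added for clarity: illustrative usage sketch, not part of the commit.
// Two independent little-endian 64-byte blocks are hashed in one
// interleaved call. SHA256_IV_REF is a hypothetical local copy of the
// standard SHA-256 IV; any aligned uint32_t[8] initial state works.
static const uint32_t SHA256_IV_REF[8] __attribute__ ((aligned (32))) =
{
   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};

static void example_ni2way( const uint32_t msg_a[16], const uint32_t msg_b[16],
                            uint32_t out_a[8], uint32_t out_b[8] )
{
   // Both lanes run the same 64 rounds interleaved for better ILP.
   sha256_ni2way_transform_le( out_a, out_b, msg_a, msg_b,
                               SHA256_IV_REF, SHA256_IV_REF );
}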

View File

@@ -3,7 +3,7 @@
#include <stddef.h>
#include <string.h>
#include "sha-hash-4way.h"
#include "sha256-hash.h"
#include "compat.h"
/*
@@ -610,6 +610,16 @@ do { \
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j );
// Not used with AVX512, but needed to satisfy the compiler
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
}
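// Added for clarity (illustrative, standard SHA-256 round notation):
// a full round adds the message word, while the NOMSG variant above elides
// that add because the corresponding padding words are known to be zero:
//
//    T1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i];   // normal round
//    T1 = h + S1(e) + Ch(e,f,g) + K[i];          // W[i] == 0: add elided
//    T2 = S0(a) + Maj(a,b,c);   d += T1;   h = T1 + T2;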
#else // AVX2
#define CHx(X, Y, Z) \
@@ -621,6 +631,16 @@ do { \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
}
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
@@ -635,7 +655,6 @@ do { \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)
// read Y_xor_Z, update X_xor_Y
#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
@@ -769,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
// round 3 part 1, ignore nonces W[3]
// round 3 part 1, avoid nonces W[3]
T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D),
_mm256_set1_epi32( K256[3] ) );
A = _mm256_add_epi32( A, T1 );
@@ -807,23 +826,22 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif
// round 3 part 2, inject nonces
// round 3 part 2, add nonces
A = _mm256_add_epi32( A, W[3] );
E = _mm256_add_epi32( E, W[3] );
// SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_8WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
W[ 0] = X[ 0];
W[ 1] = X[ 1];
@@ -865,6 +883,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -887,8 +906,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
_mm256_store_si256( state_out + 7, H );
}
// It's working with a high hit rate but performance is lower
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target )
{
@@ -912,14 +929,37 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i IV7 = H;
const __m256i IV6 = G;
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
for ( int j = 16; j < 48; j += 16 )
{
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j );
}
// rounds 0 to 15, ignore zero padding W[9..14]
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
// rounds 16 to 31
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
// rounds 32 to 47
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
// rounds 48 to 60 mexp
W[ 0] = SHA256_8WAY_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
W[ 1] = SHA256_8WAY_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA256_8WAY_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
@@ -935,9 +975,10 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
// rounds 48 to 57
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
@@ -968,7 +1009,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
return 0;
return 0;
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
@@ -983,28 +1024,29 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
if ( t6_mask )
{
// Testing H inconclusive: hash7 == target7, need to test G
// Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
if ( unlikely( 0 != ( t6_mask & mm256_movmask_32(
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
return 0;
else
{
flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0 != ( t6_mask & ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) ))
return 0;
else if ( likely( target[6] == 0x80000000 ))
{
if ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
hash, _mm256_xor_si256( hash, hash ) ) ) ) )
return 0;
}
if ( likely( ( target[6] == 0x80000000 )
&& ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
hash, _mm256_xor_si256( hash, hash ) ) ) ) ) ))
return 0;
}
// else inconclusive, testing targ5 isn't practical, finish hashing
}
// At this point the hash is either good or the test was inconclusive.
// If the latter, it's probably a high difficulty target with a nearly
// equal hash that still has a good chance of being valid.
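// Added for clarity: scalar model of the signed-compare trick above
// (helper name is mine, not from the source). AVX2 has no unsigned 32-bit
// compare, so the code corrects the signed _mm256_cmpgt_epi32 result with
// the XOR of the operands' sign bits, which is what "flip" computes:
//
//    static inline int ugt32_via_signed( uint32_t a, uint32_t b )
//    {
//       int sgt = (int32_t)a > (int32_t)b;       // signed compare
//       return ( (a ^ b) >> 31 ) ? !sgt : sgt;   // sign bits differ: flip
//    }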
// rounds 59 to 61 part 2
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ),
MAJx( F, G, H ) ) );
@@ -1179,6 +1221,15 @@ do { \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
_mm512_set1_epi32( K256[(i)+(j)] ) ); \
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
}
/*
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
do { \
@@ -1292,7 +1343,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
// round 3 part 1, ignore nonces W[3]
// round 3 part 1, avoid nonces W[3]
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[3] ) );
A = _mm512_add_epi32( A, T1 );
@@ -1312,7 +1363,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
{
__m512i A, B, C, D, E, F, G, H, T1, T2;
__m512i A, B, C, D, E, F, G, H;
__m512i W[16];
memcpy_512( W, data, 16 );
@@ -1326,87 +1377,25 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
G = _mm512_load_si512( state_mid + 6 );
H = _mm512_load_si512( state_mid + 7 );
// round 3 part 2, inject nonces
// round 3 part 2, add nonces
A = _mm512_add_epi32( A, W[3] );
E = _mm512_add_epi32( E, W[3] );
// round 4
SHA256_16WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
// rounds 4 to 15, ignore zero padding W[5..14]
SHA256_16WAY_ROUND ( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_16WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_16WAY_ROUND ( B, C, D, E, F, G, H, A, 15, 0 );
// round 5
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
_mm512_set1_epi32( K256[5] ) );
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
G = _mm512_add_epi32( G, T1 );
C = _mm512_add_epi32( T1, T2 );
// round 6
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
_mm512_set1_epi32( K256[6] ) );
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
F = _mm512_add_epi32( F, T1 );
B = _mm512_add_epi32( T1, T2 );
// round 7
T1 = mm512_add4_32( A, BSG2_1x16(F), CHx16(F, G, H),
_mm512_set1_epi32( K256[7] ) );
T2 = _mm512_add_epi32( BSG2_0x16(B), MAJx16(B, C, D) );
E = _mm512_add_epi32( E, T1 );
A = _mm512_add_epi32( T1, T2 );
// round 8
T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G),
_mm512_set1_epi32( K256[8] ) );
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) );
D = _mm512_add_epi32( D, T1 );
H = _mm512_add_epi32( T1, T2 );
// round 9
T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F),
_mm512_set1_epi32( K256[9] ) );
T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) );
C = _mm512_add_epi32( C, T1 );
G = _mm512_add_epi32( T1, T2 );
// round 10
T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E),
_mm512_set1_epi32( K256[10] ) );
T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) );
B = _mm512_add_epi32( B, T1 );
F = _mm512_add_epi32( T1, T2 );
// round 11
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[11] ) );
T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) );
A = _mm512_add_epi32( A, T1 );
E = _mm512_add_epi32( T1, T2 );
// round 12
T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C),
_mm512_set1_epi32( K256[12] ) );
T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) );
H = _mm512_add_epi32( H, T1 );
D = _mm512_add_epi32( T1, T2 );
// round 13
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
_mm512_set1_epi32( K256[13] ) );
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
G = _mm512_add_epi32( G, T1 );
C = _mm512_add_epi32( T1, T2 );
// round 14
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
_mm512_set1_epi32( K256[14] ) );
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
F = _mm512_add_epi32( F, T1 );
B = _mm512_add_epi32( T1, T2 );
// round 15
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
// rounds 16 to 31 mexp part 2, inject nonces.
// rounds 16 to 31 mexp part 2, add nonces.
W[ 0] = X[ 0];
W[ 1] = X[ 1];
W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
@@ -1428,6 +1417,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
// rounds 32 to 63
W[ 0] = _mm512_add_epi32( X[ 6], _mm512_add_epi32( SSG2_1x16( W[14] ),
W[ 9] ) );
W[ 1] = SHA256_16WAY_MEXP( W[15], W[10], W[ 2], W[ 1] );
@@ -1505,41 +1495,12 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
// rounds 9 to 14, ignore zero padding
T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F),
_mm512_set1_epi32( K256[9] ) );
T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) );
C = _mm512_add_epi32( C, T1 );
G = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E),
_mm512_set1_epi32( K256[10] ) );
T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) );
B = _mm512_add_epi32( B, T1 );
F = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[11] ) );
T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) );
A = _mm512_add_epi32( A, T1 );
E = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C),
_mm512_set1_epi32( K256[12] ) );
T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) );
H = _mm512_add_epi32( H, T1 );
D = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
_mm512_set1_epi32( K256[13] ) );
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
G = _mm512_add_epi32( G, T1 );
C = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
_mm512_set1_epi32( K256[14] ) );
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
F = _mm512_add_epi32( F, T1 );
B = _mm512_add_epi32( T1, T2 );
SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
// round 15
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
@@ -1575,7 +1536,6 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
// rounds 32 to 47
SHA256_MEXP_16WAY_16ROUNDS( W );
SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
// rounds 48 to 60 mexp
@@ -1640,8 +1600,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
{
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
targ = _mm512_set1_epi32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask,
hash, targ ) ))
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
return 0;
}

View File

@@ -1,388 +0,0 @@
/* Intel SHA extensions using C intrinsics */
/* Written and place in public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */
// A stripped down version with byte swapping removed.
#if defined(__SHA__)
#include "sha256-hash.h"
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state_in[0]);
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
// TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state_out[0], STATE0);
_mm_store_si128((__m128i*) &state_out[4], STATE1);
}
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state_in[0]);
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
TMSG0 = _mm_shuffle_epi8( TMSG0, MASK );
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state_out[0], STATE0);
_mm_store_si128((__m128i*) &state_out[4], STATE1);
}
#endif
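// Added for clarity: the MASK shuffle in the _be variant above byte-swaps
// each 32-bit word of the input block. Scalar equivalent of the per-word
// effect (reference only):
static inline uint32_t bswap32_ref( uint32_t x )
{
   return ( x >> 24 ) | ( ( x >>  8 ) & 0x0000FF00 )
        | ( ( x <<  8 ) & 0x00FF0000 ) | ( x << 24 );
}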

File diff suppressed because it is too large

View File

@@ -4,17 +4,18 @@
#include <stddef.h>
#include "simd-utils.h"
#include "cpuminer-config.h"
#include "sph_sha2.h"
// generic interface
typedef struct {
typedef struct
{
unsigned char buf[64]; /* first field, for alignment */
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
static const uint32_t SHA256_IV[8];
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
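// Added for clarity: illustrative usage of the generic interface above.
//
//    uint8_t digest[32];
//    sha256_full( digest, "abc", 3 );   // one-shot, 32-byte digest out
//
// sha256_update()/sha256_final() cover incremental hashing; the context
// init routine is outside this excerpt.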
@@ -41,20 +42,113 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
uint32_t *sstate, const uint32_t *istate );
void sha256_ni2way_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Select target
// with SHA...
#define sha256_transform_le sha256_opt_transform_le
#define sha256_transform_be sha256_opt_transform_be
#else
// without SHA...
#include "sph_sha2.h"
#define sha256_transform_le sph_sha256_transform_le
#define sha256_transform_be sph_sha256_transform_be
#endif
// SHA can't do only 3 rounds
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-256 16 way
typedef struct
{
__m512i buf[64>>2];
__m512i val[8];
uint32_t count_high, count_low;
} sha256_16way_context __attribute__ ((aligned (128)));
void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
void sha256_16way_full( void *dst, const void *data, size_t len );
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
const __m512i *W, const __m512i *state_in );
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target );
#endif // AVX512
#if defined (__AVX2__)
// SHA-256 8 way
typedef struct
{
__m256i buf[64>>2];
__m256i val[8];
uint32_t count_high, count_low;
} sha256_8way_context __attribute__ ((aligned (64)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
void sha256_8way_full( void *dst, const void *data, size_t len );
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
const __m256i *W, const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target );
#endif // AVX2
#if defined(__SSE2__)
// SHA-256 4 way
typedef struct
{
__m128i buf[64>>2];
__m128i val[8];
uint32_t count_high, count_low;
} sha256_4way_context __attribute__ ((aligned (32)));
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
const __m128i *W, const __m128i *state_in );
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const uint32_t *target );
#endif // SSE2
#endif

View File

@@ -4,7 +4,6 @@
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
@@ -17,11 +16,15 @@ static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t mstate[8] __attribute__ ((aligned (32)));
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t mstateb[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -32,56 +35,60 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 bytes of data
sha256_opt_transform_le( mstate, pdata, sha256_iv );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second bock without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 80*8; // bit count
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 32*8; // bit count
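// Added for clarity: the 80-byte header spans two 64-byte blocks. Block 2
// holds header bytes 64..79 (nonce at word 3), the 0x80 pad byte, zeros,
// and the 640-bit (80*8) length. The second SHA-256 hashes the 32-byte
// digest in a single block padded to a 256-bit (32*8) length.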
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
mstate, mstate );
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
submit_solution( work, hashb, mythr );
}
}
n += 2;
@@ -99,18 +106,16 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i hash32[8] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i block[16] __attribute__ ((aligned (128)));
__m512i buf[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i mstate1[8] __attribute__ ((aligned (64)));
__m512i mstate2[8] __attribute__ ((aligned (64)));
__m512i istate[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
const uint32_t targ32_d7 = ptarget[7];
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
@@ -134,7 +139,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
// second message block data, with nonce & padding
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
@@ -142,12 +147,12 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
@@ -157,27 +162,26 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
// initialize padding for 2nd sha256
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
if ( unlikely( sha256_16way_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 16; lane++ )
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, phash, mythr );
pdata[19] = n + lane;
submit_solution( work, phash, mythr );
}
}
}
@@ -188,92 +192,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
*hashes_done = n - first_nonce;
return 0;
}
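// Added for clarity (illustrative pseudocode, names abbreviated): round i
// consumes message word W[i] and the nonce lives in W[3], so rounds 0-2 of
// the second block are nonce-invariant and are hoisted out of the loop:
//
//    prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );   // once per job
//    for each nonce batch:
//       buf[3] = nonce vector;                             // per iteration
//       final_rounds( block, buf, mstate1, mstate2, mexp_pre ); // rounds 3..63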
/*
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i vdata[32] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i initstate[8] __attribute__ ((aligned (64)));
__m512i midstate1[8] __attribute__ ((aligned (64)));
__m512i midstate2[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i sixteen = _mm512_set1_epi32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm512_set1_epi32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_16way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
*/
#endif
#if defined(SHA256D_8WAY)
@@ -284,15 +203,13 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i vdata[32] __attribute__ ((aligned (64)));
__m256i block[16] __attribute__ ((aligned (32)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i istate[8] __attribute__ ((aligned (32)));
__m256i mstate1[8] __attribute__ ((aligned (32)));
__m256i mstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
@@ -301,6 +218,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
@@ -309,50 +228,47 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
vdata[16+15] = _mm256_set1_epi32( 80*8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
block[15] = _mm256_set1_epi32( 32*8 );
// initialize state
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
// initialize state for second hash
istate[0] = _mm256_set1_epi32( sha256_iv[0] );
istate[1] = _mm256_set1_epi32( sha256_iv[1] );
istate[2] = _mm256_set1_epi32( sha256_iv[2] );
istate[3] = _mm256_set1_epi32( sha256_iv[3] );
istate[4] = _mm256_set1_epi32( sha256_iv[4] );
istate[5] = _mm256_set1_epi32( sha256_iv[5] );
istate[6] = _mm256_set1_epi32( sha256_iv[6] );
istate[7] = _mm256_set1_epi32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_8way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
istate, ptarget ) ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
for ( int lane = 0; lane < 8; lane++ )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
@@ -366,12 +282,12 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate1[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i istate[8] __attribute__ ((aligned (32)));
__m128i mstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
@@ -392,33 +308,30 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
vdata[16+15] = _mm_set1_epi32( 80*8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 32*8 ); // bit count
block[15] = _mm_set1_epi32( 32*8 );
// initialize state
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
istate[0] = _mm_set1_epi32( sha256_iv[0] );
istate[1] = _mm_set1_epi32( sha256_iv[1] );
istate[2] = _mm_set1_epi32( sha256_iv[2] );
istate[3] = _mm_set1_epi32( sha256_iv[3] );
istate[4] = _mm_set1_epi32( sha256_iv[4] );
istate[5] = _mm_set1_epi32( sha256_iv[5] );
istate[6] = _mm_set1_epi32( sha256_iv[6] );
istate[7] = _mm_set1_epi32( sha256_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate1, vdata, initstate );
sha256_4way_transform_le( mstate, vdata, istate );
do
{
// 1. final 16 bytes of data, with padding
sha256_4way_transform_le( block, vdata+16, initstate );
sha256_4way_transform_le( block, vdata+16, mstate );
sha256_4way_transform_le( hash32, block, istate );
// 2. 32 byte hash from 1.
sha256_4way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
@@ -440,3 +353,5 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
}
#endif

View File

@@ -4,7 +4,6 @@
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256DT_16WAY 1
@@ -22,14 +21,104 @@ static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
};
#if defined(SHA256DT_SHA)
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t mstateb[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256dt_iv );
// fill & pad second bock without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 0x480; // funky bit count
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 0x300; // bit count
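// Added for clarity: 0x480 (1152 bits) and 0x300 (768 bits) are not the
// true message lengths (80 bytes = 640 bits, 32 bytes = 256 bits).
// sha256dt deliberately finalizes with these oversized counts and a
// nonstandard IV, which is what distinguishes it from plain sha256d.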
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256dt_iv, sha256dt_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_16WAY)
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i hash32[8] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i block[16] __attribute__ ((aligned (128)));
__m512i buf[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i mstate1[8] __attribute__ ((aligned (64)));
__m512i mstate2[8] __attribute__ ((aligned (64)));
__m512i istate[8] __attribute__ ((aligned (64)));
@@ -37,8 +126,6 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
uint32_t phash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
// uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
// const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
@@ -75,7 +162,7 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd sha256
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
@@ -85,20 +172,18 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
// initialize padding for 2nd sha256
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
do
{
// finish second block with nonces
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16way_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 16; lane++ )
// if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
@@ -118,86 +203,9 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
return 0;
}
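/* Editor's note on the 3 round prehash above: in SHA-256 round t
   (t < 16) consumes message word W[t], and the nonce occupies W[3] of
   this block, so rounds 0..2 and part of the message expansion never
   see it. Conceptually:
      pre = first_3_rounds( midstate, W );       // once per work template
      for each nonce: W[3] = nonce;
                      digest = remaining_61_rounds( pre, W );
   sha256_16way_prehash_3rounds / sha256_16way_final_rounds are the
   16-lane split of this idea, with mexp_pre caching the pre-expanded
   part of the message schedule. */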
#elif defined(SHA256DT_SHA)
#endif
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t mstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 bytes of data
sha256_opt_transform_le( mstate, pdata, sha256dt_iv );
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 0x480; // funky bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
mstate, mstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 0x300; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
sha256dt_iv, sha256dt_iv );
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
{
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
}
}
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
{
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(SHA256DT_8WAY)
#if defined(SHA256DT_8WAY)
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -236,7 +244,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 0x300 );
// initialize state
// initialize state for second hash
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
@@ -253,11 +261,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
do
{
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
mexp_pre );
if ( unlikely( sha256_8way_transform_le_short(
hash32, block, istate, ptarget ) ) )
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
istate, ptarget ) ) )
{
for ( int lane = 0; lane < 8; lane++ )
{
@@ -279,7 +285,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
return 0;
}
#elif defined(SHA256DT_4WAY)
#endif
#if defined(SHA256DT_4WAY)
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha-hash-4way.h"
#include "sha256-hash.h"
#if defined(SHA256T_16WAY)

View File

@@ -4,7 +4,12 @@
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
#if defined(SHA256T_16WAY)
@@ -19,11 +24,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
__m512i istate[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32)));
static const uint32_t IV[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
@@ -39,7 +39,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, IV );
sha256_transform_le( phash, pdata, sha256_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
@@ -65,14 +65,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
istate[0] = _mm512_set1_epi32( IV[0] );
istate[1] = _mm512_set1_epi32( IV[1] );
istate[2] = _mm512_set1_epi32( IV[2] );
istate[3] = _mm512_set1_epi32( IV[3] );
istate[4] = _mm512_set1_epi32( IV[4] );
istate[5] = _mm512_set1_epi32( IV[5] );
istate[6] = _mm512_set1_epi32( IV[6] );
istate[7] = _mm512_set1_epi32( IV[7] );
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
// initialize padding for 2nd & 3rd sha256
block[ 8] = last_byte;
@@ -110,6 +110,97 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
#endif
#if defined(__SHA__)
int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t mstateb[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 80*8; // bit count
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 32*8; // bit count
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( block2a, block2b, block2a, block2b,
sha256_iv, sha256_iv );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
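/* Editor's sketch, not part of this commit: scalar reference for the
   triple hash above, i.e. sha256( sha256( sha256( header ) ) ) with
   standard padding. sha256_transform_le is assumed as in the sha256dt
   sketch. */
static void sha256t_ref( uint32_t hash[8], const uint32_t data[20],
                         uint32_t nonce )
{
   uint32_t mid[8], block[16];
   sha256_transform_le( mid, data, sha256_iv );     // first 64 bytes
   memcpy( block, data + 16, 12 );
   block[ 3] = nonce;
   block[ 4] = 0x80000000;
   memset( block + 5, 0, 40 );
   block[15] = 80*8;                                // 640 bit message
   sha256_transform_le( hash, block, mid );         // hash #1
   block[ 8] = 0x80000000;                          // pad for 32 byte input
   memset( block + 9, 0, 24 );
   block[15] = 32*8;                                // 256 bit message
   memcpy( block, hash, 32 );
   sha256_transform_le( hash, block, sha256_iv );   // hash #2
   memcpy( block, hash, 32 );
   sha256_transform_le( hash, block, sha256_iv );   // hash #3
}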
#if defined(SHA256T_8WAY)
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,

View File

@@ -5,9 +5,9 @@ bool register_sha256t_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256t_16way;
#elif defined(__SHA__)
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t;
gate->scanhash = (void*)&scanhash_sha256t_sha;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256t_8way;
#else
@@ -22,7 +22,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256q_16way;
gate->hash = (void*)&sha256q_16way_hash;
#elif defined(__SHA__)
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q;
gate->hash = (void*)&sha256q_hash;

View File

@@ -6,6 +6,8 @@
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256T_16WAY 1
#elif defined(__SHA__)
#define SHA256T_SHA 1
#elif defined(__AVX2__)
#define SHA256T_8WAY 1
#else
@@ -42,9 +44,9 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(__SHA__)
#if defined(SHA256T_SHA)
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -1,102 +0,0 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
//#include "algo/sha/sph_sha2.h"
#include "sha256-hash.h"
#if defined(__SHA__)
// Only used on CPUs with SHA
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t initstate[8] __attribute__ ((aligned (32)));
uint32_t midstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
__m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// initialize state
initstate[0] = 0x6A09E667;
initstate[1] = 0xBB67AE85;
initstate[2] = 0x3C6EF372;
initstate[3] = 0xA54FF53A;
initstate[4] = 0x510E527F;
initstate[5] = 0x9B05688C;
initstate[6] = 0x1F83D9AB;
initstate[7] = 0x5BE0CD19;
// hash first 64 bytes of data
sha256_opt_transform_le( midstate, pdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
// 3. 32 byte hash from 2.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
// byte swap final hash for testing
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
}
if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -34,7 +34,7 @@
#include <stddef.h>
#include <string.h>
#include "sha-hash-4way.h"
#include "sha512-hash.h"
/*
static const uint64_t H512[8] =

algo/sha/sha512-hash.h Normal file
View File

@@ -0,0 +1,46 @@
#ifndef SHA512_HASH_H__
#define SHA512_HASH_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#include "sph_sha2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
bool initialized;
} sha512_8way_context __attribute__ ((aligned (128)));
void sha512_8way_init( sha512_8way_context *sc);
void sha512_8way_update( sha512_8way_context *sc, const void *data,
size_t len );
void sha512_8way_close( sha512_8way_context *sc, void *dst );
void sha512_8way_full( void *dst, const void *data, size_t len );
#endif // AVX512
#if defined (__AVX2__)
// SHA-512 4 way
typedef struct {
__m256i buf[128>>3];
__m256i val[8];
uint64_t count;
bool initialized;
} sha512_4way_context __attribute__ ((aligned (64)));
void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way_update( sha512_4way_context *sc, const void *data,
size_t len );
void sha512_4way_close( sha512_4way_context *sc, void *dst );
void sha512_4way_full( void *dst, const void *data, size_t len );
#endif // AVX2
#endif
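/* Editor's usage sketch for the 4-way interface above. Assumes vdata
   holds four lanes interleaved as 64-bit words (e.g. via intrlv_4x64)
   and vhash receives four interleaved 64 byte digests: */
static void sha512_4way_example( void *vhash, const void *vdata )
{
   sha512_4way_context ctx;
   sha512_4way_init( &ctx );
   sha512_4way_update( &ctx, vdata, 80 );   // 80 bytes per lane
   sha512_4way_close( &ctx, vhash );
   // equivalently: sha512_4way_full( vhash, vdata, 80 );
}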

View File

@@ -1,5 +1,6 @@
#include "algo-gate-api.h"
#include "sha-hash-4way.h"
#include "sha256-hash.h"
#include "sha512-hash.h"
#include <string.h>
#include <stdint.h>

View File

@@ -41,7 +41,7 @@
#define SPH_SHA2_H__
#include <stddef.h>
#include "sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for SHA-224.

View File

@@ -58,7 +58,7 @@ extern "C"{
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m256i FIVE = _mm256_set1_epi32( 5 ); \
const __m256i THREE = _mm256_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
uint32_t Wlow, Whigh;
#define READ_STATE8(state) do \
{ \
@@ -653,7 +653,7 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m128i FIVE = _mm_set1_epi32( 5 ); \
const __m128i THREE = _mm_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
uint32_t Wlow, Whigh;
#define READ_STATE(state) do \
{ \

View File

@@ -1,51 +1,11 @@
/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
/**
* Shabal interface. Shabal is a family of functions which differ by
* their output size; this implementation defines Shabal for output
* sizes 192, 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_shabal.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SHABAL_HASH_4WAY_H__
#define SHABAL_HASH_4WAY_H__ 1
#ifdef __SSE4_1__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{
#endif
#define SPH_SIZE_shabal256 256
#define SPH_SIZE_shabal512 512
@@ -55,7 +15,7 @@ extern "C"{
typedef struct {
__m256i buf[16];
__m256i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_8way_context __attribute__ ((aligned (64)));
@@ -80,7 +40,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_4way_context;
@@ -100,10 +60,6 @@ void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#ifdef __cplusplus
}
#endif
#endif
#endif

View File

@@ -37,7 +37,7 @@
#define SPH_SHABAL_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif

View File

@@ -1,6 +1,4 @@
#include "shavite-hash-2way.h"
#include "algo/sha/sph_types.h"
#include <stdio.h>
// This is a fake: it does not actually do parallel AES; that requires VAES.

View File

@@ -64,7 +64,7 @@ extern "C"{
*/
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
#include "compat/aes_helper.c"
static const sph_u32 IV224[] = {
C32(0x6774F31C), C32(0x990AE210), C32(0xC87D4274), C32(0xC9546371),

View File

@@ -39,7 +39,7 @@
#define SPH_SHAVITE_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -9,7 +9,7 @@
#endif
#include "simd-compat.h"
#include "algo/sha/sha3-defs.h"
#include "compat/sha3-defs.h"
/*
* NIST API Specific types.
*/

View File

@@ -24,7 +24,7 @@
*/
#include <stdint.h>
#include "algo/sha/brg_types.h"
#include "compat/brg_types.h"
#define C32(x) ((u32)(x))

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for SIMD-224.

View File

@@ -2,7 +2,6 @@
#include <string.h>
#include <stdint.h>
#include "skein-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#if defined (SKEIN_8WAY)

View File

@@ -1,5 +1,4 @@
#include "skein-gate.h"
#include "sph_skein.h"
#include "skein-hash-4way.h"
bool register_skein_algo( algo_gate_t* gate )

View File

@@ -46,7 +46,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if SPH_64

View File

@@ -45,7 +45,7 @@
#define SPH_TIGER_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if SPH_64

View File

@@ -49,7 +49,7 @@
#define SPH_WHIRLPOOL_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if SPH_64

View File

@@ -65,6 +65,9 @@ void init_x11_8way_ctx()
#endif
}
static __thread __m512i x11_8way_midstate[16] __attribute__((aligned(64)));
void x11_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
@@ -80,8 +83,9 @@ void x11_8way_hash( void *state, const void *input )
uint64_t hash7[8] __attribute__ ((aligned (64)));
x11_8way_ctx_holder ctx;
memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
blake512_8way_final_le( &ctx.blake, vhash, casti_m512i( input, 9 ),
x11_8way_midstate );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
@@ -252,39 +256,45 @@ void x11_8way_hash( void *state, const void *input )
int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9;
const uint32_t last_nonce = max_nonce -8;
const __m512i eight = _mm512_set1_epi64( 8 );
const uint32_t last_nonce = max_nonce -8;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
// convert LE32 to LE64
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) );
blake512_8way_prehash_le( &x11_8way_ctx.blake, x11_8way_midstate, vdata );
x11_8way_hash( hash, vdata );
pdata[19] = n;
do
{
x11_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( ( hash+(i<<3) )[7] <= Htarg
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );
}
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash+(i<<3), ptarget ) && !opt_benchmark ))
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
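/* Editor's note on the "convert LE32 to LE64" step above, assuming
   mm128_swap64_32 swaps the 32-bit halves of each 64-bit lane: for a
   64-bit lane packed from two LE32 header words,
   bswap64( x ) == swap_halves( bswap32 of each half of x ),
   so with blake512_8way_prehash_le, which by its _le suffix takes the
   data without byte swapping, only the half swap is left to do.
   Scalar equivalent of one lane:
      static inline uint64_t swap64_32( uint64_t x )
      {  return ( x << 32 ) | ( x >> 32 );  }
*/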

View File

@@ -263,7 +263,7 @@ bool register_hex_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hex;
gate->hash = (void*)&x16r_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 128.0;
return true;
};

View File

@@ -20,7 +20,7 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha512-hash.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -42,7 +42,6 @@
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"

View File

@@ -12,9 +12,7 @@
#include "algo/tiger/sph_tiger.h"
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
#if defined(__SHA__)
#include "algo/sha/sha256-hash.h"
#endif
#include "algo/sha/sha256-hash.h"
#if defined (X21S_8WAY)

View File

@@ -20,7 +20,7 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"

View File

@@ -25,7 +25,7 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#if defined(X17_8WAY)
@@ -37,7 +37,6 @@ union _x17_8way_context_overlay
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
// cube_4way_context cube;
cube_4way_2buf_context cube;
#if defined(__VAES__)
groestl512_4way_context groestl;
@@ -190,7 +189,6 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );

View File

@@ -19,7 +19,7 @@
#include "algo/fugue/fugue-aesni.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#include "algo/haval/haval-hash-4way.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"

View File

@@ -16,7 +16,8 @@
#include "algo/fugue/fugue-aesni.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#include "algo/sha/sha256-hash.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/lyra2/lyra2.h"
@@ -26,9 +27,6 @@
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__SHA__)
#include "algo/sha/sha256-hash.h"
#endif
#if defined(X22I_8WAY)

View File

@@ -6,7 +6,8 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#include "algo/sha/sha256-hash.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/blake/blake2s-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -31,9 +32,6 @@
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__SHA__)
#include "algo/sha/sha256-hash.h"
#endif
void x25x_shuffle( void *hash )
{

View File

@@ -1,72 +0,0 @@
#include <cpuminer-config.h>
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
.text
.p2align 6
.globl fast_aesb_single_round
.globl _fast_aesb_single_round
fast_aesb_single_round:
_fast_aesb_single_round:
#if defined(_WIN64) || defined(__CYGWIN__)
movdqa (%rcx), %xmm1
aesenc (%r8), %xmm1
movdqa %xmm1, (%rdx)
#else
movdqa (%rdi), %xmm1
aesenc (%rdx), %xmm1
movdqa %xmm1, (%rsi)
#endif
ret
.text
.p2align 6
.globl fast_aesb_pseudo_round_mut
.globl _fast_aesb_pseudo_round_mut
fast_aesb_pseudo_round_mut:
_fast_aesb_pseudo_round_mut:
#if defined(_WIN64) || defined(__CYGWIN__)
mov %rdx, %r9
add $0xA0, %r9
movdqa (%rcx), %xmm1
.LOOP:
aesenc (%rdx), %xmm1
add $0x10, %rdx
cmp %r9, %rdx
jl .LOOP
movdqa %xmm1, (%rcx)
#else
mov %rsi, %r9
add $0xA0, %r9
movdqa (%rdi), %xmm1
.LOOP:
aesenc (%rsi), %xmm1
add $0x10, %rsi
cmp %r9, %rsi
jl .LOOP
movdqa %xmm1, (%rdi)
#endif
ret
.text
.globl mul128
.globl _mul128
mul128:
_mul128:
#if defined(_WIN64) || defined(__CYGWIN__)
mov %rcx, %rax
mul %rdx
mov %rdx, (%r8)
#else
mov %rdx, %r8
mov %rdi, %rax
mul %rsi
mov %rdx, (%r8)
#endif
ret

View File

@@ -1,21 +0,0 @@
#include <cpuminer-config.h>
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
.text
.p2align 6
.globl fast_aesb_single_round
.globl _fast_aesb_single_round
fast_aesb_single_round:
_fast_aesb_single_round:
ret
.text
.p2align 6
.globl fast_aesb_pseudo_round_mut
.globl _fast_aesb_pseudo_round_mut
fast_aesb_pseudo_round_mut:
_fast_aesb_pseudo_round_mut:
ret

View File

@@ -1,50 +0,0 @@
make all-recursive
make[1]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev'
Making all in compat
make[2]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat'
Making all in jansson
make[3]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat/jansson'
make[3]: Nothing to be done for `all'.
make[3]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat/jansson'
make[3]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat'
make[3]: Nothing to be done for `all-am'.
make[3]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat'
make[2]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat'
make[2]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev'
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-cpu-miner.o -MD -MP -MF .deps/cpuminer-cpu-miner.Tpo -c -o cpuminer-cpu-miner.o `test -f 'cpu-miner.c' || echo './'`cpu-miner.c
mv -f .deps/cpuminer-cpu-miner.Tpo .deps/cpuminer-cpu-miner.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-util.o -MD -MP -MF .deps/cpuminer-util.Tpo -c -o cpuminer-util.o `test -f 'util.c' || echo './'`util.c
mv -f .deps/cpuminer-util.Tpo .deps/cpuminer-util.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-algo-gate-api.o -MD -MP -MF .deps/cpuminer-algo-gate-api.Tpo -c -o cpuminer-algo-gate-api.o `test -f 'algo-gate-api.c' || echo './'`algo-gate-api.c
mv -f .deps/cpuminer-algo-gate-api.Tpo .deps/cpuminer-algo-gate-api.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/shavite/cpuminer-shavite.o -MD -MP -MF algo/shavite/.deps/cpuminer-shavite.Tpo -c -o algo/shavite/cpuminer-shavite.o `test -f 'algo/shavite/shavite.c' || echo './'`algo/shavite/shavite.c
mv -f algo/shavite/.deps/cpuminer-shavite.Tpo algo/shavite/.deps/cpuminer-shavite.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/keccak/cpuminer-keccak.o -MD -MP -MF algo/keccak/.deps/cpuminer-keccak.Tpo -c -o algo/keccak/cpuminer-keccak.o `test -f 'algo/keccak/keccak.c' || echo './'`algo/keccak/keccak.c
mv -f algo/keccak/.deps/cpuminer-keccak.Tpo algo/keccak/.deps/cpuminer-keccak.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-axiom.o -MD -MP -MF algo/.deps/cpuminer-axiom.Tpo -c -o algo/cpuminer-axiom.o `test -f 'algo/axiom.c' || echo './'`algo/axiom.c
mv -f algo/.deps/cpuminer-axiom.Tpo algo/.deps/cpuminer-axiom.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blake.o -MD -MP -MF algo/blake/.deps/cpuminer-blake.Tpo -c -o algo/blake/cpuminer-blake.o `test -f 'algo/blake/blake.c' || echo './'`algo/blake/blake.c
mv -f algo/blake/.deps/cpuminer-blake.Tpo algo/blake/.deps/cpuminer-blake.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blake2.o -MD -MP -MF algo/blake/.deps/cpuminer-blake2.Tpo -c -o algo/blake/cpuminer-blake2.o `test -f 'algo/blake/blake2.c' || echo './'`algo/blake/blake2.c
mv -f algo/blake/.deps/cpuminer-blake2.Tpo algo/blake/.deps/cpuminer-blake2.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blakecoin.o -MD -MP -MF algo/blake/.deps/cpuminer-blakecoin.Tpo -c -o algo/blake/cpuminer-blakecoin.o `test -f 'algo/blake/blakecoin.c' || echo './'`algo/blake/blakecoin.c
mv -f algo/blake/.deps/cpuminer-blakecoin.Tpo algo/blake/.deps/cpuminer-blakecoin.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-decred.o -MD -MP -MF algo/blake/.deps/cpuminer-decred.Tpo -c -o algo/blake/cpuminer-decred.o `test -f 'algo/blake/decred.c' || echo './'`algo/blake/decred.c
mv -f algo/blake/.deps/cpuminer-decred.Tpo algo/blake/.deps/cpuminer-decred.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-pentablake.o -MD -MP -MF algo/blake/.deps/cpuminer-pentablake.Tpo -c -o algo/blake/cpuminer-pentablake.o `test -f 'algo/blake/pentablake.c' || echo './'`algo/blake/pentablake.c
mv -f algo/blake/.deps/cpuminer-pentablake.Tpo algo/blake/.deps/cpuminer-pentablake.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/bmw/cpuminer-bmw256.o -MD -MP -MF algo/bmw/.deps/cpuminer-bmw256.Tpo -c -o algo/bmw/cpuminer-bmw256.o `test -f 'algo/bmw/bmw256.c' || echo './'`algo/bmw/bmw256.c
mv -f algo/bmw/.deps/cpuminer-bmw256.Tpo algo/bmw/.deps/cpuminer-bmw256.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-c11.o -MD -MP -MF algo/.deps/cpuminer-c11.Tpo -c -o algo/cpuminer-c11.o `test -f 'algo/c11.c' || echo './'`algo/c11.c
mv -f algo/.deps/cpuminer-c11.Tpo algo/.deps/cpuminer-c11.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-cryptolight.o -MD -MP -MF algo/.deps/cpuminer-cryptolight.Tpo -c -o algo/cpuminer-cryptolight.o `test -f 'algo/cryptolight.c' || echo './'`algo/cryptolight.c
mv -f algo/.deps/cpuminer-cryptolight.Tpo algo/.deps/cpuminer-cryptolight.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cryptonight/cpuminer-cryptonight-common.o -MD -MP -MF algo/cryptonight/.deps/cpuminer-cryptonight-common.Tpo -c -o algo/cryptonight/cpuminer-cryptonight-common.o `test -f 'algo/cryptonight/cryptonight-common.c' || echo './'`algo/cryptonight/cryptonight-common.c
mv -f algo/cryptonight/.deps/cpuminer-cryptonight-common.Tpo algo/cryptonight/.deps/cpuminer-cryptonight-common.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-drop.o -MD -MP -MF algo/.deps/cpuminer-drop.Tpo -c -o algo/cpuminer-drop.o `test -f 'algo/drop.c' || echo './'`algo/drop.c
mv -f algo/.deps/cpuminer-drop.Tpo algo/.deps/cpuminer-drop.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-fresh.o -MD -MP -MF algo/.deps/cpuminer-fresh.Tpo -c -o algo/cpuminer-fresh.o `test -f 'algo/fresh.c' || echo './'`algo/fresh.c
mv -f algo/.deps/cpuminer-fresh.Tpo algo/.deps/cpuminer-fresh.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/groestl/cpuminer-groestl.o -MD -MP -MF algo/groestl/.deps/cpuminer-groestl.Tpo -c -o algo/groestl/cpuminer-groestl.o `test -f 'algo/groestl/groestl.c' || echo './'`algo/groestl/groestl.c
make[2]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev'
make[1]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev'

View File

@@ -43,16 +43,15 @@
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include "sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif
#if AES_BIG_ENDIAN
#define AESx(x) ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \
| ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \
| ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \
| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
#define AESx(x) ( (((x) >> 24) & 0x000000FF) \
| (((x) >> 8) & 0x0000FF00) \
| (((x) << 8) & 0x00FF0000) \
| (((x) << 24) & 0xFF000000))
#define AES0 AES0_BE
#define AES1 AES1_BE
@@ -83,7 +82,7 @@ extern "C"{
#else
#define AESx(x) SPH_C32(x)
#define AESx(x) (x)
#define AES0 AES0_LE
#define AES1 AES1_LE
#define AES2 AES2_LE
@@ -119,7 +118,7 @@ extern "C"{
* MixColumns for the column where that byte goes after ShiftRows.
*/
static const sph_u32 AES0[256] = {
static const uint32_t AES0[256] = {
AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
@@ -186,7 +185,7 @@ static const sph_u32 AES0[256] = {
AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
};
static const sph_u32 AES1[256] = {
static const uint32_t AES1[256] = {
AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
@@ -253,7 +252,7 @@ static const sph_u32 AES1[256] = {
AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
};
static const sph_u32 AES2[256] = {
static const uint32_t AES2[256] = {
AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
@@ -320,7 +319,7 @@ static const sph_u32 AES2[256] = {
AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
};
static const sph_u32 AES3[256] = {
static const uint32_t AES3[256] = {
AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
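/* Editor's note: AES0..AES3 hold the combined SubBytes + MixColumns
   output pre-rotated for each of the four byte positions, so one round
   output word is four table lookups and a round key XOR (sphlib's
   little-endian T-table form, as in AES_ROUND_LE):
      y0 = AES0[  x0        & 0xFF ] ^ AES1[ (x1 >>  8) & 0xFF ]
         ^ AES2[ (x2 >> 16) & 0xFF ] ^ AES3[ (x3 >> 24) & 0xFF ] ^ k0;
*/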

Some files were not shown because too many files have changed in this diff.