Jay D Dee
2023-09-21 12:34:06 -04:00
parent d6b5750362
commit be88afc349
113 changed files with 3349 additions and 2920 deletions

View File

@@ -77,7 +77,7 @@ bool register_argon2_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_argon2;
gate->hash = (void*)&argon2hash;
-gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 65536.0;
return true;

View File

@@ -15,7 +15,7 @@
#include <string.h>
#include <stdio.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#include "sph-blake2s.h"
static const uint32_t blake2s_IV[8] =

View File

@@ -42,7 +42,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for BLAKE-224.

View File

@@ -31,7 +31,7 @@
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#include "sph_blake2b.h"
// Little-endian byte access.

View File

@@ -41,8 +41,6 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#define SPH_SIZE_bmw256 256
@@ -57,7 +55,7 @@ typedef struct {
__m128i buf[64];
__m128i H[16];
size_t ptr;
-sph_u32 bit_count; // assume bit_count fits in 32 bits
+uint32_t bit_count; // assume bit_count fits in 32 bits
} bmw_4way_small_context;
typedef bmw_4way_small_context bmw256_4way_context;
@@ -144,7 +142,7 @@ typedef struct {
__m256i buf[16];
__m256i H[16];
size_t ptr;
-sph_u64 bit_count;
+uint64_t bit_count;
} bmw_4way_big_context __attribute__((aligned(128)));
typedef bmw_4way_big_context bmw512_4way_context;

View File

@@ -109,7 +109,7 @@ static const uint32_t IV256[] = {
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
-_mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \
+_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
H[ ( (j)+7 ) & 0xF ] )
@@ -485,7 +485,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
-sc->bit_count += (sph_u32)len << 3;
+sc->bit_count += (uint32_t)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;

View File

@@ -45,15 +45,15 @@ extern "C"{
#define LPAR (
-static const sph_u64 IV512[] = {
-SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
-SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
-SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
-SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
-SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
-SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
-SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
-SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
+static const uint64_t IV512[] = {
+0x8081828384858687, 0x88898A8B8C8D8E8F,
+0x9091929394959697, 0x98999A9B9C9D9E9F,
+0xA0A1A2A3A4A5A6A7, 0xA8A9AAABACADAEAF,
+0xB0B1B2B3B4B5B6B7, 0xB8B9BABBBCBDBEBF,
+0xC0C1C2C3C4C5C6C7, 0xC8C9CACBCCCDCECF,
+0xD0D1D2D3D4D5D6D7, 0xD8D9DADBDCDDDEDF,
+0xE0E1E2E3E4E5E6E7, 0xE8E9EAEBECEDEEEF,
+0xF0F1F2F3F4F5F6F7, 0xF8F9FAFBFCFDFEFF
};
#if defined(__SSE2__)
@@ -894,7 +894,7 @@ static const __m256i final_b[16] =
};
static void
-bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
+bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv )
{
sc->H[ 0] = _mm256_set1_epi64x( 0x8081828384858687 );
sc->H[ 1] = _mm256_set1_epi64x( 0x88898A8B8C8D8E8F );
@@ -926,7 +926,7 @@ bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len )
size_t ptr;
const int buf_size = 128; // bytes of one lane, compatible with len
-sc->bit_count += (sph_u64)len << 3;
+sc->bit_count += (uint64_t)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
@@ -1377,7 +1377,7 @@ static const __m512i final_b8[16] =
void bmw512_8way_init( bmw512_8way_context *ctx )
-//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
+//bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv )
{
ctx->H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
ctx->H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for BMW-224.

View File

@@ -9,7 +9,6 @@
#include <immintrin.h>
#endif
#include "cubehash_sse2.h"
#include "algo/sha/sha3-defs.h"
#include <stdbool.h>
#include <unistd.h>
#include <memory.h>

View File

@@ -3,7 +3,7 @@
#include "compat.h"
#include <stdint.h>
#include "algo/sha/sha3-defs.h"
#include "compat/sha3-defs.h"
#define OPTIMIZE_SSE2

View File

@@ -42,7 +42,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for CubeHash-224.

View File

@@ -22,7 +22,7 @@
#endif
#include "algo/sha/sha3_common.h"
#include "compat/sha3_common.h"
#include <emmintrin.h>

View File

@@ -73,7 +73,7 @@ extern "C"{
#endif
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
#include "compat/aes_helper.c"
#if SPH_ECHO_64

View File

@@ -43,7 +43,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for ECHO-224.

View File

@@ -20,7 +20,7 @@
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif
#include "algo/sha/sha3_common.h"
#include "compat/sha3_common.h"
#include "simd-utils.h"

View File

@@ -2,7 +2,7 @@
#define SPH_FUGUE_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for GOST-256.

View File

@@ -20,8 +20,8 @@
#define LENGTH (512)
#include "brg_endian.h"
-#define NEED_UINT_64T
-#include "algo/sha/brg_types.h"
+//#define NEED_UINT_64T
+#include "compat/brg_types.h"
/* some sizes (number of bytes) */
#define ROWS (8)

View File

@@ -34,8 +34,7 @@ typedef crypto_uint64 u64;
//#define LENGTH (512)
#include "brg_endian.h"
-#define NEED_UINT_64T
-#include "algo/sha/brg_types.h"
+#include "compat/brg_types.h"
#ifdef IACA_TRACE
#include IACA_MARKS

View File

@@ -17,7 +17,7 @@ bool register_dmd_gr_algo( algo_gate_t *gate )
bool register_groestl_algo( algo_gate_t* gate )
{
register_dmd_gr_algo( gate );
-gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
return true;
};

View File

@@ -22,10 +22,6 @@
#define LENGTH (256)
//#include "brg_endian.h"
//#define NEED_UINT_64T
//#include "algo/sha/brg_types.h"
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include <string.h>
#include "aes_ni/hash-groestl.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#if defined(__VAES__)
#include "groestl512-hash-4way.h"
#endif

View File

@@ -40,7 +40,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if !defined(__AES__)
/**

File diff suppressed because it is too large

View File

@@ -36,44 +36,64 @@
#define HAMSI_4WAY_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if defined (__AVX2__)
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{
#endif
#define SPH_SIZE_hamsi512 512
// Hamsi-512 4x64
// Partial is only scalar but needs pointer ref for hamsi-helper
// deprecate partial_len
-typedef struct {
+typedef struct
+{
__m256i h[8];
__m256i buf[1];
size_t partial_len;
-sph_u32 count_high, count_low;
+uint32_t count_high, count_low;
} hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
size_t len );
//#define hamsi512_4way hamsi512_4way_update
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
#define hamsi512_4x64_context hamsi512_4way_context
#define hamsi512_4x64_init hamsi512_4way_init
#define hamsi512_4x64_update hamsi512_4way_update
#define hamsi512_4x64_close hamsi512_4way_close
// Hamsi-512 8x32
typedef struct
{
__m256i h[16];
__m256i buf[2];
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_8x32_big_context;
typedef hamsi_8x32_big_context hamsi512_8x32_context;
void hamsi512_8x32_init( hamsi512_8x32_context *sc );
void hamsi512_8x32_update( hamsi512_8x32_context *sc, const void *data,
size_t len );
void hamsi512_8x32_close( hamsi512_8x32_context *sc, void *dst );
void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
size_t len );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Hamsi-512 8x64
typedef struct {
__m512i h[8];
__m512i buf[1];
size_t partial_len;
-sph_u32 count_high, count_low;
+uint32_t count_high, count_low;
} hamsi_8way_big_context;
typedef hamsi_8way_big_context hamsi512_8way_context;
void hamsi512_8way_init( hamsi512_8way_context *sc );
@@ -81,15 +101,29 @@ void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
size_t len );
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
#define hamsi512_8x64_context hamsi512_8way_context
#define hamsi512_8x64_init hamsi512_8way_init
#define hamsi512_8x64_update hamsi512_8way_update
#define hamsi512_8x64_close hamsi512_8way_close
#endif
#ifdef __cplusplus
}
#endif
#endif
// Hamsi-512 16x32
typedef struct
{
__m512i h[16];
__m512i buf[2];
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_16x32_big_context;
typedef hamsi_16x32_big_context hamsi512_16x32_context;
void hamsi512_16x32_init( hamsi512_16x32_context *sc );
void hamsi512_16x32_update( hamsi512_16x32_context *sc, const void *data,
size_t len );
void hamsi512_16way_close( hamsi512_16x32_context *sc, void *dst );
void hamsi512_16x32_full( hamsi512_16x32_context *sc, void *dst,
const void *data, size_t len );
#endif // AVX512
#endif
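Note on the naming introduced in this header: an NxW suffix means N parallel lanes of W-bit words, so hamsi512_4x64 is the former hamsi512_4way (four 64-bit lanes in __m256i registers) and hamsi512_8x32 packs eight 32-bit lanes into the same register width; the #define aliases keep old call sites building. A hypothetical caller of the one-shot 8x32 function declared above (the word-interleaved lane layout is an assumption, inferred from the buffer shapes):

/* Sketch only: hash eight messages at once with the 8x32 variant.
   Assumes data and digests are 32-bit-word interleaved across lanes. */
void hamsi512_hash_8_lanes( void *digests, const void *interleaved_data,
                            size_t len_per_lane )
{
    hamsi512_8x32_context ctx;
    hamsi512_8x32_full( &ctx, digests, interleaved_data, len_per_lane );
}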

View File

@@ -36,7 +36,7 @@
#define SPH_HAMSI_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -48,7 +48,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
while ( len > 0 )
{
unsigned clen;
-sph_u32 clow, clow2;
+uint32_t clow, clow2;
clen = 128U - current;
if ( clen > len )
@@ -67,7 +67,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
current = 0;
}
clow = sc->count_low;
-clow2 = SPH_T32(clow + clen);
+clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high ++;
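Dropping SPH_T32() above is sound because count_low is now a genuine uint32_t: unsigned arithmetic wraps modulo 2^32 by definition in C, so the mask is redundant and the clow2 < clow test still detects the wrap that carries into count_high. A minimal self-contained sketch of the same idiom (names are illustrative, not from the commit):

#include <stdint.h>

typedef struct { uint32_t count_high, count_low; } counter64;

/* Add n to a 64-bit counter kept as two 32-bit halves. */
static void counter64_add( counter64 *c, uint32_t n )
{
    uint32_t clow  = c->count_low;
    uint32_t clow2 = clow + n;      /* wraps modulo 2^32, well defined */
    c->count_low = clow2;
    if ( clow2 < clow )             /* wrap occurred: propagate carry */
        c->count_high++;
}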

View File

@@ -292,7 +292,9 @@ static const unsigned MP5[32] = {
2, 23, 16, 22, 4, 1, 25, 15
};
-static const sph_u32 RK2[32] = {
+#define SPH_C32(x) (x)
+static const uint32_t RK2[32] = {
SPH_C32(0x452821E6), SPH_C32(0x38D01377),
SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
@@ -311,7 +313,7 @@ static const sph_u32 RK2[32] = {
SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5)
};
-static const sph_u32 RK3[32] = {
+static const uint32_t RK3[32] = {
SPH_C32(0x9C30D539), SPH_C32(0x2AF26013),
SPH_C32(0xC5D1B023), SPH_C32(0x286085F0),
SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF),
@@ -330,7 +332,7 @@ static const sph_u32 RK3[32] = {
SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C)
};
-static const sph_u32 RK4[32] = {
+static const uint32_t RK4[32] = {
SPH_C32(0x7A325381), SPH_C32(0x28958677),
SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF),
SPH_C32(0xC4BFE81B), SPH_C32(0x66282193),
@@ -349,7 +351,7 @@ static const sph_u32 RK4[32] = {
SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4)
};
-static const sph_u32 RK5[32] = {
+static const uint32_t RK5[32] = {
SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98),
SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176),
SPH_C32(0x66CA593E), SPH_C32(0x82430E88),
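The #define SPH_C32(x) (x) added above is a compatibility shim: with sph_types.h no longer included, the SPH_C32() wrappers throughout these large constant tables would not expand, and redefining the macro as the identity avoids rewriting every entry. A minimal illustration (hypothetical table name):

#define SPH_C32(x) (x)
/* SPH_C32(0x452821E6) now expands to the bare literal 0x452821E6. */
static const uint32_t rk_example[2] = { SPH_C32(0x452821E6), 0x38D01377 };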

View File

@@ -68,7 +68,6 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#define SPH_SIZE_haval256_5 256
@@ -77,7 +76,7 @@ typedef struct {
__m128i buf[32];
__m128i s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
-sph_u32 count_high, count_low;
+uint32_t count_high, count_low;
} haval_4way_context;
typedef haval_4way_context haval256_5_4way_context;

View File

@@ -66,7 +66,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for HAVAL-128/3.

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for JH-224.

View File

@@ -2,7 +2,6 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)

View File

@@ -9,7 +9,7 @@ int hard_coded_eb = 1;
bool register_keccak_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
-gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 128.0;
#if defined (KECCAK_8WAY)
gate->scanhash = (void*)&scanhash_keccak_8way;

View File

@@ -1,45 +1,6 @@
/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
/**
* Keccak interface. This is the interface for Keccak with the
* recommended parameters for SHA-3, with output lengths 224, 256,
* 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_keccak.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef KECCAK_HASH_4WAY_H__
#define KECCAK_HASH_4WAY_H__
#ifdef __cplusplus
extern "C"{
#endif
#ifdef __AVX2__
#include <stddef.h>
@@ -100,8 +61,4 @@ void keccak512_4way_addbits_and_close(
#endif
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -2,7 +2,6 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for Keccak-224.

View File

@@ -23,7 +23,6 @@
#define LANE_H
#include <string.h>
//#include "algo/sha/sha3-defs.h"
#include <stdint.h>
typedef unsigned char BitSequence;

View File

@@ -7,8 +7,10 @@
#include "simd-utils.h"
#define uint32 uint32_t
/* initial values of chaining variables */
-static const uint32 IV[40] __attribute((aligned(64))) = {
+static const uint32_t IV[40] __attribute((aligned(64))) = {
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
@@ -22,7 +24,7 @@ static const uint32 IV[40] __attribute((aligned(64))) = {
};
/* Round Constants */
-static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
+static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,

View File

@@ -23,7 +23,7 @@
#if defined(__AVX2__)
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
//#include "algo/sha/sha3-defs.h"
#include "simd-utils.h"
/* The length of digests*/
@@ -54,7 +54,7 @@
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
-uint32 buffer[8*4];
+uint32_t buffer[8*4];
__m512i chainv[10]; /* Chaining values */
int hashbitlen;
int rembytes;
@@ -82,7 +82,7 @@ int luffa512_4way_update_close( luffa_4way_context *state, void *output,
#endif
typedef struct {
-uint32 buffer[8*2];
+uint32_t buffer[8*2];
__m256i chainv[10]; /* Chaining values */
int hashbitlen;
int rembytes;

View File

@@ -22,7 +22,7 @@
*/
#include <emmintrin.h>
#include "algo/sha/sha3-defs.h"
#include "compat/sha3-defs.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for Luffa-224.

View File

@@ -21,9 +21,8 @@
#define LYRA2_H_
#include <stdint.h>
#include "algo/sha/sha3-defs.h"
//typedef unsigned char byte;
typedef unsigned char byte;
//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)

View File

@@ -4,7 +4,6 @@
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/bmw/sph_bmw.h"

View File

@@ -4,7 +4,6 @@
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/cubehash_sse2.h"
//#include "lyra2.h"

View File

@@ -58,7 +58,7 @@
#define SPH_PANAMA_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for PANAMA.

View File

@@ -21,7 +21,7 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"

View File

@@ -3,7 +3,8 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#include "algo/sha/sha512-hash.h"
#include "ripemd-hash-4way.h"
#define LBRY_INPUT_SIZE 112

View File

@@ -2,7 +2,6 @@
#define RIPEMD_HASH_4WAY_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if defined(__SSE4_2__)

View File

@@ -57,7 +57,7 @@
#define SPH_RIPEMD_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for RIPEMD.

View File

@@ -31,7 +31,6 @@
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#include <mm_malloc.h>
#include "malloc-huge.h"

View File

@@ -1,392 +0,0 @@
/* $Id: aes_helper.c 220 2010-06-09 09:21:50Z tp $ */
/*
* AES tables. This file is not meant to be compiled by itself; it
* is included by some hash function implementations. It contains
* the precomputed tables and helper macros for evaluating an AES
* round, optionally with a final XOR with a subkey.
*
* By default, this file defines the tables and macros for little-endian
* processing (i.e. it is assumed that the input bytes have been read
* from memory and assembled with the little-endian convention). If
* the 'AES_BIG_ENDIAN' macro is defined (to a non-zero integer value)
* when this file is included, then the tables and macros for big-endian
* processing are defined instead. The big-endian tables and macros have
* names distinct from the little-endian tables and macros, hence it is
* possible to have both simultaneously, by including this file twice
* (with and without the AES_BIG_ENDIAN macro).
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include "sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif
#if AES_BIG_ENDIAN
#define AESx(x) ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \
| ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \
| ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \
| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
#define AES0 AES0_BE
#define AES1 AES1_BE
#define AES2 AES2_BE
#define AES3 AES3_BE
#define AES_ROUND_BE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \
(Y0) = AES0[((X0) >> 24) & 0xFF] \
^ AES1[((X1) >> 16) & 0xFF] \
^ AES2[((X2) >> 8) & 0xFF] \
^ AES3[(X3) & 0xFF] ^ (K0); \
(Y1) = AES0[((X1) >> 24) & 0xFF] \
^ AES1[((X2) >> 16) & 0xFF] \
^ AES2[((X3) >> 8) & 0xFF] \
^ AES3[(X0) & 0xFF] ^ (K1); \
(Y2) = AES0[((X2) >> 24) & 0xFF] \
^ AES1[((X3) >> 16) & 0xFF] \
^ AES2[((X0) >> 8) & 0xFF] \
^ AES3[(X1) & 0xFF] ^ (K2); \
(Y3) = AES0[((X3) >> 24) & 0xFF] \
^ AES1[((X0) >> 16) & 0xFF] \
^ AES2[((X1) >> 8) & 0xFF] \
^ AES3[(X2) & 0xFF] ^ (K3); \
} while (0)
#define AES_ROUND_NOKEY_BE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
AES_ROUND_BE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
#else
#define AESx(x) SPH_C32(x)
#define AES0 AES0_LE
#define AES1 AES1_LE
#define AES2 AES2_LE
#define AES3 AES3_LE
#define AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \
(Y0) = AES0[(X0) & 0xFF] \
^ AES1[((X1) >> 8) & 0xFF] \
^ AES2[((X2) >> 16) & 0xFF] \
^ AES3[((X3) >> 24) & 0xFF] ^ (K0); \
(Y1) = AES0[(X1) & 0xFF] \
^ AES1[((X2) >> 8) & 0xFF] \
^ AES2[((X3) >> 16) & 0xFF] \
^ AES3[((X0) >> 24) & 0xFF] ^ (K1); \
(Y2) = AES0[(X2) & 0xFF] \
^ AES1[((X3) >> 8) & 0xFF] \
^ AES2[((X0) >> 16) & 0xFF] \
^ AES3[((X1) >> 24) & 0xFF] ^ (K2); \
(Y3) = AES0[(X3) & 0xFF] \
^ AES1[((X0) >> 8) & 0xFF] \
^ AES2[((X1) >> 16) & 0xFF] \
^ AES3[((X2) >> 24) & 0xFF] ^ (K3); \
} while (0)
#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
#endif
/*
* The AES*[] tables allow us to perform a fast evaluation of an AES
* round; table AESi[] combines SubBytes for a byte at row i, and
* MixColumns for the column where that byte goes after ShiftRows.
*/
static const sph_u32 AES0[256] = {
AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC),
AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA),
AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB),
AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45),
AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B),
AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C),
AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83),
AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9),
AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A),
AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D),
AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F),
AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF),
AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA),
AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34),
AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B),
AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D),
AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413),
AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1),
AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6),
AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972),
AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85),
AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED),
AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511),
AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE),
AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B),
AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05),
AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1),
AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142),
AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF),
AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3),
AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E),
AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A),
AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6),
AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3),
AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B),
AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428),
AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD),
AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14),
AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8),
AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4),
AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2),
AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA),
AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949),
AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF),
AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810),
AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C),
AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697),
AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E),
AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F),
AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC),
AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C),
AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969),
AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27),
AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122),
AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433),
AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9),
AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5),
AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A),
AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0),
AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E),
AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
};
static const sph_u32 AES1[256] = {
AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A),
AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87),
AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B),
AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA),
AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B),
AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A),
AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F),
AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908),
AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F),
AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E),
AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5),
AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D),
AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F),
AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E),
AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB),
AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE),
AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397),
AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C),
AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED),
AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B),
AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A),
AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16),
AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194),
AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81),
AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3),
AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A),
AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104),
AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263),
AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D),
AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F),
AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39),
AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47),
AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695),
AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F),
AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83),
AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C),
AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76),
AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E),
AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4),
AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6),
AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B),
AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7),
AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0),
AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25),
AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018),
AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72),
AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751),
AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21),
AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85),
AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA),
AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12),
AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0),
AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9),
AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233),
AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7),
AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920),
AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A),
AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17),
AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8),
AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11),
AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
};
static const sph_u32 AES2[256] = {
AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76),
AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D),
AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0),
AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF),
AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0),
AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26),
AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC),
AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1),
AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15),
AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3),
AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A),
AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2),
AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75),
AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A),
AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0),
AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3),
AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784),
AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED),
AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B),
AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39),
AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF),
AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB),
AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485),
AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F),
AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8),
AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F),
AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5),
AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321),
AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2),
AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC),
AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917),
AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D),
AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573),
AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC),
AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388),
AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14),
AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB),
AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A),
AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C),
AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662),
AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79),
AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D),
AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9),
AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA),
AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808),
AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E),
AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6),
AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F),
AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A),
AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66),
AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E),
AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9),
AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E),
AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311),
AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794),
AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9),
AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF),
AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D),
AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868),
AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F),
AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
};
static const sph_u32 AES3[256] = {
AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676),
AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D),
AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0),
AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF),
AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0),
AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626),
AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC),
AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1),
AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515),
AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3),
AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A),
AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2),
AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575),
AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A),
AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0),
AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3),
AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484),
AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED),
AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B),
AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939),
AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF),
AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB),
AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585),
AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F),
AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8),
AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F),
AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5),
AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121),
AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2),
AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC),
AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717),
AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D),
AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373),
AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC),
AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888),
AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414),
AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB),
AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A),
AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C),
AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262),
AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979),
AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D),
AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9),
AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA),
AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808),
AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E),
AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6),
AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F),
AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A),
AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666),
AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E),
AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9),
AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E),
AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111),
AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494),
AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9),
AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF),
AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D),
AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868),
AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F),
AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
};
#ifdef __cplusplus
}
#endif
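The deleted helper above is the classic T-table AES construction its own comments describe: each AESi[] entry fuses SubBytes for a byte in row i with the MixColumns contribution of the column that byte reaches after ShiftRows, so one round column costs four table lookups and four XORs. An illustrative reduction of the deleted AES_ROUND_LE macro to a single output word (little-endian layout, as in the macro; not part of the commit):

#include <stdint.h>

extern const uint32_t AES0[256], AES1[256], AES2[256], AES3[256];

/* One output column of an AES round: 4 table lookups + round-key XOR. */
static inline uint32_t aes_round_column_le( uint32_t x0, uint32_t x1,
                                            uint32_t x2, uint32_t x3,
                                            uint32_t k0 )
{
    return AES0[  x0        & 0xFF ]
         ^ AES1[ (x1 >>  8) & 0xFF ]
         ^ AES2[ (x2 >> 16) & 0xFF ]
         ^ AES3[ (x3 >> 24) & 0xFF ]
         ^ k0;
}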

View File

@@ -1,234 +0,0 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
(a few lines added by Soeren S. Thomsen, October 2008)
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
The unsigned integer types defined here are of the form uint_<nn>t where
<nn> is the length of the type; for example, the unsigned 32-bit type is
'uint_32t'. These are NOT the same as the 'C99 integer types' that are
defined in the inttypes.h and stdint.h headers since attempts to use these
types have shown that support for them is still highly variable. However,
since the latter are of the form uint<nn>_t, a regular expression search
and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
can be used to convert the types used here to the C99 standard types.
*/
#ifndef _BRG_TYPES_H
#define _BRG_TYPES_H
#if defined(__cplusplus)
extern "C" {
#endif
#include <limits.h>
#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
# include <stddef.h>
# define ptrint_t intptr_t
#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 )
# include <stdint.h>
# define ptrint_t intptr_t
#else
# define ptrint_t int
#endif
#ifndef BRG_UI8
# define BRG_UI8
# if UCHAR_MAX == 255u
typedef unsigned char uint_8t;
# else
# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
# endif
#endif
#ifndef BRG_UI16
# define BRG_UI16
# if USHRT_MAX == 65535u
typedef unsigned short uint_16t;
# else
# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
# endif
#endif
#ifndef BRG_UI32
# define BRG_UI32
# if UINT_MAX == 4294967295u
# define li_32(h) 0x##h##u
typedef unsigned int uint_32t;
# elif ULONG_MAX == 4294967295u
# define li_32(h) 0x##h##ul
typedef unsigned long uint_32t;
# elif defined( _CRAY )
# error This code needs 32-bit data types, which Cray machines do not provide
# else
# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
# endif
#endif
#ifndef BRG_UI64
# if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
# define BRG_UI64
# define li_64(h) 0x##h##ui64
typedef unsigned __int64 uint_64t;
# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */
# define BRG_UI64
# define li_64(h) 0x##h##ui64
typedef unsigned __int64 uint_64t;
# elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# elif defined( __MVS__ )
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned int long long uint_64t;
# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
# if UINT_MAX == 18446744073709551615u
# define BRG_UI64
# define li_64(h) 0x##h##u
typedef unsigned int uint_64t;
# endif
# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
# if ULONG_MAX == 18446744073709551615ul
# define BRG_UI64
# define li_64(h) 0x##h##ul
typedef unsigned long uint_64t;
# endif
# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
# if ULLONG_MAX == 18446744073709551615ull
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# endif
# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
# if ULONG_LONG_MAX == 18446744073709551615ull
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# endif
# endif
#endif
#if !defined( BRG_UI64 )
# if defined( NEED_UINT_64T )
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
/*# error Please define uint_64t as an unsigned 64 bit type in brg_types.h*/
# endif
#endif
#ifndef RETURN_VALUES
# define RETURN_VALUES
# if defined( DLL_EXPORT )
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
# define VOID_RETURN __declspec( dllexport ) void __stdcall
# define INT_RETURN __declspec( dllexport ) int __stdcall
# elif defined( __GNUC__ )
# define VOID_RETURN __declspec( __dllexport__ ) void
# define INT_RETURN __declspec( __dllexport__ ) int
# else
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
# endif
# elif defined( DLL_IMPORT )
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
# define VOID_RETURN __declspec( dllimport ) void __stdcall
# define INT_RETURN __declspec( dllimport ) int __stdcall
# elif defined( __GNUC__ )
# define VOID_RETURN __declspec( __dllimport__ ) void
# define INT_RETURN __declspec( __dllimport__ ) int
# else
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
# endif
# elif defined( __WATCOMC__ )
# define VOID_RETURN void __cdecl
# define INT_RETURN int __cdecl
# else
# define VOID_RETURN void
# define INT_RETURN int
# endif
#endif
/* These defines are used to detect and set the memory alignment of pointers.
Note that offsets are in bytes.
ALIGN_OFFSET(x,n) return the positive or zero offset of
the memory addressed by the pointer 'x'
from an address that is aligned on an
'n' byte boundary ('n' is a power of 2)
ALIGN_FLOOR(x,n) return a pointer that points to memory
that is aligned on an 'n' byte boundary
and is not higher than the memory address
pointed to by 'x' ('n' is a power of 2)
ALIGN_CEIL(x,n) return a pointer that points to memory
that is aligned on an 'n' byte boundary
and is not lower than the memory address
pointed to by 'x' ('n' is a power of 2)
*/
#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1))
#define ALIGN_FLOOR(x,n) ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1)))
#define ALIGN_CEIL(x,n) ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1)))
/* These defines are used to declare buffers in a way that allows
faster operations on longer variables to be used. In all these
defines 'size' must be a power of 2 and >= 8. NOTE that the
buffer size is in bytes but the type length is in bits
UNIT_TYPEDEF(x,size) declares a variable 'x' of length
'size' bits
BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize'
bytes defined as an array of variables
each of 'size' bits (bsize must be a
multiple of size / 8)
UNIT_CAST(x,size) casts a variable to a type of
length 'size' bits
UPTR_CAST(x,size) casts a pointer to a pointer to a
variable of length 'size' bits
*/
#define UI_TYPE(size) uint_##size##t
#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x
#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[bsize / (size >> 3)]
#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x))
#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x))
/* Added by Soeren S. Thomsen (begin) */
#define u8 uint_8t
#define u32 uint_32t
#define u64 uint_64t
/* (end) */
#if defined(__cplusplus)
}
#endif
#endif
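The ALIGN_* macros in this deleted header are standard power-of-two pointer arithmetic. A worked sketch using the C99 equivalents of its types (intptr_t for ptrint_t, uint8_t for uint_8t), for a pointer whose integer value is 0x1003 and n = 8:

#include <stdint.h>

#define ALIGN_OFFSET(x,n) (((intptr_t)(x)) & ((n) - 1))
#define ALIGN_FLOOR(x,n)  ((uint8_t*)(x) - ( ((intptr_t)(x)) & ((n) - 1)))
#define ALIGN_CEIL(x,n)   ((uint8_t*)(x) + (-((intptr_t)(x)) & ((n) - 1)))

/* For (intptr_t)p == 0x1003 and n == 8:
   ALIGN_OFFSET(p,8) == 0x1003 & 7 == 3                  -- 3 bytes past a boundary
   ALIGN_FLOOR(p,8)  == p - 3, i.e. 0x1000               -- round down
   ALIGN_CEIL(p,8)   == p + ((-0x1003) & 7) == p + 5,
                        i.e. 0x1008                      -- round up */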

View File

@@ -36,7 +36,7 @@
#include <sys/types.h>
#include <stdint.h>
#include "simd-utils.h"
#include "sha-hash-4way.h"
#include "sha256-hash.h"
typedef struct _hmac_sha256_4way_context
{

View File

@@ -1,168 +0,0 @@
/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */
/**
* SHA-224, SHA-256, SHA-384 and SHA-512 interface.
*
* SHA-256 has been published in FIPS 180-2, now amended with a change
* notice to include SHA-224 as well (which is a simple variation on
* SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS
* standards can be found at:
* http://csrc.nist.gov/publications/fips/
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_sha2.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SHA2_HASH_4WAY_H__
#define SHA2_HASH_4WAY_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#if defined(__SSE2__)
// SHA-256 4 way
typedef struct {
__m128i buf[64>>2];
__m128i val[8];
uint32_t count_high, count_low;
} sha256_4way_context __attribute__ ((aligned (64)));
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
const __m128i *W, const __m128i *state_in );
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const uint32_t *target );
#endif // SSE2
#if defined (__AVX2__)
// SHA-256 8 way
typedef struct {
__m256i buf[64>>2];
__m256i val[8];
uint32_t count_high, count_low;
} sha256_8way_context __attribute__ ((aligned (128)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
void sha256_8way_full( void *dst, const void *data, size_t len );
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
const __m256i *W, const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-256 16 way
typedef struct {
__m512i buf[64>>2];
__m512i val[8];
uint32_t count_high, count_low;
} sha256_16way_context __attribute__ ((aligned (128)));
void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
void sha256_16way_full( void *dst, const void *data, size_t len );
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
const __m512i *W, const __m512i *state_in );
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target );
#endif // AVX512
#if defined (__AVX2__)
// SHA-512 4 way
typedef struct {
__m256i buf[128>>3];
__m256i val[8];
uint64_t count;
bool initialized;
} sha512_4way_context __attribute__ ((aligned (128)));
void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way_update( sha512_4way_context *sc, const void *data,
size_t len );
void sha512_4way_close( sha512_4way_context *sc, void *dst );
void sha512_4way_full( void *dst, const void *data, size_t len );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
bool initialized;
} sha512_8way_context __attribute__ ((aligned (128)));
void sha512_8way_init( sha512_8way_context *sc);
void sha512_8way_update( sha512_8way_context *sc, const void *data,
size_t len );
void sha512_8way_close( sha512_8way_context *sc, void *dst );
void sha512_8way_full( void *dst, const void *data, size_t len );
#endif // AVX512
#endif // SHA256_4WAY_H__
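Throughout these N-way contexts the lanes are word-interleaved: word i of each vector in buf/val holds word i of every lane, which is why len in the update/full calls is the byte count of a single lane (compare the "bytes of one lane" comments in the BMW hunks above). A hypothetical caller of the deleted 4-way one-shot function, under that layout assumption:

#include <stdint.h>
#include <stddef.h>

void sha256_4way_full( void *dst, const void *data, size_t len );

/* Hash four 80-byte block headers in one call.
   data4 must be word-interleaved: data4[4*i + lane] = word i of that lane. */
void hash4_headers( uint32_t hash4[8*4], const uint32_t data4[20*4] )
{
    sha256_4way_full( hash4, data4, 80 );   /* 80 = bytes per lane */
}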

View File

@@ -1,689 +0,0 @@
/* Intel SHA extensions using C intrinsics */
/* Written and placed in public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */
// A stripped down version with byte swapping removed.
#if defined(__SHA__)
#include "sha256-hash.h"
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y;
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
__m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y;
// Load initial values
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
// Save current hash
ABEF_SAVE_X = STATE0_X;
ABEF_SAVE_Y = STATE0_Y;
CDGH_SAVE_X = STATE1_X;
CDGH_SAVE_Y = STATE1_Y;
// Rounds 0-3
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 4-7
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 8-11
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 12-15
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 16-19
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 20-23
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 24-27
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 28-31
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 32-35
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 36-39
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 40-43
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 44-47
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 48-51
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 52-55
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 56-59
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 60-63
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Add values back to state
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t *out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK;
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
__m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;
// Load initial values
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
// Save current hash
ABEF_SAVE_X = STATE0_X;
ABEF_SAVE_Y = STATE0_Y;
CDGH_SAVE_X = STATE1_X;
CDGH_SAVE_Y = STATE1_Y;
// Rounds 0-3
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK );
TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK );
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 4-7
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK );
TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK );
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 8-11
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK );
TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK );
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 12-15
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK );
TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK );
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 16-19
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 20-23
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 24-27
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 28-31
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 32-35
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 36-39
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 40-43
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 44-47
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 48-51
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 52-55
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 56-59
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 60-63
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Add values back to state
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
#endif
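The X/Y pairing is a latency-hiding device: _mm_sha256rnds2_epu32 has multi-cycle latency, and interleaving two independent streams keeps the SHA unit fed while each stream's previous round is still in flight. A hedged call sketch; buffer initialization is left to the caller and the helper name is illustrative:

#include <stdint.h>

// Advance two unrelated midstates by one message block each in a single
// call. The transform uses aligned loads and stores, so all four buffers
// must be 16-byte aligned.
static void sha256_2way_example( uint32_t state_a[8], uint32_t state_b[8],
                                 const uint32_t block_a[16],
                                 const uint32_t block_b[16] )
{
   sha256_ni2way_transform_le( state_a, state_b, block_a, block_b,
                               state_a, state_b );
}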

View File

@@ -3,7 +3,7 @@
#include <stddef.h>
#include <string.h>
#include "sha-hash-4way.h"
#include "sha256-hash.h"
#include "compat.h"
/*
@@ -610,6 +610,16 @@ do { \
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j );
// Not used with AVX512 but defined to satisfy the compiler
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
}
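The _NOMSG round is the ordinary round specialized for a message word known to be zero, i.e. the zero padding words of the final block; dropping the W[i] add is the entire savings. A scalar illustration of the identity it relies on, in plain C:

#include <stdint.h>

// T1 of a SHA-256 round with and without the message word: the two agree
// exactly when w == 0, which holds for the zero-padding words.
static inline uint32_t round_t1( uint32_t h, uint32_t bsg1_e,
                                 uint32_t ch_efg, uint32_t k, uint32_t w )
{
   return h + bsg1_e + ch_efg + k + w;
}

static inline uint32_t round_t1_nomsg( uint32_t h, uint32_t bsg1_e,
                                       uint32_t ch_efg, uint32_t k )
{
   return h + bsg1_e + ch_efg + k;   // == round_t1( ..., w = 0 )
}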
#else // AVX2
#define CHx(X, Y, Z) \
@@ -621,6 +631,16 @@ do { \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
}
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
@@ -635,7 +655,6 @@ do { \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)
// read Y_xor_Z, update X_xor_Y
#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
@@ -769,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
// round 3 part 1, ignore nonces W[3]
// round 3 part 1, avoid nonces W[3]
T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D),
_mm256_set1_epi32( K256[3] ) );
A = _mm256_add_epi32( A, T1 );
@@ -807,23 +826,22 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif
// round 3 part 2, inject nonces
// round 3 part 2, add nonces
A = _mm256_add_epi32( A, W[3] );
E = _mm256_add_epi32( E, W[3] );
// SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_8WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
W[ 0] = X[ 0];
W[ 1] = X[ 1];
@@ -865,6 +883,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -887,8 +906,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
_mm256_store_si256( state_out + 7, H );
}
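The split is sound because rounds 0-2 consume only W[0..2], which are fixed per work template; the nonce first appears in W[3], so round 3's nonce-free T1 can be folded in ahead of time. A scalar view of the completion done at the top of final_rounds; names are illustrative:

#include <stdint.h>

// Prehash already folded T1' = E + BSG2_1(B) + CH(B,C,D) + K256[3] into
// the A and E accumulators; finishing round 3 only needs the nonce.
static inline void complete_round3( uint32_t *A, uint32_t *E, uint32_t w3 )
{
   *A += w3;   // A = A_prehash + W[3]
   *E += w3;   // E = E_prehash + W[3]
}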
// It works with a high hit rate but overall performance is lower
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target )
{
@@ -912,14 +929,37 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i IV7 = H;
const __m256i IV6 = G;
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
for ( int j = 16; j < 48; j += 16 )
{
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j );
}
// rounds 0 to 15, ignore zero padding W[9..14]
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
// rounds 16 to 31
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
// rounds 32 to 47
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
// rounds 48 to 60 mexp
W[ 0] = SHA256_8WAY_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
W[ 1] = SHA256_8WAY_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA256_8WAY_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
@@ -935,9 +975,10 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
// rounds 48 to 57
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
@@ -968,7 +1009,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
return 0;
return 0;
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
@@ -983,28 +1024,29 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
if ( t6_mask )
{
// Testing H inconclusive: hash7 == target7, need to test G
// Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
if ( unlikely( 0 != ( t6_mask & mm256_movmask_32(
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
return 0;
else
{
flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0 != ( t6_mask & ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) ))
return 0;
else if ( likely( target[6] == 0x80000000 ))
{
if ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
hash, _mm256_xor_si256( hash, hash ) ) ) ) )
return 0;
}
if ( likely( ( target[6] == 0x80000000 )
&& ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
hash, _mm256_xor_si256( hash, hash ) ) ) ) ) ))
return 0;
}
// else inconclusive; testing targ5 isn't practical, finish hashing
}
// At this point either the hash is good or the test was inconclusive.
// If the latter, it's probably a high-difficulty target with a nearly equal
// high-difficulty hash that still has a good chance of being good.
// (A scalar sketch of this screen follows this hunk.)
// rounds 59 to 61 part 2
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ),
MAJx( F, G, H ) ) );
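For reference, a scalar single-lane version of the screen above, assuming the lane's word 7 has already been byte-swapped into the target's byte order; the function name is illustrative:

#include <stdint.h>
#include <stdbool.h>

// A lane is definitively rejected when its byte-swapped word 7 exceeds
// target[7] as an unsigned value; equality is the inconclusive case that
// falls through to testing word 6. The movemask/flip dance above emulates
// exactly this unsigned compare with AVX2's signed instructions.
static inline bool lane_rejected( uint32_t hash7, uint32_t targ7 )
{
   return hash7 > targ7;
}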
@@ -1179,6 +1221,15 @@ do { \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
_mm512_set1_epi32( K256[(i)+(j)] ) ); \
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
}
/*
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
do { \
@@ -1292,7 +1343,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
// round 3 part 1, ignore nonces W[3]
// round 3 part 1, avoid nonces W[3]
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[3] ) );
A = _mm512_add_epi32( A, T1 );
@@ -1312,7 +1363,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
{
__m512i A, B, C, D, E, F, G, H, T1, T2;
__m512i A, B, C, D, E, F, G, H;
__m512i W[16];
memcpy_512( W, data, 16 );
@@ -1326,87 +1377,25 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
G = _mm512_load_si512( state_mid + 6 );
H = _mm512_load_si512( state_mid + 7 );
// round 3 part 2, inject nonces
// round 3 part 2, add nonces
A = _mm512_add_epi32( A, W[3] );
E = _mm512_add_epi32( E, W[3] );
// round 4
SHA256_16WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
// rounds 4 to 15, ignore zero padding W[5..14]
SHA256_16WAY_ROUND ( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_16WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_16WAY_ROUND ( B, C, D, E, F, G, H, A, 15, 0 );
// round 5
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
_mm512_set1_epi32( K256[5] ) );
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
G = _mm512_add_epi32( G, T1 );
C = _mm512_add_epi32( T1, T2 );
// round 6
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
_mm512_set1_epi32( K256[6] ) );
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
F = _mm512_add_epi32( F, T1 );
B = _mm512_add_epi32( T1, T2 );
// round 7
T1 = mm512_add4_32( A, BSG2_1x16(F), CHx16(F, G, H),
_mm512_set1_epi32( K256[7] ) );
T2 = _mm512_add_epi32( BSG2_0x16(B), MAJx16(B, C, D) );
E = _mm512_add_epi32( E, T1 );
A = _mm512_add_epi32( T1, T2 );
// round 8
T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G),
_mm512_set1_epi32( K256[8] ) );
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) );
D = _mm512_add_epi32( D, T1 );
H = _mm512_add_epi32( T1, T2 );
// round 9
T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F),
_mm512_set1_epi32( K256[9] ) );
T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) );
C = _mm512_add_epi32( C, T1 );
G = _mm512_add_epi32( T1, T2 );
// round 10
T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E),
_mm512_set1_epi32( K256[10] ) );
T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) );
B = _mm512_add_epi32( B, T1 );
F = _mm512_add_epi32( T1, T2 );
// round 11
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[11] ) );
T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) );
A = _mm512_add_epi32( A, T1 );
E = _mm512_add_epi32( T1, T2 );
// round 12
T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C),
_mm512_set1_epi32( K256[12] ) );
T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) );
H = _mm512_add_epi32( H, T1 );
D = _mm512_add_epi32( T1, T2 );
// round 13
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
_mm512_set1_epi32( K256[13] ) );
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
G = _mm512_add_epi32( G, T1 );
C = _mm512_add_epi32( T1, T2 );
// round 14
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
_mm512_set1_epi32( K256[14] ) );
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
F = _mm512_add_epi32( F, T1 );
B = _mm512_add_epi32( T1, T2 );
// round 15
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
// rounds 16 to 31 mexp part 2, inject nonces.
// rounds 16 to 31 mexp part 2, add nonces.
W[ 0] = X[ 0];
W[ 1] = X[ 1];
W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
@@ -1428,6 +1417,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
// rounds 32 to 63
W[ 0] = _mm512_add_epi32( X[ 6], _mm512_add_epi32( SSG2_1x16( W[14] ),
W[ 9] ) );
W[ 1] = SHA256_16WAY_MEXP( W[15], W[10], W[ 2], W[ 1] );
@@ -1505,41 +1495,12 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
// rounds 9 to 14, ignore zero padding
T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F),
_mm512_set1_epi32( K256[9] ) );
T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) );
C = _mm512_add_epi32( C, T1 );
G = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E),
_mm512_set1_epi32( K256[10] ) );
T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) );
B = _mm512_add_epi32( B, T1 );
F = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[11] ) );
T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) );
A = _mm512_add_epi32( A, T1 );
E = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C),
_mm512_set1_epi32( K256[12] ) );
T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) );
H = _mm512_add_epi32( H, T1 );
D = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
_mm512_set1_epi32( K256[13] ) );
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
G = _mm512_add_epi32( G, T1 );
C = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
_mm512_set1_epi32( K256[14] ) );
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
F = _mm512_add_epi32( F, T1 );
B = _mm512_add_epi32( T1, T2 );
SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
// round 15
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
@@ -1575,7 +1536,6 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
// rounds 32 to 47
SHA256_MEXP_16WAY_16ROUNDS( W );
SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
// rounds 48 to 60 mexp
@@ -1640,8 +1600,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
{
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
targ = _mm512_set1_epi32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask,
hash, targ ) ))
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
return 0;
}
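Note that the AVX-512 path needs none of the AVX2 sign-flip gymnastics: masked unsigned compares are native. A minimal illustration of the pattern; the wrapper is illustrative:

#include <immintrin.h>
#include <stdint.h>

// For the lanes set in 'alive', test hash <= target directly; a zero
// result mask means no live lane can possibly meet the target.
static inline int any_lane_can_hit( __mmask16 alive, __m512i hash,
                                    uint32_t targ )
{
   return 0 != _mm512_mask_cmple_epu32_mask( alive, hash,
                                             _mm512_set1_epi32( (int)targ ) );
}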

View File

@@ -1,388 +0,0 @@
/* Intel SHA extensions using C intrinsics */
/* Written and placed in the public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */
// A stripped-down version with byte swapping removed.
#if defined(__SHA__)
#include "sha256-hash.h"
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state_in[0]);
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
// TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state_out[0], STATE0);
_mm_store_si128((__m128i*) &state_out[4], STATE1);
}
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state_in[0]);
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
TMSG0 = _mm_shuffle_epi8( TMSG0, MASK );
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state_out[0], STATE0);
_mm_store_si128((__m128i*) &state_out[4], STATE1);
}
#endif
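For reference, the mirror-image prologue that loads the canonical SHA-256 state words a..h into the ABEF/CDGH register layout consumed by _mm_sha256rnds2_epu32 looks roughly like this (a sketch following Intel's published SHA-NI example; variable names are illustrative, not code from this commit):

// Load state as two rows of four 32-bit words: state[0..3] = a,b,c,d (DCBA
// in register order), state[4..7] = e,f,g,h (HGFE), then repack.
TMP    = _mm_loadu_si128( (const __m128i*) &state[0] );  // DCBA
STATE1 = _mm_loadu_si128( (const __m128i*) &state[4] );  // HGFE
TMP    = _mm_shuffle_epi32( TMP, 0xB1 );                 // CDAB
STATE1 = _mm_shuffle_epi32( STATE1, 0x1B );              // EFGH
STATE0 = _mm_alignr_epi8( TMP, STATE1, 8 );              // ABEF
STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 );           // CDGH

The epilogue above (shuffle, blend, alignr before the final stores) simply inverts this packing so the caller sees the state back in natural a..h order.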

File diff suppressed because it is too large

View File

@@ -4,17 +4,18 @@
#include <stddef.h>
#include "simd-utils.h"
#include "cpuminer-config.h"
#include "sph_sha2.h"
// generic interface
typedef struct {
typedef struct
{
unsigned char buf[64]; /* first field, for alignment */
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
static const uint32_t SHA256_IV[8];
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
@@ -41,20 +42,113 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
uint32_t *sstate, const uint32_t *istate );
void sha256_ni2way_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Select target
// with SHA...
#define sha256_transform_le sha256_opt_transform_le
#define sha256_transform_be sha256_opt_transform_be
#else
// without SHA...
#include "sph_sha2.h"
#define sha256_transform_le sph_sha256_transform_le
#define sha256_transform_be sph_sha256_transform_be
#endif
// SHA can't do only 3 rounds
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-256 16 way
typedef struct
{
__m512i buf[64>>2];
__m512i val[8];
uint32_t count_high, count_low;
} sha256_16way_context __attribute__ ((aligned (128)));
void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
void sha256_16way_full( void *dst, const void *data, size_t len );
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
const __m512i *W, const __m512i *state_in );
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target );
#endif // AVX512
#if defined (__AVX2__)
// SHA-256 8 way
typedef struct
{
__m256i buf[64>>2];
__m256i val[8];
uint32_t count_high, count_low;
} sha256_8way_context __attribute__ ((aligned (64)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
void sha256_8way_full( void *dst, const void *data, size_t len );
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
const __m256i *W, const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target );
#endif // AVX2
#if defined(__SSE2__)
// SHA-256 4 way
typedef struct
{
__m128i buf[64>>2];
__m128i val[8];
uint32_t count_high, count_low;
} sha256_4way_context __attribute__ ((aligned (32)));
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
const __m128i *W, const __m128i *state_in );
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const uint32_t *target );
#endif // SSE2
#endif
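A minimal sketch of how the prehash/final-rounds pair declared above is intended to be used, shown for the 4-way SSE2 variant (the function name, loop bound, and surrounding setup are illustrative assumptions, not code from this commit; assumes this header and <string.h> are included):

static void sha256d_4way_sketch( __m128i *buf, const __m128i *first_block,
                                 const __m128i *istate )
{
   __m128i mstate1[8], mstate2[8], mexp_pre[8], block[16], hash[8];

   // Midstate over the first 64-byte block; nonce-independent.
   sha256_4way_transform_le( mstate1, first_block, istate );
   // Pre-run the first 3 rounds of the second block, which never touch
   // the nonce word buf[3].
   sha256_4way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );

   // Fixed padding for re-hashing the 32-byte intermediate digest.
   block[ 8] = _mm_set1_epi32( 0x80000000 );
   memset( block + 9, 0, 6 * sizeof(__m128i) );
   block[15] = _mm_set1_epi32( 32*8 );

   for ( uint32_t n = 0; n < 64; n += 4 )
   {
      buf[3] = _mm_set_epi32( n+3, n+2, n+1, n );   // four fresh nonces
      // Finish the remaining rounds of the second block from the saved state,
      sha256_4way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
      // then hash the padded 32-byte result a second time.
      sha256_4way_transform_le( hash, block, istate );
   }
}

The point of the split is that the three prehashed rounds, plus part of the message expansion, are paid once per work unit instead of once per nonce.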

View File

@@ -4,7 +4,6 @@
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
@@ -17,11 +16,15 @@ static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t mstate[8] __attribute__ ((aligned (32)));
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t mstateb[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -32,56 +35,60 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 bytes of data
sha256_opt_transform_le( mstate, pdata, sha256_iv );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 80*8; // bit count
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 32*8; // bit count
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
mstate, mstate );
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
submit_solution( work, hashb, mythr );
}
}
n += 2;
@@ -99,18 +106,16 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i hash32[8] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i block[16] __attribute__ ((aligned (128)));
__m512i buf[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i mstate1[8] __attribute__ ((aligned (64)));
__m512i mstate2[8] __attribute__ ((aligned (64)));
__m512i istate[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
const uint32_t targ32_d7 = ptarget[7];
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
@@ -134,7 +139,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
// second message block data, with nonce & padding
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
@@ -142,12 +147,12 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
@@ -157,27 +162,26 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
// initialize padding for 2nd sha256
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
if ( unlikely( sha256_16way_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 16; lane++ )
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, phash, mythr );
pdata[19] = n + lane;
submit_solution( work, phash, mythr );
}
}
}
@@ -188,92 +192,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
*hashes_done = n - first_nonce;
return 0;
}
/*
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i vdata[32] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i initstate[8] __attribute__ ((aligned (64)));
__m512i midstate1[8] __attribute__ ((aligned (64)));
__m512i midstate2[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i sixteen = _mm512_set1_epi32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm512_set1_epi32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_16way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
*/
#endif
#if defined(SHA256D_8WAY)
@@ -284,15 +203,13 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i vdata[32] __attribute__ ((aligned (64)));
__m256i block[16] __attribute__ ((aligned (32)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i istate[8] __attribute__ ((aligned (32)));
__m256i mstate1[8] __attribute__ ((aligned (32)));
__m256i mstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
@@ -301,6 +218,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
@@ -309,50 +228,47 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
vdata[16+15] = _mm256_set1_epi32( 80*8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
block[15] = _mm256_set1_epi32( 32*8 );
// initialize state
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
// initialize state for second hash
istate[0] = _mm256_set1_epi32( sha256_iv[0] );
istate[1] = _mm256_set1_epi32( sha256_iv[1] );
istate[2] = _mm256_set1_epi32( sha256_iv[2] );
istate[3] = _mm256_set1_epi32( sha256_iv[3] );
istate[4] = _mm256_set1_epi32( sha256_iv[4] );
istate[5] = _mm256_set1_epi32( sha256_iv[5] );
istate[6] = _mm256_set1_epi32( sha256_iv[6] );
istate[7] = _mm256_set1_epi32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_8way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
istate, ptarget ) ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
for ( int lane = 0; lane < 8; lane++ )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
@@ -366,12 +282,12 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate1[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i istate[8] __attribute__ ((aligned (32)));
__m128i mstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
@@ -392,33 +308,30 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
vdata[16+15] = _mm_set1_epi32( 80*8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 32*8 ); // bit count
block[15] = _mm_set1_epi32( 32*8 );
// initialize state
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
istate[0] = _mm_set1_epi32( sha256_iv[0] );
istate[1] = _mm_set1_epi32( sha256_iv[1] );
istate[2] = _mm_set1_epi32( sha256_iv[2] );
istate[3] = _mm_set1_epi32( sha256_iv[3] );
istate[4] = _mm_set1_epi32( sha256_iv[4] );
istate[5] = _mm_set1_epi32( sha256_iv[5] );
istate[6] = _mm_set1_epi32( sha256_iv[6] );
istate[7] = _mm_set1_epi32( sha256_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate1, vdata, initstate );
sha256_4way_transform_le( mstate, vdata, istate );
do
{
// 1. final 16 bytes of data, with padding
sha256_4way_transform_le( block, vdata+16, initstate );
sha256_4way_transform_le( block, vdata+16, mstate );
sha256_4way_transform_le( hash32, block, istate );
// 2. 32 byte hash from 1.
sha256_4way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
@@ -440,3 +353,5 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
}
#endif
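For orientation, all of the variants above compute plain double SHA-256 over the 80-byte block header; a scalar reference using the generic interface from sha256-hash.h would be just (illustration, not code from this commit):

static void sha256d_ref( void *hash, const void *header )
{
   sha256_full( hash, header, 80 );   // first pass over the header
   sha256_full( hash, hash, 32 );     // second pass over the 32-byte digest
}

Everything else in this file is bookkeeping to amortize the first block's midstate and the nonce-free rounds across many nonces.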

View File

@@ -4,7 +4,6 @@
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256DT_16WAY 1
@@ -22,14 +21,104 @@ static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
};
#if defined(SHA256DT_SHA)
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t mstateb[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256dt_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 0x480; // funky bit count (1152, not the real 640)
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 0x300; // bit count (768, not the real 256)
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256dt_iv, sha256dt_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
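Modulo vectorization, the function above implements the following chain; note sha256dt's custom IV and its deliberately nonstandard length words (a scalar sketch based on the constants used in this file, assuming <string.h>; not code from this commit):

static void sha256dt_ref( uint32_t *hash, const uint32_t *header )
{
   uint32_t mid[8], block[16];

   sha256_transform_le( mid, header, sha256dt_iv );   // first 64 bytes
   memcpy( block, header + 16, 16 );                  // last 16 bytes + nonce
   block[ 4] = 0x80000000;
   memset( block + 5, 0, 40 );
   block[15] = 0x480;                                 // 1152, not the real 640
   sha256_transform_le( hash, block, mid );

   memcpy( block, hash, 32 );                         // rehash the digest
   block[ 8] = 0x80000000;
   memset( block + 9, 0, 24 );
   block[15] = 0x300;                                 // 768, not the real 256
   sha256_transform_le( hash, block, sha256dt_iv );
}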
#if defined(SHA256DT_16WAY)
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i hash32[8] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i block[16] __attribute__ ((aligned (128)));
__m512i buf[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i mstate1[8] __attribute__ ((aligned (64)));
__m512i mstate2[8] __attribute__ ((aligned (64)));
__m512i istate[8] __attribute__ ((aligned (64)));
@@ -37,8 +126,6 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
uint32_t phash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
// uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
// const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
@@ -75,7 +162,7 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd sha256
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
@@ -85,20 +172,18 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
// initialize padding for 2nd sha256
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
do
{
// finish second block with nonces
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16way_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 16; lane++ )
// if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
@@ -118,86 +203,9 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
return 0;
}
#elif defined(SHA256DT_SHA)
#endif
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t mstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 bytes of data
sha256_opt_transform_le( mstate, pdata, sha256dt_iv );
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 0x480; // funky bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
mstate, mstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 0x300; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
sha256dt_iv, sha256dt_iv );
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
{
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
}
}
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
{
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(SHA256DT_8WAY)
#if defined(SHA256DT_8WAY)
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -236,7 +244,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 0x300 );
// initialize state
// initialize state for second hash
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
@@ -253,11 +261,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
do
{
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
mexp_pre );
if ( unlikely( sha256_8way_transform_le_short(
hash32, block, istate, ptarget ) ) )
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
istate, ptarget ) ) )
{
for ( int lane = 0; lane < 8; lane++ )
{
@@ -279,7 +285,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
return 0;
}
#elif defined(SHA256DT_4WAY)
#endif
#if defined(SHA256DT_4WAY)
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha-hash-4way.h"
#include "sha256-hash.h"
#if defined(SHA256T_16WAY)

View File

@@ -4,7 +4,12 @@
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
#if defined(SHA256T_16WAY)
@@ -19,11 +24,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
__m512i istate[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32)));
static const uint32_t IV[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
@@ -39,7 +39,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, IV );
sha256_transform_le( phash, pdata, sha256_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
@@ -65,14 +65,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
istate[0] = _mm512_set1_epi32( IV[0] );
istate[1] = _mm512_set1_epi32( IV[1] );
istate[2] = _mm512_set1_epi32( IV[2] );
istate[3] = _mm512_set1_epi32( IV[3] );
istate[4] = _mm512_set1_epi32( IV[4] );
istate[5] = _mm512_set1_epi32( IV[5] );
istate[6] = _mm512_set1_epi32( IV[6] );
istate[7] = _mm512_set1_epi32( IV[7] );
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
// initialize padding for 2nd & 3rd sha256
block[ 8] = last_byte;
@@ -110,6 +110,97 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
#endif
#if defined(__SHA__)
int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t mstateb[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 80*8; // bit count
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 32*8; // bit count
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( block2a, block2b, block2a, block2b,
sha256_iv, sha256_iv );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
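The function above is the 2-way SHA-NI form of plain triple SHA-256; the scalar equivalent via the generic interface is simply (illustration only, mirroring the deleted scalar implementation below):

static void sha256t_ref( void *hash, const void *header )
{
   sha256_full( hash, header, 80 );   // first pass over the 80-byte header
   sha256_full( hash, hash, 32 );     // second pass
   sha256_full( hash, hash, 32 );     // third pass
}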
#if defined(SHA256T_8WAY)
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,

View File

@@ -5,9 +5,9 @@ bool register_sha256t_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256t_16way;
#elif defined(__SHA__)
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t;
gate->scanhash = (void*)&scanhash_sha256t_sha;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256t_8way;
#else
@@ -22,7 +22,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256q_16way;
gate->hash = (void*)&sha256q_16way_hash;
#elif defined(__SHA__)
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q;
gate->hash = (void*)&sha256q_hash;

View File

@@ -6,6 +6,8 @@
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256T_16WAY 1
#elif defined(__SHA__)
#define SHA256T_SHA 1
#elif defined(__AVX2__)
#define SHA256T_8WAY 1
#else
@@ -42,9 +44,9 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(__SHA__)
#if defined(SHA256T_SHA)
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -1,102 +0,0 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
//#include "algo/sha/sph_sha2.h"
#include "sha256-hash.h"
#if defined(__SHA__)
// Only used on CPUs with SHA
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t initstate[8] __attribute__ ((aligned (32)));
uint32_t midstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
__m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// initialize state
initstate[0] = 0x6A09E667;
initstate[1] = 0xBB67AE85;
initstate[2] = 0x3C6EF372;
initstate[3] = 0xA54FF53A;
initstate[4] = 0x510E527F;
initstate[5] = 0x9B05688C;
initstate[6] = 0x1F83D9AB;
initstate[7] = 0x5BE0CD19;
// hash first 64 bytes of data
sha256_opt_transform_le( midstate, pdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
// 3. 32 byte hash from 2.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
// byte swap final hash for testing
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
}
if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -1,22 +0,0 @@
#ifndef DEFS_X5_H__
#define DEFS_X5_H__
#include <emmintrin.h>
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
typedef unsigned char uint8;
typedef unsigned int uint32;
typedef unsigned long long uint64;
//typedef struct {
// uint32 buffer[8]; /* Buffer to be hashed */
// __m128i chainv[10]; /* Chaining values */
// uint64 bitlen[2]; /* Message length in bits */
// uint32 rembitlen; /* Length of buffer data to be hashed */
// int hashbitlen;
//} hashState_luffa;
typedef unsigned char byte;
#endif

View File

@@ -1,31 +0,0 @@
/*
* file : sha3_common.h
* version : 1.0.208
* date : 14.12.2010
*
* Common declarations
*
* Cagdas Calik
* ccalik@metu.edu.tr
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
#include "sha3-defs.h"
#ifndef SHA3_COMMON_H
#define SHA3_COMMON_H
#ifdef __GNUC__
#define MYALIGN __attribute__((aligned(16)))
#else
#define MYALIGN __declspec(align(16))
#endif
#define M128(x) *((__m128i*)x)
//typedef unsigned char BitSequence;
//typedef unsigned long long DataLength;
//typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
#endif // SHA3_COMMON_H

View File

@@ -34,7 +34,7 @@
#include <stddef.h>
#include <string.h>
#include "sha-hash-4way.h"
#include "sha512-hash.h"
/*
static const uint64_t H512[8] =

algo/sha/sha512-hash.h (new file, 46 lines)
View File

@@ -0,0 +1,46 @@
#ifndef SHA512_HASH_H__
#define SHA512_HASH_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#include "sph_sha2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
bool initialized;
} sha512_8way_context __attribute__ ((aligned (128)));
void sha512_8way_init( sha512_8way_context *sc);
void sha512_8way_update( sha512_8way_context *sc, const void *data,
size_t len );
void sha512_8way_close( sha512_8way_context *sc, void *dst );
void sha512_8way_full( void *dst, const void *data, size_t len );
#endif // AVX512
#if defined (__AVX2__)
// SHA-512 4 way
typedef struct {
__m256i buf[128>>3];
__m256i val[8];
uint64_t count;
bool initialized;
} sha512_4way_context __attribute__ ((aligned (64)));
void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way_update( sha512_4way_context *sc, const void *data,
size_t len );
void sha512_4way_close( sha512_4way_context *sc, void *dst );
void sha512_4way_full( void *dst, const void *data, size_t len );
#endif // AVX2
#endif
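A minimal usage sketch for the 4-way interface above, assuming vdata already holds four 80-byte messages interleaved 4x64 (names are illustrative, not code from this commit):

sha512_4way_context ctx;
__m256i vhash[8];                         // four interleaved 64-byte digests

sha512_4way_init( &ctx );
sha512_4way_update( &ctx, vdata, 80 );    // length is per lane, in bytes
sha512_4way_close( &ctx, vhash );
// or, in one call:
sha512_4way_full( vhash, vdata, 80 );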

View File

@@ -1,5 +1,6 @@
#include "algo-gate-api.h"
#include "sha-hash-4way.h"
#include "sha256-hash.h"
#include "sha512-hash.h"
#include <string.h>
#include <stdint.h>

View File

@@ -41,7 +41,7 @@
#define SPH_SHA2_H__
#include <stddef.h>
#include "sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for SHA-224.

File diff suppressed because it is too large

View File

@@ -58,7 +58,7 @@ extern "C"{
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m256i FIVE = _mm256_set1_epi32( 5 ); \
const __m256i THREE = _mm256_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
uint32_t Wlow, Whigh;
#define READ_STATE8(state) do \
{ \
@@ -653,7 +653,7 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m128i FIVE = _mm_set1_epi32( 5 ); \
const __m128i THREE = _mm_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
uint32_t Wlow, Whigh;
#define READ_STATE(state) do \
{ \

View File

@@ -1,51 +1,11 @@
/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
/**
* Shabal interface. Shabal is a family of functions which differ by
* their output size; this implementation defines Shabal for output
* sizes 192, 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_shabal.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SHABAL_HASH_4WAY_H__
#define SHABAL_HASH_4WAY_H__ 1
#ifdef __SSE4_1__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{
#endif
#define SPH_SIZE_shabal256 256
#define SPH_SIZE_shabal512 512
@@ -55,7 +15,7 @@ extern "C"{
typedef struct {
__m256i buf[16];
__m256i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_8way_context __attribute__ ((aligned (64)));
@@ -80,7 +40,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_4way_context;
@@ -100,10 +60,6 @@ void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#ifdef __cplusplus
}
#endif
#endif
#endif

View File

@@ -37,7 +37,7 @@
#define SPH_SHABAL_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif

View File

@@ -1,6 +1,4 @@
#include "shavite-hash-2way.h"
#include "algo/sha/sph_types.h"
#include <stdio.h>
// This is a fake, it actually does not do parallel AES, that requires VAES.

View File

@@ -64,7 +64,7 @@ extern "C"{
*/
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
#include "compat/aes_helper.c"
static const sph_u32 IV224[] = {
C32(0x6774F31C), C32(0x990AE210), C32(0xC87D4274), C32(0xC9546371),

View File

@@ -39,7 +39,7 @@
#define SPH_SHAVITE_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -9,7 +9,7 @@
#endif
#include "simd-compat.h"
#include "algo/sha/sha3-defs.h"
#include "compat/sha3-defs.h"
/*
* NIST API Specific types.
*/

View File

@@ -24,7 +24,7 @@
*/
#include <stdint.h>
#include "algo/sha/brg_types.h"
#include "compat/brg_types.h"
#define C32(x) ((u32)(x))

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for SIMD-224.

View File

@@ -2,7 +2,6 @@
#include <string.h>
#include <stdint.h>
#include "skein-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#if defined (SKEIN_8WAY)

View File

@@ -1,5 +1,4 @@
#include "skein-gate.h"
#include "sph_skein.h"
#include "skein-hash-4way.h"
bool register_skein_algo( algo_gate_t* gate )

View File

@@ -46,7 +46,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if SPH_64

View File

@@ -45,7 +45,7 @@
#define SPH_TIGER_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if SPH_64

View File

@@ -49,7 +49,7 @@
#define SPH_WHIRLPOOL_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if SPH_64

View File

@@ -65,6 +65,9 @@ void init_x11_8way_ctx()
#endif
}
static __thread __m512i x11_8way_midstate[16] __attribute__((aligned(64)));
void x11_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
@@ -80,8 +83,9 @@ void x11_8way_hash( void *state, const void *input )
uint64_t hash7[8] __attribute__ ((aligned (64)));
x11_8way_ctx_holder ctx;
memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
blake512_8way_final_le( &ctx.blake, vhash, casti_m512i( input, 9 ),
x11_8way_midstate );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
@@ -252,39 +256,45 @@ void x11_8way_hash( void *state, const void *input )
int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9;
const uint32_t last_nonce = max_nonce -8;
const __m512i eight = _mm512_set1_epi64( 8 );
const uint32_t last_nonce = max_nonce -8;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
// convert LE32 to LE64
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) );
blake512_8way_prehash_le( &x11_8way_ctx.blake, x11_8way_midstate, vdata );
x11_8way_hash( hash, vdata );
pdata[19] = n;
do
{
x11_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( ( hash+(i<<3) )[7] <= Htarg
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );
}
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash+(i<<3), ptarget ) && !opt_benchmark ))
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
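The LE32-to-LE64 conversion above relies on this project's mm128_swap64_32 macro; its assumed behavior is a 32-bit half-swap within each 64-bit lane, i.e. roughly (a sketch under that assumption, not this codebase's definition):

static inline __m128i swap64_32_sketch( __m128i x )
{
   // _MM_SHUFFLE(2,3,0,1): exchange the two 32-bit halves of each 64-bit lane
   return _mm_shuffle_epi32( x, 0xB1 );
}

This matters because blake512_8way_prehash_le consumes the interleaved data as 64-bit little-endian words, while pdata is stored as 32-bit little-endian words.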

View File

@@ -263,7 +263,7 @@ bool register_hex_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hex;
gate->hash = (void*)&x16r_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 128.0;
return true;
};

View File

@@ -20,7 +20,7 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha512-hash.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -42,7 +42,6 @@
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"

View File

@@ -12,9 +12,7 @@
#include "algo/tiger/sph_tiger.h"
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
#if defined(__SHA__)
#include "algo/sha/sha256-hash.h"
#endif
#include "algo/sha/sha256-hash.h"
#if defined (X21S_8WAY)

View File

@@ -20,7 +20,7 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"

View File

@@ -25,7 +25,7 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#if defined(X17_8WAY)
@@ -37,7 +37,6 @@ union _x17_8way_context_overlay
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
// cube_4way_context cube;
cube_4way_2buf_context cube;
#if defined(__VAES__)
groestl512_4way_context groestl;
@@ -190,7 +189,6 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );

View File

@@ -19,7 +19,7 @@
#include "algo/fugue/fugue-aesni.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#include "algo/haval/haval-hash-4way.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"

View File

@@ -16,7 +16,8 @@
#include "algo/fugue/fugue-aesni.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#include "algo/sha/sha256-hash.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/lyra2/lyra2.h"
@@ -26,9 +27,6 @@
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__SHA__)
#include "algo/sha/sha256-hash.h"
#endif
#if defined(X22I_8WAY)

View File

@@ -6,7 +6,8 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#include "algo/sha/sha256-hash.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/blake/blake2s-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -31,9 +32,6 @@
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__SHA__)
#include "algo/sha/sha256-hash.h"
#endif
void x25x_shuffle( void *hash )
{