Jay D Dee
2023-11-11 16:48:57 -05:00
parent e043698442
commit 26b9429589
44 changed files with 4125 additions and 15298 deletions

View File

@@ -79,11 +79,6 @@ cpuminer_SOURCES = \
 algo/hamsi/hamsi-hash-4way.c \
 algo/haval/haval.c \
 algo/haval/haval-hash-4way.c \
-algo/hodl/aes.c \
-algo/hodl/hodl-gate.c \
-algo/hodl/hodl-wolf.c \
-algo/hodl/sha512_avx.c \
-algo/hodl/sha512_avx2.c \
 algo/jh/sph_jh.c \
 algo/jh/jh-hash-4way.c \
 algo/jh/jha-gate.c \
@@ -148,6 +143,8 @@ cpuminer_SOURCES = \
 algo/scrypt/scrypt.c \
 algo/scrypt/scrypt-core-4way.c \
 algo/scrypt/neoscrypt.c \
+algo/sha/sha1.c \
+algo/sha/sha1-hash.c \
 algo/sha/sha256-hash.c \
 algo/sha/sph_sha2.c \
 algo/sha/sph_sha2big.c \
@@ -278,20 +275,10 @@ cpuminer_SOURCES = \
 algo/yespower/yespower-ref.c \
 algo/yespower/yespower-blake2b-ref.c
 
 disable_flags =
 
 if USE_ASM
 cpuminer_SOURCES += asm/neoscrypt_asm.S
-if ARCH_x86
-cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S
-endif
-if ARCH_x86_64
-cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S
-endif
-if ARCH_ARM
-cpuminer_SOURCES += asm/sha2-arm.S asm/scrypt-arm.S
-endif
 else
 disable_flags += -DNOASM
 endif
@@ -301,7 +288,7 @@ if HAVE_WINDOWS
 endif
 cpuminer_LDFLAGS = @LDFLAGS@
-cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lssl -lcrypto -lgmp
+cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
 cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
 cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)

View File

@@ -73,6 +73,13 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v23.8
+
+Cpuminer-opt is no longer dependent on OpenSSL.
+Removed Hodl algo.
+Removed legacy Sha256 & Scrypt ASM code.
+ARM: Echo AES is working and enabled for x17.
+
 v23.7
 
 Fixed blake2s, broken in v3.23.4.

View File

@@ -310,7 +310,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
       case ALGO_GROESTL:      rc = register_groestl_algo      ( gate ); break;
       case ALGO_HEX:          rc = register_hex_algo          ( gate ); break;
       case ALGO_HMQ1725:      rc = register_hmq1725_algo      ( gate ); break;
-      case ALGO_HODL:         rc = register_hodl_algo         ( gate ); break;
       case ALGO_JHA:          rc = register_jha_algo          ( gate ); break;
       case ALGO_KECCAK:       rc = register_keccak_algo       ( gate ); break;
       case ALGO_KECCAKC:      rc = register_keccakc_algo      ( gate ); break;

View File

@@ -21,112 +21,92 @@
#include "hash_api.h" #include "hash_api.h"
#include "simd-utils.h" #include "simd-utils.h"
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F}; const uint32_t const1[] __attribute__ ((aligned (32))) =
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC}; { 0x00000001, 0x00000000, 0x00000000, 0x00000000 };
MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1}; const uint32_t mul2mask[] __attribute__ ((aligned (16))) =
MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C}; { 0x00001b00, 0x00000000, 0x00000000, 0x00000000 };
MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1}; const uint32_t lsbmask[] __attribute__ ((aligned (16))) =
MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8}; { 0x01010101, 0x01010101, 0x01010101, 0x01010101 };
MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09}; const uint32_t invshiftrows[] __attribute__ ((aligned (16))) =
MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79}; { 0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c };
MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
#define ECHO_SUBBYTES4( state, j ) \
state[0][j] = v128_aesenc( state[0][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[1][j] = v128_aesenc( state[1][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[2][j] = v128_aesenc( state[2][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[3][j] = v128_aesenc( state[3][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[0][j] = v128_aesenc_nokey( state[0][j] ); \
state[1][j] = v128_aesenc_nokey( state[1][j] ); \
state[2][j] = v128_aesenc_nokey( state[2][j] ); \
state[3][j] = v128_aesenc_nokey( state[3][j] )
MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000}; #define ECHO_SUBBYTES( state, i, j ) \
MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000}; state[i][j] = v128_aesenc( state[i][j], k1 ); \
MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101}; k1 = v128_add32( k1, cast_v128(const1) ); \
MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c}; state[i][j] = v128_aesenc_nokey( state[i][j] )
MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) \
#define ECHO_SUBBYTES4(state, j) \ s2 = v128_add8( state1[0][j], state1[0][j] ); \
state[0][j] = v128_aesenc(state[0][j], k1);\ t1 = v128_sr16( state1[0][j], 7 ); \
k1 = v128_add32(k1, cast_v128(const1));\ t1 = v128_and( t1, cast_v128(lsbmask) ); \
state[1][j] = v128_aesenc(state[1][j], k1);\ t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
k1 = v128_add32(k1, cast_v128(const1));\ s2 = v128_xor( s2, t2 ); \
state[2][j] = v128_aesenc(state[2][j], k1);\ state2[0][j] = s2; \
k1 = v128_add32(k1, cast_v128(const1));\ state2[1][j] = state1[0][j]; \
state[3][j] = v128_aesenc(state[3][j], k1);\ state2[2][j] = state1[0][j]; \
k1 = v128_add32(k1, cast_v128(const1));\ state2[3][j] = v128_xor(s2, state1[0][j] ); \
state[0][j] = v128_aesenc(state[0][j], v128_zero ); \ s2 = v128_add8( state1[1][(j + 1) & 3], state1[1][(j + 1) & 3] ); \
state[1][j] = v128_aesenc(state[1][j], v128_zero ); \ t1 = v128_sr16( state1[1][(j + 1) & 3], 7 ); \
state[2][j] = v128_aesenc(state[2][j], v128_zero ); \ t1 = v128_and( t1, cast_v128(lsbmask) ); \
state[3][j] = v128_aesenc(state[3][j], v128_zero ) t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
s2 = v128_xor( s2, t2 ); \
#define ECHO_SUBBYTES(state, i, j) \ state2[0][j] = v128_xor3( state2[0][j], s2, state1[1][(j + 1) & 3] );\
state[i][j] = v128_aesenc(state[i][j], k1);\ state2[1][j] = v128_xor( state2[1][j], s2 ); \
k1 = v128_add32(k1, cast_v128(const1));\ state2[2][j] = v128_xor( state2[2][j], state1[1][(j + 1) & 3] ); \
state[i][j] = v128_aesenc(state[i][j], cast_v128(zero)) state2[3][j] = v128_xor( state2[3][j], state1[1][(j + 1) & 3] ); \
s2 = v128_add8( state1[2][(j + 2) & 3], state1[2][(j + 2) & 3] ); \
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \ t1 = v128_sr16( state1[2][(j + 2) & 3], 7 ); \
s2 = v128_add8(state1[0][j], state1[0][j]);\ t1 = v128_and( t1, cast_v128(lsbmask) ); \
t1 = v128_sr16(state1[0][j], 7);\ t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
t1 = v128_and(t1, cast_v128(lsbmask));\ s2 = v128_xor( s2, t2 ); \
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\ state2[0][j] = v128_xor( state2[0][j], state1[2][(j + 2) & 3] ); \
s2 = v128_xor(s2, t2);\ state2[1][j] = v128_xor3( state2[1][j], s2, state1[2][(j + 2) & 3] ); \
state2[0][j] = s2;\ state2[2][j] = v128_xor( state2[2][j], s2 ); \
state2[1][j] = state1[0][j];\ state2[3][j] = v128_xor( state2[3][j], state1[2][(j + 2) & 3] ); \
state2[2][j] = state1[0][j];\ s2 = v128_add8( state1[3][(j + 3) & 3], state1[3][(j + 3) & 3] ); \
state2[3][j] = v128_xor(s2, state1[0][j]);\ t1 = v128_sr16( state1[3][(j + 3) & 3], 7 ); \
s2 = v128_add8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\ t1 = v128_and( t1, cast_v128(lsbmask) ); \
t1 = v128_sr16(state1[1][(j + 1) & 3], 7);\ t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
t1 = v128_and(t1, cast_v128(lsbmask));\ s2 = v128_xor( s2, t2 ); \
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\ state2[0][j] = v128_xor( state2[0][j], state1[3][(j + 3) & 3] ); \
s2 = v128_xor(s2, t2);\ state2[1][j] = v128_xor( state2[1][j], state1[3][(j + 3) & 3] ); \
state2[0][j] = v128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\ state2[2][j] = v128_xor3( state2[2][j], s2, state1[3][(j + 3) & 3] ); \
state2[1][j] = v128_xor(state2[1][j], s2);\ state2[3][j] = v128_xor( state2[3][j], s2 )
state2[2][j] = v128_xor(state2[2][j], state1[1][(j + 1) & 3]);\
state2[3][j] = v128_xor(state2[3][j], state1[1][(j + 1) & 3]);\
s2 = v128_add8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
t1 = v128_sr16(state1[2][(j + 2) & 3], 7);\
t1 = v128_and(t1, cast_v128(lsbmask));\
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
s2 = v128_xor(s2, t2);\
state2[0][j] = v128_xor(state2[0][j], state1[2][(j + 2) & 3]);\
state2[1][j] = v128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
state2[2][j] = v128_xor(state2[2][j], s2);\
state2[3][j] = v128_xor(state2[3][j], state1[2][(j + 2) & 3]);\
s2 = v128_add8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
t1 = v128_sr16(state1[3][(j + 3) & 3], 7);\
t1 = v128_and(t1, cast_v128(lsbmask));\
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
s2 = v128_xor(s2, t2);\
state2[0][j] = v128_xor(state2[0][j], state1[3][(j + 3) & 3]);\
state2[1][j] = v128_xor(state2[1][j], state1[3][(j + 3) & 3]);\
state2[2][j] = v128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
state2[3][j] = v128_xor(state2[3][j], s2)
#define ECHO_ROUND_UNROLL2 \ #define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES4(_state, 0);\ { \
ECHO_SUBBYTES4(_state, 1);\ ECHO_SUBBYTES4( _state, 0 ); \
ECHO_SUBBYTES4(_state, 2);\ ECHO_SUBBYTES4( _state, 1 ); \
ECHO_SUBBYTES4(_state, 3);\ ECHO_SUBBYTES4( _state, 2 ); \
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ ECHO_SUBBYTES4( _state, 3 ); \
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ ECHO_MIXBYTES( _state, _state2, 0, t1, t2, s2 ); \
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ ECHO_MIXBYTES( _state, _state2, 1, t1, t2, s2 ); \
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ ECHO_MIXBYTES( _state, _state2, 2, t1, t2, s2 ); \
ECHO_SUBBYTES4(_state2, 0);\ ECHO_MIXBYTES( _state, _state2, 3, t1, t2, s2 ); \
ECHO_SUBBYTES4(_state2, 1);\ ECHO_SUBBYTES4( _state2, 0 ); \
ECHO_SUBBYTES4(_state2, 2);\ ECHO_SUBBYTES4( _state2, 1 ); \
ECHO_SUBBYTES4(_state2, 3);\ ECHO_SUBBYTES4( _state2, 2 ); \
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ ECHO_SUBBYTES4( _state2, 3 ); \
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ ECHO_MIXBYTES( _state2, _state, 0, t1, t2, s2 ); \
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES( _state2, _state, 1, t1, t2, s2 ); \
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) ECHO_MIXBYTES( _state2, _state, 2, t1, t2, s2 ); \
ECHO_MIXBYTES( _state2, _state, 3, t1, t2, s2 ); \
}
/* /*
#define ECHO_ROUND_UNROLL2 \ #define ECHO_ROUND_UNROLL2 \
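Note: the rewritten macros end each sub-round with v128_aesenc_nokey rather than an AES round keyed with an explicit zero vector, which is what lets the same macro body map onto both x86 AES-NI and the ARMv8 Crypto Extensions (per the v23.8 note that Echo AES now works on ARM). A minimal sketch of a keyless AES round on both ISAs — the helper name is assumed here, not necessarily the project's actual definition:

#if defined(__ARM_NEON)
#include <arm_neon.h>
// AESE xors the round key *before* SubBytes/ShiftRows, so a zero key
// gives a plain unkeyed round; AESMC then applies MixColumns.
static inline uint8x16_t aesenc_nokey( uint8x16_t x )
{
   return vaesmcq_u8( vaeseq_u8( x, vdupq_n_u8( 0 ) ) );
}
#else
#include <immintrin.h>
// x86 AESENC xors the round key *after* MixColumns, so a zero key
// leaves the unkeyed round result unchanged.
static inline __m128i aesenc_nokey( __m128i x )
{
   return _mm_aesenc_si128( x, _mm_setzero_si128() );
}
#endif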

View File

@@ -61,9 +61,12 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
 #if defined(__ARM_NEON)
 
 // No fast shuffle on NEON
-static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
+//static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
+static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
 
-#define gr_shuffle32( v )   v128_shufflev32( v, vmask_d8 )
+#define gr_shuffle32( v )   v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
+//#define gr_shuffle32( v )   v128_shufflev32( v, vmask_d8 )
 
 #else
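The blend-based gr_shuffle32 works because v128_blendv only needs a bitwise select, which NEON provides as a single BSL instruction, whereas an arbitrary 32-bit lane shuffle does not map to one NEON instruction. A plausible sketch of the blend primitive itself (helper name assumed):

#include <arm_neon.h>
// Pick bits of b where mask bits are set, bits of a elsewhere;
// NEON's bit-select is exactly this blendv semantic.
static inline uint32x4_t blendv_u32( uint32x4_t a, uint32x4_t b,
                                     uint32x4_t mask )
{
   return vbslq_u32( mask, b, a );
}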

View File

@@ -35,7 +35,7 @@
 #include <stdio.h>
 #include "hamsi-hash-4way.h"
 
-static const uint32_t HAMSI_IV512[] =
+static const uint32_t HAMSI_IV512[] __attribute__ ((aligned (32))) =
 {
    0x73746565, 0x6c706172, 0x6b204172, 0x656e6265,
    0x72672031, 0x302c2062, 0x75732032, 0x3434362c,
@@ -43,7 +43,8 @@ static const uint32_t HAMSI_IV512[] =
    0x65766572, 0x6c65652c, 0x2042656c, 0x6769756d
 };
 
-static const uint32_t alpha_n[] = {
+static const uint32_t alpha_n[] __attribute__ ((aligned (32))) =
+{
    0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa,
    0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00,
    0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0,
@@ -54,7 +55,8 @@ static const uint32_t alpha_n[] = {
    0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0
 };
 
-static const uint32_t alpha_f[] = {
+static const uint32_t alpha_f[] __attribute__ ((aligned (32))) =
+{
    0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0,
    0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9,
    0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c,
@@ -69,7 +71,8 @@ static const uint32_t alpha_f[] = {
 /* Note: this table lists bits within each byte from least
    significant to most significant. */
-static const uint32_t T512[64][16] = {
+static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
+{
    { 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000,
      0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a,
      0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000,
@@ -2260,4 +2263,4 @@ void hamsi512_2x64( void *dst, const void *data, size_t len )
    hamsi512_2x64_close( &sc, dst );
 }
 
-#endif // SSE4.1 or NEON
+#endif // SSE4.2 or NEON
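The added aligned(32) attributes guarantee these tables can be read with aligned vector loads, which fault on SSE and cost extra on many cores when the address is unaligned. An illustrative load of one 64-byte T512 row under that guarantee — hypothetical helper, not the project's code; requires AVX:

#include <stdint.h>
#include <immintrin.h>
// Each T512 row is 16 uint32_t = 64 bytes; with the table 32-byte
// aligned, both halves can use the aligned 256-bit load form.
static inline void load_t512_row( const uint32_t row[16],
                                  __m256i *lo, __m256i *hi )
{
   *lo = _mm256_load_si256( (const __m256i*)row );         // words 0..7
   *hi = _mm256_load_si256( (const __m256i*)(row + 8) );   // words 8..15
}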

View File

@@ -1,183 +0,0 @@
#include <stdint.h>
#include "miner.h"
#if defined(__AES__)
#include <x86intrin.h>
#include "wolf-aes.h"
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
__m128i tmp4;
*tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
tmp4 = _mm_slli_si128(*tmp1, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
*tmp1 = _mm_xor_si128(*tmp1, *tmp2);
}
static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
{
__m128i tmp2, tmp4;
tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
tmp4 = _mm_slli_si128(*tmp3, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
*tmp3 = _mm_xor_si128(*tmp3, tmp2);
}
// Special thanks to Intel for helping me
// with ExpandAESKey256() and its subroutines
void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
{
__m128i tmp1, tmp2, tmp3;
tmp1 = keys[0] = KeyBuf[0];
tmp3 = keys[1] = KeyBuf[1];
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[2] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[3] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[4] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[5] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[6] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[7] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[8] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[9] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[10] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[11] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[12] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[13] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[14] = tmp1;
}
#if defined(__SSE4_2__)
//#ifdef __AVX__
#define AESENC(i,j) \
State[j] = _mm_aesenc_si128(State[j], ExpandedKey[j][i]);
#define AESENC_N(i) \
AESENC(i,0) \
AESENC(i,1) \
AESENC(i,2) \
AESENC(i,3) \
AESENC(i,4) \
AESENC(i,5) \
AESENC(i,6) \
AESENC(i,7) \
static inline void AES256Core(__m128i* State, __m128i ExpandedKey[][16])
{
const uint32_t N = AES_PARALLEL_N;
for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128(State[j], ExpandedKey[j][0]);
}
AESENC_N(1)
AESENC_N(2)
AESENC_N(3)
AESENC_N(4)
AESENC_N(5)
AESENC_N(6)
AESENC_N(7)
AESENC_N(8)
AESENC_N(9)
AESENC_N(10)
AESENC_N(11)
AESENC_N(12)
AESENC_N(13)
for(int j=0; j<N; ++j) {
State[j] = _mm_aesenclast_si128(State[j], ExpandedKey[j][14]);
}
}
void AES256CBC(__m128i** data, const __m128i** next, __m128i ExpandedKey[][16], __m128i* IV)
{
const uint32_t N = AES_PARALLEL_N;
__m128i State[N];
for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128( _mm_xor_si128(data[j][0], next[j][0]), IV[j]);
}
AES256Core(State, ExpandedKey);
for(int j=0; j<N; ++j) {
data[j][0] = State[j];
}
for(int i = 1; i < BLOCK_COUNT; ++i) {
for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128( _mm_xor_si128(data[j][i], next[j][i]), data[j][i - 1]);
}
AES256Core(State, ExpandedKey);
for(int j=0; j<N; ++j) {
data[j][i] = State[j];
}
}
}
#else // NO AVX
static inline __m128i AES256Core(__m128i State, const __m128i *ExpandedKey)
{
State = _mm_xor_si128(State, ExpandedKey[0]);
for(int i = 1; i < 14; ++i) State = _mm_aesenc_si128(State, ExpandedKey[i]);
return(_mm_aesenclast_si128(State, ExpandedKey[14]));
}
void AES256CBC(__m128i *Ciphertext, const __m128i *Plaintext, const __m128i *ExpandedKey, __m128i IV, uint32_t BlockCount)
{
__m128i State = _mm_xor_si128(Plaintext[0], IV);
State = AES256Core(State, ExpandedKey);
Ciphertext[0] = State;
for(int i = 1; i < BlockCount; ++i)
{
State = _mm_xor_si128(Plaintext[i], Ciphertext[i - 1]);
State = AES256Core(State, ExpandedKey);
Ciphertext[i] = State;
}
}
#endif
#endif

View File

@@ -1,75 +0,0 @@
#ifndef HODL_BYTESWAP_H
#define HODL_BYTESWAP_H 1
#define __bswap_constant_16(x) \
((unsigned short int) ((((x) >> 8) & 0xff) | (((x) & 0xff) << 8)))
static __inline unsigned short int
__bswap_16 (unsigned short int __bsx)
{
return __bswap_constant_16 (__bsx);
}
// LE
# define htobe16(x) __bswap_16 (x)
# define htole16(x) (x)
# define be16toh(x) __bswap_16 (x)
# define le16toh(x) (x)
// BE
//# define htole16(x) __bswap_16 (x)
//# define htobe16(x) (x)
//# define le16toh(x) __bswap_16 (x)
//# define be16toh(x) (x)
#define __bswap_constant_32(x) \
((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \
(((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24))
static __inline unsigned int
__bswap_32 (unsigned int __bsx)
{
return __builtin_bswap32 (__bsx);
}
// LE
# define htobe32(x) __bswap_32 (x)
# define htole32(x) (x)
# define be32toh(x) __bswap_32 (x)
# define le32toh(x) (x)
// BE
//# define htole32(x) __bswap_32 (x)
//# define htobe32(x) (x)
//# define le32toh(x) __bswap_32 (x)
//# define be32toh(x) (x)
# define __bswap_constant_64(x) \
((((x) & 0xff00000000000000ull) >> 56) \
| (((x) & 0x00ff000000000000ull) >> 40) \
| (((x) & 0x0000ff0000000000ull) >> 24) \
| (((x) & 0x000000ff00000000ull) >> 8) \
| (((x) & 0x00000000ff000000ull) << 8) \
| (((x) & 0x0000000000ff0000ull) << 24) \
| (((x) & 0x000000000000ff00ull) << 40) \
| (((x) & 0x00000000000000ffull) << 56))
static __inline uint64_t
__bswap_64 (uint64_t __bsx)
{
return __bswap_constant_64 (__bsx);
}
// LE
# define htobe64(x) __bswap_64 (x)
# define htole64(x) (x)
# define be64toh(x) __bswap_64 (x)
# define le64toh(x) (x)
// BE
//# define htole64(x) __bswap_64 (x)
//# define htobe64(x) (x)
//# define le64toh(x) __bswap_64 (x)
//# define be64toh(x) (x)
#endif

View File

@@ -1,185 +0,0 @@
#include <memory.h>
//#include <mm_malloc.h>
#include <stdlib.h>
#include "hodl-gate.h"
#include "hodl-wolf.h"
#define HODL_NSTARTLOC_INDEX 20
#define HODL_NFINALCALC_INDEX 21
static struct work hodl_work;
pthread_barrier_t hodl_barrier;
// All references to this buffer are local to this file, so no args
// need to be passed.
unsigned char *hodl_scratchbuf = NULL;
void hodl_le_build_stratum_request( char* req, struct work* work,
struct stratum_ctx *sctx )
{
uint32_t ntime, nonce, nstartloc, nfinalcalc;
char ntimestr[9], noncestr[9], nstartlocstr[9], nfinalcalcstr[9];
unsigned char *xnonce2str;
le32enc( &ntime, work->data[ algo_gate.ntime_index ] );
le32enc( &nonce, work->data[ algo_gate.nonce_index ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len );
le32enc( &nstartloc, work->data[ HODL_NSTARTLOC_INDEX ] );
le32enc( &nfinalcalc, work->data[ HODL_NFINALCALC_INDEX ] );
bin2hex( nstartlocstr, (char*)(&nstartloc), sizeof(uint32_t) );
bin2hex( nfinalcalcstr, (char*)(&nfinalcalc), sizeof(uint32_t) );
sprintf( req, "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr,
nstartlocstr, nfinalcalcstr );
free( xnonce2str );
}
char* hodl_malloc_txs_request( struct work *work )
{
char* req;
json_t *val;
char data_str[2 * sizeof(work->data) + 1];
int i;
for ( i = 0; i < ARRAY_SIZE(work->data); i++ )
be32enc( work->data + i, work->data[i] );
bin2hex( data_str, (unsigned char *)work->data, 88 );
if ( work->workid )
{
char *params;
val = json_object();
json_object_set_new( val, "workid", json_string( work->workid ) );
params = json_dumps( val, 0 );
json_decref( val );
req = malloc( 128 + 2*88 + strlen( work->txs ) + strlen( params ) );
sprintf( req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":1}\r\n",
data_str, work->txs, params);
free( params );
}
else
{
req = malloc( 128 + 2*88 + strlen(work->txs));
sprintf( req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":1}\r\n",
data_str, work->txs);
}
return req;
}
void hodl_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_tree,
uint32_t ntime, uint32_t nbits )
{
int i;
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = version;
if ( have_stratum )
for ( i = 0; i < 8; i++ )
g_work->data[ 1+i ] = le32dec( prevhash + i );
else
for (i = 0; i < 8; i++)
g_work->data[ 8-i ] = le32dec( prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[ 9+i ] = be32dec( merkle_tree + i );
g_work->data[ algo_gate.ntime_index ] = ntime;
g_work->data[ algo_gate.nbits_index ] = nbits;
g_work->data[22] = 0x80000000;
g_work->data[31] = 0x00000280;
}
// called only by thread 0, saves a backup of g_work
void hodl_get_new_work( struct work* work, struct work* g_work)
{
// pthread_rwlock_rdlock( &g_work_lock );
work_free( &hodl_work );
work_copy( &hodl_work, g_work );
hodl_work.data[ algo_gate.nonce_index ] = ( clock() + rand() ) % 9999;
// pthread_rwlock_unlock( &g_work_lock );
}
json_t *hodl_longpoll_rpc_call( CURL *curl, int *err, char* lp_url )
{
json_t *val;
char *req = NULL;
if ( have_gbt )
{
req = malloc( strlen( gbt_lp_req ) + strlen( lp_id ) + 1 );
sprintf( req, gbt_lp_req, lp_id );
}
val = json_rpc_call( curl, lp_url, rpc_userpass,
req ? req : getwork_req, err, JSON_RPC_LONGPOLL );
free( req );
return val;
}
// called by every thread, copies the backup to each thread's work.
void hodl_resync_threads( int thr_id, struct work* work )
{
int nonce_index = algo_gate.nonce_index;
pthread_barrier_wait( &hodl_barrier );
if ( memcmp( work->data, hodl_work.data, algo_gate.work_cmp_size ) )
{
work_free( work );
work_copy( work, &hodl_work );
}
work->data[ nonce_index ] = swab32( hodl_work.data[ nonce_index ] );
work_restart[thr_id].restart = 0;
}
bool hodl_do_this_thread( int thr_id )
{
return ( thr_id == 0 );
}
int hodl_scanhash( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
#if defined(__AES__)
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, mythr->id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( work, max_nonce, hashes_done, mythr );
#endif
return false;
}
bool register_hodl_algo( algo_gate_t* gate )
{
#if !defined(__AES__)
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif
if ( GARBAGE_SIZE % opt_n_threads )
applog( LOG_WARNING,"WARNING: Thread count must be power of 2. Miner may crash or produce invalid hash!" );
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = SSE42_OPT | AES_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;
gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
gate->malloc_txs_request = (void*)&hodl_malloc_txs_request;
gate->build_block_header = (void*)&hodl_build_block_header;
gate->resync_threads = (void*)&hodl_resync_threads;
gate->do_this_thread = (void*)&hodl_do_this_thread;
gate->work_cmp_size = 76;
hodl_scratchbuf = (unsigned char*)mm_malloc( 1 << 30, 64 );
allow_getwork = false;
opt_target_factor = 8388608.0;
return ( hodl_scratchbuf != NULL );
}

View File

@@ -1,6 +0,0 @@
#include "algo-gate-api.h"
extern unsigned char *hodl_scratchbuf;
bool register_hodl_algo ( algo_gate_t* gate );

View File

@@ -1,225 +0,0 @@
#include <string.h>
#include <openssl/evp.h>
#include <openssl/sha.h>
#include "simd-utils.h"
#include "sha512-avx.h"
#include "wolf-aes.h"
#include "hodl-gate.h"
#include "hodl-wolf.h"
#include "miner.h"
#include "algo/sha/sha256d.h"
#if defined(__AES__)
void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
void *MidHash )
{
const int Chunk = TOTAL_CHUNKS / ThreadCount;
const uint32_t StartChunk = ThreadID * Chunk;
const uint32_t EndChunk = StartChunk + Chunk;
#if defined(__SSE4_2__)
//#ifdef __AVX__
uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
uint64_t* destination[ SHA512_PARALLEL_N ];
for ( int i=0; i < SHA512_PARALLEL_N; ++i )
{
TempBufs[i] = (uint64_t*)malloc( 32 );
memcpy( TempBufs[i], MidHash, 32 );
}
for ( uint32_t i = StartChunk; i < EndChunk; i += SHA512_PARALLEL_N )
{
for ( int j = 0; j < SHA512_PARALLEL_N; ++j )
{
( (uint32_t*)TempBufs[j] )[0] = i + j;
desination[j] = (uint64_t*)( (uint8_t *)Garbage + ( (i+j)
* GARBAGE_CHUNK_SIZE ) );
}
sha512Compute32b_parallel( TempBufs, desination );
}
for ( int i = 0; i < SHA512_PARALLEL_N; ++i )
free( TempBufs[i] );
#else
uint32_t TempBuf[8];
memcpy( TempBuf, MidHash, 32 );
for ( uint32_t i = StartChunk; i < EndChunk; ++i )
{
TempBuf[0] = i;
SHA512( ( uint8_t *)TempBuf, 32,
( (uint8_t *)Garbage ) + ( i * GARBAGE_CHUNK_SIZE ) );
}
#endif
}
/*
void Rev256(uint32_t *Dest, const uint32_t *Src)
{
for(int i = 0; i < 8; ++i) Dest[i] = swab32(Src[i]);
}
*/
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
#if defined(__SSE4_2__)
//#ifdef __AVX__
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int threadNumber = mythr->id;
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
CacheEntry Cache[AES_PARALLEL_N] __attribute__ ((aligned (64)));
__m128i* data[AES_PARALLEL_N];
const __m128i* next[AES_PARALLEL_N];
uint32_t CollisionCount = 0;
for ( int n=0; n<AES_PARALLEL_N; ++n )
{
data[n] = Cache[n].dqwords;
}
// Search for pattern in pseudorandom data
int searchNumber = COMPARE_SIZE / opt_n_threads;
int startLoc = threadNumber * searchNumber;
for ( int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k += AES_PARALLEL_N )
{
// copy data to first l2 cache
for ( int n=0; n<AES_PARALLEL_N; ++n )
{
memcpy(Cache[n].dwords, Garbage + k + n, GARBAGE_SLICE_SIZE);
}
for(int j = 0; j < AES_ITERATIONS; ++j)
{
__m128i ExpKey[AES_PARALLEL_N][16];
__m128i ivs[AES_PARALLEL_N];
// use last 4 bytes of first cache as next location
for(int n=0; n<AES_PARALLEL_N; ++n) {
uint32_t nextLocation = Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 1] & (COMPARE_SIZE - 1); //% COMPARE_SIZE;
next[n] = Garbage[nextLocation].dqwords;
__m128i last[2];
last[0] = _mm_xor_si128(Cache[n].dqwords[254], next[n][254]);
last[1] = _mm_xor_si128(Cache[n].dqwords[255], next[n][255]);
// Key is last 32b of Cache
// IV is last 16b of Cache
ExpandAESKey256(ExpKey[n], last);
ivs[n] = last[1];
}
AES256CBC(data, next, ExpKey, ivs);
}
for(int n=0; n<AES_PARALLEL_N; ++n)
if((Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 1] & (COMPARE_SIZE - 1)) < 1000)
{
uint32_t BlockHdr[22], FinalPoW[8];
swab32_array( BlockHdr, pdata, 20 );
BlockHdr[20] = k + n;
BlockHdr[21] = Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 2];
sha256d( (uint8_t *)FinalPoW, (uint8_t *)BlockHdr, 88 );
CollisionCount++;
if( FinalPoW[7] <= ptarget[7] )
{
pdata[20] = swab32( BlockHdr[20] );
pdata[21] = swab32( BlockHdr[21] );
*hashes_done = CollisionCount;
submit_solution( work, FinalPoW, mythr );
return(0);
}
}
}
*hashes_done = CollisionCount;
return(0);
#else // no AVX
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t BlockHdr[22], FinalPoW[8];
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
CacheEntry Cache;
uint32_t CollisionCount = 0;
int threadNumber = mythr->id;
swab32_array( BlockHdr, pdata, 20 );
// Search for pattern in pseudorandom data
int searchNumber = COMPARE_SIZE / opt_n_threads;
int startLoc = threadNumber * searchNumber;
if ( opt_debug )
applog( LOG_DEBUG,"Hash target= %08lx", ptarget[7] );
for(int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k++)
{
// copy data to first l2 cache
memcpy(Cache.dwords, Garbage + k, GARBAGE_SLICE_SIZE);
for(int j = 0; j < AES_ITERATIONS; j++)
{
CacheEntry TmpXOR;
__m128i ExpKey[16];
// use last 4 bytes of first cache as next location
uint32_t nextLocation = Cache.dwords[(GARBAGE_SLICE_SIZE >> 2)
- 1] & (COMPARE_SIZE - 1); //% COMPARE_SIZE;
// Copy data from indicated location to second l2 cache -
memcpy(&TmpXOR, Garbage + nextLocation, GARBAGE_SLICE_SIZE);
//XOR location data into second cache
for( int i = 0; i < (GARBAGE_SLICE_SIZE >> 4); ++i )
TmpXOR.dqwords[i] = _mm_xor_si128( Cache.dqwords[i],
TmpXOR.dqwords[i] );
// Key is last 32b of TmpXOR
// IV is last 16b of TmpXOR
ExpandAESKey256( ExpKey, TmpXOR.dqwords +
(GARBAGE_SLICE_SIZE / sizeof(__m128i)) - 2 );
AES256CBC( Cache.dqwords, TmpXOR.dqwords, ExpKey,
TmpXOR.dqwords[ (GARBAGE_SLICE_SIZE / sizeof(__m128i))
- 1 ], 256 ); }
// use last X bits as solution
if( ( Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 1 ]
& (COMPARE_SIZE - 1) ) < 1000 )
{
BlockHdr[20] = k;
BlockHdr[21] = Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 2 ];
sha256d( (uint8_t *)FinalPoW, (uint8_t *)BlockHdr, 88 );
CollisionCount++;
if( FinalPoW[7] <= ptarget[7] )
{
pdata[20] = swab32( BlockHdr[20] );
pdata[21] = swab32( BlockHdr[21] );
*hashes_done = CollisionCount;
submit_solution( work, FinalPoW, mythr );
return(0);
}
}
}
*hashes_done = CollisionCount;
return(0);
#endif // AVX else
}
void GenRandomGarbage(CacheEntry *Garbage, uint32_t *pdata, int thr_id)
{
uint32_t BlockHdr[20], MidHash[8];
swab32_array( BlockHdr, pdata, 20 );
sha256d((uint8_t *)MidHash, (uint8_t *)BlockHdr, 80);
GenerateGarbageCore(Garbage, thr_id, opt_n_threads, MidHash);
}
#endif // AES

View File

@@ -1,27 +0,0 @@
#ifndef __HODL_H
#define __HODL_H
#include <stdint.h>
#include "simd-utils.h"
#include "miner.h"
#define AES_ITERATIONS 15
#define GARBAGE_SIZE (1 << 30)
#define GARBAGE_CHUNK_SIZE (1 << 6)
#define GARBAGE_SLICE_SIZE (1 << 12)
#define TOTAL_CHUNKS (1 << 24) // GARBAGE_SIZE / GARBAGE_CHUNK_SIZE
#define COMPARE_SIZE (1 << 18) // GARBAGE_SIZE / GARBAGE_SLICE_SIZE
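The comments above are simple power-of-two arithmetic: 2^30 / 2^6 = 2^24 chunks and 2^30 / 2^12 = 2^18 slices. Those relationships could be checked at compile time with C11 static assertions — illustration only, not part of the original header:

#include <assert.h>
static_assert( TOTAL_CHUNKS == GARBAGE_SIZE / GARBAGE_CHUNK_SIZE,
               "TOTAL_CHUNKS must equal GARBAGE_SIZE / GARBAGE_CHUNK_SIZE" );
static_assert( COMPARE_SIZE == GARBAGE_SIZE / GARBAGE_SLICE_SIZE,
               "COMPARE_SIZE must equal GARBAGE_SIZE / GARBAGE_SLICE_SIZE" );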
typedef union _CacheEntry
{
uint32_t dwords[GARBAGE_SLICE_SIZE >> 2] __attribute__((aligned(16)));
v128_t dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16)));
} CacheEntry;
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void GenRandomGarbage( CacheEntry *Garbage, uint32_t *pdata, int thr_id);
#endif // __HODL_H

View File

@@ -1,208 +0,0 @@
.TH MINERD 1 "March 2016" "cpuminer 2.4.3"
.SH NAME
hodlminer \- CPU miner for Hodlcoin
.SH SYNOPSIS
.B hodlminer
[\fIOPTION\fR]...
.SH DESCRIPTION
.B hodlminer
is a multi-threaded CPU miner for Hodlcoin.
It supports the getwork and getblocktemplate (BIP 22) methods,
as well as the Stratum mining protocol.
.PP
In its normal mode of operation, \fBhodlminer\fR connects to a mining server
(specified with the \fB\-o\fR option), receives work from it and starts hashing.
As soon as a solution is found, it is submitted to the same mining server,
which can accept or reject it.
When using getwork or getblocktemplate,
\fBhodlminer\fR can take advantage of long polling, if the server supports it;
in any case, fresh work is fetched as needed.
When using the Stratum protocol this is not possible,
and the server is responsible for sending fresh work at least every minute;
if it fails to do so,
\fBhodlminer\fR may drop the connection and try reconnecting again.
.PP
By default, \fBhodlminer\fR writes all its messages to standard error.
On systems that have a syslog, the \fB\-\-syslog\fR option can be used
to write to it instead.
.PP
On start, the nice value of all miner threads is set to 19.
On Linux, the scheduling policy is also changed to SCHED_IDLE,
or to SCHED_BATCH if that fails.
On multiprocessor systems, \fBhodlminer\fR
automatically sets the CPU affinity of miner threads
if the number of threads is a multiple of the number of processors.
.SH EXAMPLES
To connect to the Hodlcoin mining pool that provides a Stratum server
at hodl.blockquarry.com on port 8332, authenticating as worker "user.worker" with password "x":
.PP
.nf
.RS
hodlminer \-o stratum+tcp://hodl.blockquarry.com:8332 \-u user.worker -p x -q
.RE
.fi
.PP
To mine to a local Hodlcoin instance running on port 18332,
authenticating with username "rpcuser" and password "rpcpass":
.PP
.nf
.RS
hodlminer \-a hodl \-o http://localhost:18332 \-O rpcuser:rpcpass \\
\-\-coinbase\-addr=mpXwg4jMtRhuSpVq4xS3HFHmCmWp9NyGKt
.RE
.fi
.PP
.SH OPTIONS
.TP
\fB\-a\fR, \fB\-\-algo\fR=\fIALGORITHM\fR
Set the hashing algorithm to use.
Default is hodl.
Possible values are:
.RS 11
.TP 10
.B hodl
.TP
\fB\-\-benchmark\fR
Run in offline benchmark mode.
.TP
\fB\-B\fR, \fB\-\-background\fR
Run in the background as a daemon.
.TP
\fB\-\-cert\fR=\fIFILE\fR
Set an SSL certificate to use with the mining server.
Only supported when using the HTTPS protocol.
.TP
\fB\-\-coinbase\-addr\fR=\fIADDRESS\fR
Set a payout address for solo mining.
This is only used in getblocktemplate mode,
and only if the server does not provide a coinbase transaction.
.TP
\fB\-\-coinbase\-sig\fR=\fITEXT\fR
Set a string to be included in the coinbase (if allowed by the server).
This is only used in getblocktemplate mode.
.TP
\fB\-c\fR, \fB\-\-config\fR=\fIFILE\fR
Load options from a configuration file.
\fIFILE\fR must contain a JSON object
mapping long options to their arguments (as strings),
or to \fBtrue\fR if no argument is required.
Sample configuration file:
.nf
{
"url": "stratum+tcp://hodl.blockquarry.com:8332",
"userpass": "foo:bar",
"retry-pause": "10",
"quiet": true
}
.fi
.TP
\fB\-D\fR, \fB\-\-debug\fR
Enable debug output.
.TP
\fB\-h\fR, \fB\-\-help\fR
Print a help message and exit.
.TP
\fB\-\-no\-gbt\fR
Do not use the getblocktemplate RPC method.
.TP
\fB\-\-no\-getwork\fR
Do not use the getwork RPC method.
.TP
\fB\-\-no\-longpoll\fR
Do not use long polling.
.TP
\fB\-\-no\-redirect\fR
Ignore requests from the server to switch to a different URL.
.TP
\fB\-\-no\-stratum\fR
Do not switch to Stratum, even if the server advertises support for it.
.TP
\fB\-o\fR, \fB\-\-url\fR=[\fISCHEME\fR://][\fIUSERNAME\fR[:\fIPASSWORD\fR]@]\fIHOST\fR:\fIPORT\fR[/\fIPATH\fR]
Set the URL of the mining server to connect to.
Supported schemes are \fBhttp\fR, \fBhttps\fR, \fBstratum+tcp\fR
and \fBstratum+tcps\fR.
If no scheme is specified, http is assumed.
Specifying a \fIPATH\fR is only supported for HTTP and HTTPS.
Specifying credentials has the same effect as using the \fB\-O\fR option.
By default, on HTTP and HTTPS,
the miner tries to use the getblocktemplate RPC method,
and falls back to using getwork if getblocktemplate is unavailable.
This behavior can be modified by using the \fB\-\-no\-gbt\fR
and \fB\-\-no\-getwork\fR options.
.TP
\fB\-O\fR, \fB\-\-userpass\fR=\fIUSERNAME\fR:\fIPASSWORD\fR
Set the credentials to use for connecting to the mining server.
Any value previously set with \fB\-u\fR or \fB\-p\fR is discarded.
.TP
\fB\-p\fR, \fB\-\-pass\fR=\fIPASSWORD\fR
Set the password to use for connecting to the mining server.
Any password previously set with \fB\-O\fR is discarded.
.TP
\fB\-P\fR, \fB\-\-protocol\-dump\fR
Enable output of all protocol-level activities.
.TP
\fB\-q\fR, \fB\-\-quiet\fR
Disable per-thread hashmeter output.
.TP
\fB\-r\fR, \fB\-\-retries\fR=\fIN\fR
Set the maximum number of times to retry if a network call fails.
If not specified, the miner will retry indefinitely.
.TP
\fB\-R\fR, \fB\-\-retry\-pause\fR=\fISECONDS\fR
Set how long to wait between retries. Default is 30 seconds.
.TP
\fB\-s\fR, \fB\-\-scantime\fR=\fISECONDS\fR
Set an upper bound on the time the miner can go without fetching fresh work.
This setting has no effect in Stratum mode or when long polling is activated.
Default is 5 seconds.
.TP
\fB\-S\fR, \fB\-\-syslog\fR
Log to the syslog facility instead of standard error.
.TP
\fB\-t\fR, \fB\-\-threads\fR=\fIN\fR
Set the number of miner threads.
If not specified, the miner will try to detect the number of available processors
and use that.
.TP
\fB\-T\fR, \fB\-\-timeout\fR=\fISECONDS\fR
Set a timeout for long polling.
.TP
\fB\-u\fR, \fB\-\-user\fR=\fIUSERNAME\fR
Set the username to use for connecting to the mining server.
Any username previously set with \fB\-O\fR is discarded.
.TP
\fB\-V\fR, \fB\-\-version\fR
Display version information and quit.
.TP
\fB\-x\fR, \fB\-\-proxy\fR=[\fISCHEME\fR://][\fIUSERNAME\fR:\fIPASSWORD\fR@]\fIHOST\fR:\fIPORT\fR
Connect to the mining server through a proxy.
Supported schemes are: \fBhttp\fR, \fBsocks4\fR, \fBsocks5\fR.
Since libcurl 7.18.0, the following are also supported:
\fBsocks4a\fR, \fBsocks5h\fR (SOCKS5 with remote name resolving).
If no scheme is specified, the proxy is treated as an HTTP proxy.
.SH ENVIRONMENT
The following environment variables can be specified in lower case or upper case;
the lower-case version has precedence. \fBhttp_proxy\fR is an exception
as it is only available in lower case.
.PP
.RS
.TP
\fBhttp_proxy\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use for HTTP.
.TP
\fBHTTPS_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use for HTTPS.
.TP
\fBALL_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use if no protocol-specific proxy is set.
.RE
.PP
Using an environment variable to set the proxy has the same effect as
using the \fB\-x\fR option.
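.PP
For example, to route all pool traffic through a local SOCKS5 proxy
(proxy address assumed for illustration):
.PP
.nf
.RS
ALL_PROXY=socks5://127.0.0.1:1080 hodlminer \-o stratum+tcp://hodl.blockquarry.com:8332 \-u user.worker \-p x
.RE
.fi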
.SH AUTHOR
Most of the code in the current version of minerd was written by
Pooler <pooler@litecoinpool.org> with contributions from others.
The original minerd was written by Jeff Garzik <jeff@garzik.org>.

View File

@@ -1,50 +0,0 @@
#ifndef _SHA512_H
#define _SHA512_H
#include <stdint.h>
#include "simd-utils.h"
//SHA-512 block size
#define SHA512_BLOCK_SIZE 128
//SHA-512 digest size
#define SHA512_DIGEST_SIZE 64
/*
#ifndef __AVX2__
#ifndef __AVX__
#error "Either AVX or AVX2 supported needed"
#endif // __AVX__
#endif // __AVX2__
*/
typedef struct
{
#ifdef __AVX2__
__m256i h[8];
__m256i w[80];
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
v128_t h[8];
v128_t w[80];
#else
int dummy;
#endif
} Sha512Context;
#ifdef __AVX2__
#define SHA512_PARALLEL_N 8
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
#define SHA512_PARALLEL_N 4
#else
#define SHA512_PARALLEL_N 1 // dummy value
#endif
//SHA-512 related functions
void sha512Compute32b_parallel(
uint64_t *data[SHA512_PARALLEL_N],
uint64_t *digest[SHA512_PARALLEL_N]);
void sha512ProcessBlock(Sha512Context context[2] );
#endif

View File

@@ -1,235 +0,0 @@
#ifndef __AVX2__
#if defined(__SSE4_2__)
//#ifdef __AVX__
//Dependencies
#include <string.h>
#include <stdlib.h>
#ifdef __FreeBSD__
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"
#include "sha512-avx.h"
#if ((defined(_WIN64) || defined(__WINDOWS__)))
#include "hodl-endian.h"
#endif
//SHA-512 auxiliary functions
#define Ch(x, y, z) (((x) & (y)) | (~(x) & (z)))
#define Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define SIGMA1(x) (ROR64(x, 28) ^ ROR64(x, 34) ^ ROR64(x, 39))
#define SIGMA2(x) (ROR64(x, 14) ^ ROR64(x, 18) ^ ROR64(x, 41))
#define SIGMA3(x) (ROR64(x, 1) ^ ROR64(x, 8) ^ SHR64(x, 7))
#define SIGMA4(x) (ROR64(x, 19) ^ ROR64(x, 61) ^ SHR64(x, 6))
//Rotate right operation
#define ROR64(a, n) _mm_or_si128(_mm_srli_epi64(a, n), _mm_slli_epi64(a, 64 - n))
//Shift right operation
#define SHR64(a, n) _mm_srli_epi64(a, n)
__m128i mm_htobe_epi64(__m128i a) {
__m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
return _mm_shuffle_epi8(a, mask);
}
__m128i mm_betoh_epi64(__m128i a) {
return mm_htobe_epi64(a);
}
//SHA-512 padding
static const uint8_t padding[128] =
{
0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
//SHA-512 constants
static const uint64_t k[80] =
{
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
void sha512Compute32b_parallel(uint64_t *data[SHA512_PARALLEL_N], uint64_t *digest[SHA512_PARALLEL_N]) {
Sha512Context context[2];
context[0].h[0] = _mm_set1_epi64x(0x6A09E667F3BCC908);
context[0].h[1] = _mm_set1_epi64x(0xBB67AE8584CAA73B);
context[0].h[2] = _mm_set1_epi64x(0x3C6EF372FE94F82B);
context[0].h[3] = _mm_set1_epi64x(0xA54FF53A5F1D36F1);
context[0].h[4] = _mm_set1_epi64x(0x510E527FADE682D1);
context[0].h[5] = _mm_set1_epi64x(0x9B05688C2B3E6C1F);
context[0].h[6] = _mm_set1_epi64x(0x1F83D9ABFB41BD6B);
context[0].h[7] = _mm_set1_epi64x(0x5BE0CD19137E2179);
context[1].h[0] = _mm_set1_epi64x(0x6A09E667F3BCC908);
context[1].h[1] = _mm_set1_epi64x(0xBB67AE8584CAA73B);
context[1].h[2] = _mm_set1_epi64x(0x3C6EF372FE94F82B);
context[1].h[3] = _mm_set1_epi64x(0xA54FF53A5F1D36F1);
context[1].h[4] = _mm_set1_epi64x(0x510E527FADE682D1);
context[1].h[5] = _mm_set1_epi64x(0x9B05688C2B3E6C1F);
context[1].h[6] = _mm_set1_epi64x(0x1F83D9ABFB41BD6B);
context[1].h[7] = _mm_set1_epi64x(0x5BE0CD19137E2179);
for(int i=0; i<4; ++i) {
context[0].w[i] = _mm_set_epi64x ( data[1][i], data[0][i] );
context[1].w[i] = _mm_set_epi64x ( data[3][i], data[2][i] );
}
for(int i=0; i<10; ++i) {
context[0].w[i+4] = _mm_set1_epi64x( ((uint64_t*)padding)[i] );
context[1].w[i+4] = _mm_set1_epi64x( ((uint64_t*)padding)[i] );
}
//Length of the original message (before padding)
uint64_t totalSize = 32 * 8;
//Append the length of the original message
context[0].w[14] = _mm_set1_epi64x(0);
context[0].w[15] = _mm_set1_epi64x(htobe64(totalSize));
context[1].w[14] = _mm_set1_epi64x(0);
context[1].w[15] = _mm_set1_epi64x(htobe64(totalSize));
//Calculate the message digest
sha512ProcessBlock(context);
//Convert from host byte order to big-endian byte order
for (int i = 0; i < 8; i++) {
context[0].h[i] = mm_htobe_epi64(context[0].h[i]);
context[1].h[i] = mm_htobe_epi64(context[1].h[i]);
}
//Copy the resulting digest
for(int i=0; i<8; ++i) {
digest[0][i] = _mm_extract_epi64(context[0].h[i], 0);
digest[1][i] = _mm_extract_epi64(context[0].h[i], 1);
digest[2][i] = _mm_extract_epi64(context[1].h[i], 0);
digest[3][i] = _mm_extract_epi64(context[1].h[i], 1);
}
}
#define blk0(n, i) (block[n][i] = mm_betoh_epi64(block[n][i]))
#define blk(n, i) (block[n][i] = block[n][i - 16] + SIGMA3(block[n][i - 15]) + \
SIGMA4(block[n][i - 2]) + block[n][i - 7])
#define ROUND512(a,b,c,d,e,f,g,h) \
T0 += (h[0]) + SIGMA2(e[0]) + Ch((e[0]), (f[0]), (g[0])) + k[i]; \
T1 += (h[1]) + SIGMA2(e[1]) + Ch((e[1]), (f[1]), (g[1])) + k[i]; \
(d[0]) += T0; \
(d[1]) += T1; \
(h[0]) = T0 + SIGMA1(a[0]) + Maj((a[0]), (b[0]), (c[0])); \
(h[1]) = T1 + SIGMA1(a[1]) + Maj((a[1]), (b[1]), (c[1])); \
i++
#define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \
T0 = blk0(0, i); \
T1 = blk0(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define ROUND512_16_TO_80(a,b,c,d,e,f,g,h) \
T0 = blk(0, i); \
T1 = blk(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define R512_0 \
ROUND512_0_TO_15(a, b, c, d, e, f, g, h); \
ROUND512_0_TO_15(h, a, b, c, d, e, f, g); \
ROUND512_0_TO_15(g, h, a, b, c, d, e, f); \
ROUND512_0_TO_15(f, g, h, a, b, c, d, e); \
ROUND512_0_TO_15(e, f, g, h, a, b, c, d); \
ROUND512_0_TO_15(d, e, f, g, h, a, b, c); \
ROUND512_0_TO_15(c, d, e, f, g, h, a, b); \
ROUND512_0_TO_15(b, c, d, e, f, g, h, a)
#define R512_16 \
ROUND512_16_TO_80(a, b, c, d, e, f, g, h); \
ROUND512_16_TO_80(h, a, b, c, d, e, f, g); \
ROUND512_16_TO_80(g, h, a, b, c, d, e, f); \
ROUND512_16_TO_80(f, g, h, a, b, c, d, e); \
ROUND512_16_TO_80(e, f, g, h, a, b, c, d); \
ROUND512_16_TO_80(d, e, f, g, h, a, b, c); \
ROUND512_16_TO_80(c, d, e, f, g, h, a, b); \
ROUND512_16_TO_80(b, c, d, e, f, g, h, a)
#define INIT(x,n) \
x[0] = context[0].h[n]; \
x[1] = context[1].h[n]; \
void sha512ProcessBlock(Sha512Context context[2])
{
__m128i* block[2];
block[0] = context[0].w;
block[1] = context[1].w;
__m128i T0, T1;
__m128i a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2];
INIT(a, 0)
INIT(b, 1)
INIT(c, 2)
INIT(d, 3)
INIT(e, 4)
INIT(f, 5)
INIT(g, 6)
INIT(h, 7)
int i = 0;
R512_0; R512_0;
for(int j=0; j<8; ++j) {
R512_16;
}
context[0].h[0] += a[0];
context[0].h[1] += b[0];
context[0].h[2] += c[0];
context[0].h[3] += d[0];
context[0].h[4] += e[0];
context[0].h[5] += f[0];
context[0].h[6] += g[0];
context[0].h[7] += h[0];
context[1].h[0] += a[1];
context[1].h[1] += b[1];
context[1].h[2] += c[1];
context[1].h[3] += d[1];
context[1].h[4] += e[1];
context[1].h[5] += f[1];
context[1].h[6] += g[1];
context[1].h[7] += h[1];
}
#endif // __AVX__
#endif // __AVX2__

View File

@@ -1,241 +0,0 @@
#ifdef __AVX2__
//Dependencies
#include <string.h>
#include <stdlib.h>
#ifdef __FreeBSD__
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"
#include "immintrin.h"
#include "sha512-avx.h"
#if ((defined(_WIN64) || defined(__WINDOWS__)))
#include "hodl-endian.h"
#endif
//SHA-512 auxiliary functions
#define Ch(x, y, z) (((x) & (y)) | (~(x) & (z)))
#define Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define SIGMA1(x) (ROR64(x, 28) ^ ROR64(x, 34) ^ ROR64(x, 39))
#define SIGMA2(x) (ROR64(x, 14) ^ ROR64(x, 18) ^ ROR64(x, 41))
#define SIGMA3(x) (ROR64(x, 1) ^ ROR64(x, 8) ^ SHR64(x, 7))
#define SIGMA4(x) (ROR64(x, 19) ^ ROR64(x, 61) ^ SHR64(x, 6))
//Rotate right operation
#define ROR64(a, n) _mm256_or_si256(_mm256_srli_epi64(a, n), _mm256_slli_epi64(a, 64 - n))
//Shift right operation
#define SHR64(a, n) _mm256_srli_epi64(a, n)
__m256i mm256_htobe_epi64(__m256i a) {
__m256i mask = _mm256_set_epi8(
24,25,26,27,28,29,30,31,
16,17,18,19,20,21,22,23,
8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7);
return _mm256_shuffle_epi8(a, mask);
}
__m256i mm256_betoh_epi64(__m256i a) {
return mm256_htobe_epi64(a);
}
//SHA-512 padding
static const uint8_t padding[128] =
{
0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
//SHA-512 constants
static const uint64_t k[80] =
{
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
void sha512Compute32b_parallel(uint64_t *data[SHA512_PARALLEL_N], uint64_t *digest[SHA512_PARALLEL_N]) {
Sha512Context context[2];
context[0].h[0] = _mm256_set1_epi64x(0x6A09E667F3BCC908);
context[0].h[1] = _mm256_set1_epi64x(0xBB67AE8584CAA73B);
context[0].h[2] = _mm256_set1_epi64x(0x3C6EF372FE94F82B);
context[0].h[3] = _mm256_set1_epi64x(0xA54FF53A5F1D36F1);
context[0].h[4] = _mm256_set1_epi64x(0x510E527FADE682D1);
context[0].h[5] = _mm256_set1_epi64x(0x9B05688C2B3E6C1F);
context[0].h[6] = _mm256_set1_epi64x(0x1F83D9ABFB41BD6B);
context[0].h[7] = _mm256_set1_epi64x(0x5BE0CD19137E2179);
context[1].h[0] = _mm256_set1_epi64x(0x6A09E667F3BCC908);
context[1].h[1] = _mm256_set1_epi64x(0xBB67AE8584CAA73B);
context[1].h[2] = _mm256_set1_epi64x(0x3C6EF372FE94F82B);
context[1].h[3] = _mm256_set1_epi64x(0xA54FF53A5F1D36F1);
context[1].h[4] = _mm256_set1_epi64x(0x510E527FADE682D1);
context[1].h[5] = _mm256_set1_epi64x(0x9B05688C2B3E6C1F);
context[1].h[6] = _mm256_set1_epi64x(0x1F83D9ABFB41BD6B);
context[1].h[7] = _mm256_set1_epi64x(0x5BE0CD19137E2179);
for(int i=0; i<4; ++i) {
context[0].w[i] = _mm256_set_epi64x ( data[3][i], data[2][i], data[1][i], data[0][i] );
context[1].w[i] = _mm256_set_epi64x ( data[7][i], data[6][i], data[5][i], data[4][i] );
}
for(int i=0; i<10; ++i) {
context[0].w[i+4] = _mm256_set1_epi64x( ((uint64_t*)padding)[i] );
context[1].w[i+4] = _mm256_set1_epi64x( ((uint64_t*)padding)[i] );
}
//Length of the original message (before padding)
uint64_t totalSize = 32 * 8;
//Append the length of the original message
context[0].w[14] = _mm256_set1_epi64x(0);
context[0].w[15] = _mm256_set1_epi64x(htobe64(totalSize));
context[1].w[14] = _mm256_set1_epi64x(0);
context[1].w[15] = _mm256_set1_epi64x(htobe64(totalSize));
//Calculate the message digest
sha512ProcessBlock(context);
//Convert from host byte order to big-endian byte order
for (int i = 0; i < 8; i++) {
context[0].h[i] = mm256_htobe_epi64(context[0].h[i]);
context[1].h[i] = mm256_htobe_epi64(context[1].h[i]);
}
//Copy the resulting digest
for(int i=0; i<8; ++i) {
digest[0][i] = _mm256_extract_epi64(context[0].h[i], 0);
digest[1][i] = _mm256_extract_epi64(context[0].h[i], 1);
digest[2][i] = _mm256_extract_epi64(context[0].h[i], 2);
digest[3][i] = _mm256_extract_epi64(context[0].h[i], 3);
digest[4][i] = _mm256_extract_epi64(context[1].h[i], 0);
digest[5][i] = _mm256_extract_epi64(context[1].h[i], 1);
digest[6][i] = _mm256_extract_epi64(context[1].h[i], 2);
digest[7][i] = _mm256_extract_epi64(context[1].h[i], 3);
}
}
#define blk0(n, i) (block[n][i] = mm256_betoh_epi64(block[n][i]))
#define blk(n, i) (block[n][i] = block[n][i - 16] + SIGMA3(block[n][i - 15]) + \
SIGMA4(block[n][i - 2]) + block[n][i - 7])
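// Message schedule: W[i] = W[i-16] + s0(W[i-15]) + W[i-7] + s1(W[i-2]);
// SIGMA3/SIGMA4 here play the roles of the SHA-512 small sigma0/sigma1 functions.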
#define ROUND512(a,b,c,d,e,f,g,h) \
T0 += (h[0]) + SIGMA2(e[0]) + Ch((e[0]), (f[0]), (g[0])) + k[i]; \
T1 += (h[1]) + SIGMA2(e[1]) + Ch((e[1]), (f[1]), (g[1])) + k[i]; \
(d[0]) += T0; \
(d[1]) += T1; \
(h[0]) = T0 + SIGMA1(a[0]) + Maj((a[0]), (b[0]), (c[0])); \
(h[1]) = T1 + SIGMA1(a[1]) + Maj((a[1]), (b[1]), (c[1])); \
i++
#define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \
T0 = blk0(0, i); \
T1 = blk0(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define ROUND512_16_TO_80(a,b,c,d,e,f,g,h) \
T0 = blk(0, i); \
T1 = blk(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define R512_0 \
ROUND512_0_TO_15(a, b, c, d, e, f, g, h); \
ROUND512_0_TO_15(h, a, b, c, d, e, f, g); \
ROUND512_0_TO_15(g, h, a, b, c, d, e, f); \
ROUND512_0_TO_15(f, g, h, a, b, c, d, e); \
ROUND512_0_TO_15(e, f, g, h, a, b, c, d); \
ROUND512_0_TO_15(d, e, f, g, h, a, b, c); \
ROUND512_0_TO_15(c, d, e, f, g, h, a, b); \
ROUND512_0_TO_15(b, c, d, e, f, g, h, a)
#define R512_16 \
ROUND512_16_TO_80(a, b, c, d, e, f, g, h); \
ROUND512_16_TO_80(h, a, b, c, d, e, f, g); \
ROUND512_16_TO_80(g, h, a, b, c, d, e, f); \
ROUND512_16_TO_80(f, g, h, a, b, c, d, e); \
ROUND512_16_TO_80(e, f, g, h, a, b, c, d); \
ROUND512_16_TO_80(d, e, f, g, h, a, b, c); \
ROUND512_16_TO_80(c, d, e, f, g, h, a, b); \
ROUND512_16_TO_80(b, c, d, e, f, g, h, a)
#define INIT(x,n) \
x[0] = context[0].h[n]; \
x[1] = context[1].h[n];
void sha512ProcessBlock(Sha512Context context[2])
{
__m256i* block[2];
block[0] = context[0].w;
block[1] = context[1].w;
__m256i T0, T1;
__m256i a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2];
INIT(a, 0)
INIT(b, 1)
INIT(c, 2)
INIT(d, 3)
INIT(e, 4)
INIT(f, 5)
INIT(g, 6)
INIT(h, 7)
int i = 0;
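    // Rounds 0-15 (two R512_0 blocks of 8) consume the message block directly;
    // the loop below runs the remaining 64 expanded-schedule rounds.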
R512_0; R512_0;
for(int j=0; j<8; ++j) {
R512_16;
}
context[0].h[0] += a[0];
context[0].h[1] += b[0];
context[0].h[2] += c[0];
context[0].h[3] += d[0];
context[0].h[4] += e[0];
context[0].h[5] += f[0];
context[0].h[6] += g[0];
context[0].h[7] += h[0];
context[1].h[0] += a[1];
context[1].h[1] += b[1];
context[1].h[2] += c[1];
context[1].h[3] += d[1];
context[1].h[4] += e[1];
context[1].h[5] += f[1];
context[1].h[6] += g[1];
context[1].h[7] += h[1];
}
#endif // __AVX2__


@@ -1,25 +0,0 @@
#ifndef __WOLF_AES_H
#define __WOLF_AES_H
#include <stdint.h>
#include "simd-utils.h"
void ExpandAESKey256(v128_t *keys, const v128_t *KeyBuf);
#if defined(__SSE4_2__)
//#ifdef __AVX__
#define AES_PARALLEL_N 8
#define BLOCK_COUNT 256
void AES256CBC( v128_t** data, const v128_t** next, v128_t ExpandedKey[][16],
v128_t* IV );
#else
void AES256CBC( v128_t *Ciphertext, const v128_t *Plaintext,
const v128_t *ExpandedKey, v128_t IV, uint32_t BlockCount );
#endif
#endif // __WOLF_AES_H


@@ -1,6 +1,8 @@
#include "cpuminer-config.h" #include "cpuminer-config.h"
#include "algo-gate-api.h" #include "algo-gate-api.h"
#if !defined(__APPLE__)
#include <gmp.h> #include <gmp.h>
#include <stdbool.h> #include <stdbool.h>
#include <stdlib.h> #include <stdlib.h>
@@ -296,8 +298,14 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
return 0; return 0;
} }
#endif // not apple
bool register_m7m_algo( algo_gate_t *gate ) bool register_m7m_algo( algo_gate_t *gate )
{ {
#if defined(__APPLE__)
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA_OPT; gate->optimizations = SHA_OPT;
init_m7m_ctx(); init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash; gate->scanhash = (void*)&scanhash_m7m_hash;
@@ -307,6 +315,6 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->set_work_data_endian = (void*)&set_work_data_big_endian; gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0; opt_target_factor = 65536.0;
return true; return true;
#endif
} }

algo/sha/sha1-hash.c Normal file

@@ -0,0 +1,390 @@
#include "simd-utils.h"
#include <stdint.h>
#include "sha1-hash.h"
#if defined(__x86_64__) && defined(__SHA__)
#define sha1_opt_rounds( state_out, data, state_in ) \
{ \
__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; \
__m128i MSG0, MSG1, MSG2, MSG3; \
\
ABCD = _mm_load_si128( (const __m128i*) state_in ); \
E0 = _mm_set_epi32( state_in[4], 0, 0, 0 ); \
ABCD = _mm_shuffle_epi32( ABCD, 0x1B ); \
\
ABCD_SAVE = ABCD; \
E0_SAVE = E0; \
\
/* Rounds 0-3 */ \
MSG0 = load_msg( data, 0 ); \
E0 = _mm_add_epi32( E0, MSG0 ); \
E1 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 0 ); \
\
/* Rounds 4-7 */ \
MSG1 = load_msg( data, 1 ); \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 0 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
\
/* Rounds 8-11 */ \
MSG2 = load_msg( data, 2 ); \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 0 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 12-15 */ \
MSG3 = load_msg( data, 3 ); \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 0 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 16-19 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 0 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 20-23 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 1 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 24-27 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 1 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 28-31 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 1 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 32-35 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 1 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 36-39 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 1 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 40-43 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 2 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 44-47 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 2 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 48-51 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 2 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 52-55 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 2 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 56-59 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 2 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 60-63 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 3 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 64-67 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 3 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 68-71 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 3 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 72-75 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 3 ); \
\
/* Rounds 76-79 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 3 ); \
\
/* Combine state */ \
E0 = _mm_sha1nexte_epu32( E0, E0_SAVE ); \
ABCD = _mm_add_epi32( ABCD, ABCD_SAVE ); \
\
/* Save state */ \
ABCD = _mm_shuffle_epi32( ABCD, 0x1B ); \
_mm_store_si128( (__m128i*) state_out, ABCD ); \
state_out[4] = _mm_extract_epi32( E0, 3 ); \
}
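// load_msg is intentionally left undefined by the macro above: each wrapper
// below binds it to either a plain load or a byte-swapping load, so the same
// round body serves both little- and big-endian input.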
void sha1_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) casti_v128( m, i )
sha1_opt_rounds( state_out, input, state_in );
#undef load_msg
}
void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
const __m128i MASK = _mm_set_epi64x( 0x0001020304050607ULL,
0x08090a0b0c0d0e0fULL );
#define load_msg( m, i ) _mm_shuffle_epi8( casti_v128( m, i ), MASK )
sha1_opt_rounds( state_out, input, state_in );
#undef load_msg
}
#endif
#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA2)
#define sha1_neon_rounds( state_out, data, state_in ) \
{ \
uint32x4_t ABCD, ABCD_SAVED; \
uint32x4_t TMP0, TMP1; \
uint32x4_t MSG0, MSG1, MSG2, MSG3; \
uint32_t E0, E0_SAVED, E1; \
\
/* Load state */ \
ABCD = vld1q_u32( &state_in[0] ); \
E0 = state_in[4]; \
\
/* Save state */ \
ABCD_SAVED = ABCD; \
E0_SAVED = E0; \
\
MSG0 = load_msg( data, 0 ); \
MSG1 = load_msg( data, 1 ); \
MSG2 = load_msg( data, 2 ); \
MSG3 = load_msg( data, 3 ); \
\
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x5A827999 ) ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x5A827999 ) ); \
\
/* Rounds 0-3 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x5A827999 ) ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 4-7 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32(ABCD, E1, TMP1); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0x5A827999 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 8-11 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x5A827999 ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 12-15 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 16-19 */\
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 20-23 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 24-27 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 28-31 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 32-35 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 36-39 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 40-43 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 44-47 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 48-51 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 52-55 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 56-59 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 60-63 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 64-67 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32(MSG2, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 68-71 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
\
/* Rounds 72-75 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
\
/* Rounds 76-79 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
\
/* Combine state */ \
E0 += E0_SAVED; \
ABCD = vaddq_u32( ABCD_SAVED, ABCD ); \
\
/* Save state */ \
vst1q_u32( &state_out[0], ABCD ); \
state_out[4] = E0; \
}
void sha1_neon_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) )
sha1_neon_rounds( state_out, input, state_in );
#undef load_msg
}
void sha1_neon_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) casti_v128( m, i )
sha1_neon_rounds( state_out, input, state_in );
#undef load_msg
}
#endif

algo/sha/sha1-hash.h Normal file

@@ -0,0 +1,40 @@
#ifndef SHA1_HASH_H__
#define SHA1_HASH_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#include "cpuminer-config.h"
#include "sph_sha1.h"
// SHA hooks for SHA-1, automatically substituted in SPH
#if defined(__x86_64__) && defined(__SHA__)
void sha1_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
#define sha1_transform_le sha1_x86_sha_transform_le
#define sha1_transform_be sha1_x86_sha_transform_be
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
void sha1_neon_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha1_neon_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
#define sha1_transform_le sha1_neon_sha_transform_le
#define sha1_transform_be sha1_neon_sha_transform_be
#else
#define sha1_transform_le sph_sha1_transform_le
#define sha1_transform_be sph_sha1_transform_be
#endif
#define sha1_full sph_sha1_full
#endif
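Whichever backend the preprocessor selects, callers see the same three-argument transform. A minimal sketch of driving it directly (illustrative only; compress_one_block is a hypothetical helper, and state_out may alias state_in just as sha1.c itself does):

   #include <stdint.h>
   #include "sha1-hash.h"

   static void compress_one_block( uint32_t state[5], const uint8_t block[64] )
   {
      // One compression of a 64-byte big-endian message block, updating the
      // five-word SHA-1 state in place.
      sha1_transform_be( state, block, state );
   }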

algo/sha/sha1.c Normal file

@@ -0,0 +1,400 @@
/* $Id: sha1.c 216 2010-06-08 09:46:57Z tp $ */
/*
* SHA-1 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "simd-utils.h"
#include "sha1-hash.h"
#define F(B, C, D) ((((C) ^ (D)) & (B)) ^ (D))
#define G(B, C, D) ((B) ^ (C) ^ (D))
#define H(B, C, D) (((D) & (C)) | (((D) | (C)) & (B)))
#define I(B, C, D) G(B, C, D)
#define ROTL rol32
//#define ROTL SPH_ROTL32
#define K1 SPH_C32(0x5A827999)
#define K2 SPH_C32(0x6ED9EBA1)
#define K3 SPH_C32(0x8F1BBCDC)
#define K4 SPH_C32(0xCA62C1D6)
static const sph_u32 IV[5] = {
SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
SPH_C32(0x98BADCFE), SPH_C32(0x10325476),
SPH_C32(0xC3D2E1F0)
};
/*
* This macro defines the body for a SHA-1 compression function
* implementation. The "in" parameter should evaluate, when applied to a
* numerical input parameter from 0 to 15, to an expression which yields
* the corresponding input block. The "r" parameter should evaluate to
* an array or pointer expression designating the array of 5 words which
* contains the input and output of the compression function.
*/
#define SHA1_ROUND_BODY(in, r) do { \
sph_u32 A, B, C, D, E; \
sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
\
A = (r)[0]; \
B = (r)[1]; \
C = (r)[2]; \
D = (r)[3]; \
E = (r)[4]; \
\
W00 = in(0); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W00 + K1); \
B = ROTL(B, 30); \
W01 = in(1); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W01 + K1); \
A = ROTL(A, 30); \
W02 = in(2); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W02 + K1); \
E = ROTL(E, 30); \
W03 = in(3); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W03 + K1); \
D = ROTL(D, 30); \
W04 = in(4); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W04 + K1); \
C = ROTL(C, 30); \
W05 = in(5); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W05 + K1); \
B = ROTL(B, 30); \
W06 = in(6); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W06 + K1); \
A = ROTL(A, 30); \
W07 = in(7); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W07 + K1); \
E = ROTL(E, 30); \
W08 = in(8); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W08 + K1); \
D = ROTL(D, 30); \
W09 = in(9); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W09 + K1); \
C = ROTL(C, 30); \
W10 = in(10); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W10 + K1); \
B = ROTL(B, 30); \
W11 = in(11); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W11 + K1); \
A = ROTL(A, 30); \
W12 = in(12); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W12 + K1); \
E = ROTL(E, 30); \
W13 = in(13); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W13 + K1); \
D = ROTL(D, 30); \
W14 = in(14); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W14 + K1); \
C = ROTL(C, 30); \
W15 = in(15); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W15 + K1); \
B = ROTL(B, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W00 + K1); \
A = ROTL(A, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W01 + K1); \
E = ROTL(E, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W02 + K1); \
D = ROTL(D, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W03 + K1); \
C = ROTL(C, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W04 + K2); \
B = ROTL(B, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W05 + K2); \
A = ROTL(A, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W06 + K2); \
E = ROTL(E, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W07 + K2); \
D = ROTL(D, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W08 + K2); \
C = ROTL(C, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W09 + K2); \
B = ROTL(B, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W10 + K2); \
A = ROTL(A, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W11 + K2); \
E = ROTL(E, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W12 + K2); \
D = ROTL(D, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W13 + K2); \
C = ROTL(C, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W14 + K2); \
B = ROTL(B, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W15 + K2); \
A = ROTL(A, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W00 + K2); \
E = ROTL(E, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W01 + K2); \
D = ROTL(D, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W02 + K2); \
C = ROTL(C, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W03 + K2); \
B = ROTL(B, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W04 + K2); \
A = ROTL(A, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W05 + K2); \
E = ROTL(E, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W06 + K2); \
D = ROTL(D, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W07 + K2); \
C = ROTL(C, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W08 + K3); \
B = ROTL(B, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W09 + K3); \
A = ROTL(A, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W10 + K3); \
E = ROTL(E, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W11 + K3); \
D = ROTL(D, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W12 + K3); \
C = ROTL(C, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W13 + K3); \
B = ROTL(B, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W14 + K3); \
A = ROTL(A, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W15 + K3); \
E = ROTL(E, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W00 + K3); \
D = ROTL(D, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W01 + K3); \
C = ROTL(C, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W02 + K3); \
B = ROTL(B, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W03 + K3); \
A = ROTL(A, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W04 + K3); \
E = ROTL(E, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W05 + K3); \
D = ROTL(D, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W06 + K3); \
C = ROTL(C, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W07 + K3); \
B = ROTL(B, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W08 + K3); \
A = ROTL(A, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W09 + K3); \
E = ROTL(E, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W10 + K3); \
D = ROTL(D, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W11 + K3); \
C = ROTL(C, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W12 + K4); \
B = ROTL(B, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W13 + K4); \
A = ROTL(A, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W14 + K4); \
E = ROTL(E, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W15 + K4); \
D = ROTL(D, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W00 + K4); \
C = ROTL(C, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W01 + K4); \
B = ROTL(B, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W02 + K4); \
A = ROTL(A, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W03 + K4); \
E = ROTL(E, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W04 + K4); \
D = ROTL(D, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W05 + K4); \
C = ROTL(C, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W06 + K4); \
B = ROTL(B, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W07 + K4); \
A = ROTL(A, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W08 + K4); \
E = ROTL(E, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W09 + K4); \
D = ROTL(D, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W10 + K4); \
C = ROTL(C, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W11 + K4); \
B = ROTL(B, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W12 + K4); \
A = ROTL(A, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W13 + K4); \
E = ROTL(E, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W14 + K4); \
D = ROTL(D, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W15 + K4); \
C = ROTL(C, 30); \
\
(r)[0] = SPH_T32(r[0] + A); \
(r)[1] = SPH_T32(r[1] + B); \
(r)[2] = SPH_T32(r[2] + C); \
(r)[3] = SPH_T32(r[3] + D); \
(r)[4] = SPH_T32(r[4] + E); \
} while (0)
/*
* One round of SHA-1. The data must be aligned for 32-bit access.
*/
#if ( defined(__x86_64__) && defined(__SHA__) ) || ( defined(__aarch64__) && defined(__ARM_FEATURE_SHA2) )
static void
sha1_round( const unsigned char *data, sph_u32 r[5] )
{
sha1_transform_be( (uint32_t*)r, (uint32_t*)data, (const uint32_t*)r );
}
#else
static void
sha1_round( const unsigned char *data, sph_u32 r[5] )
{
#define SHA1_IN(x) sph_dec32be_aligned(data + (4 * (x)))
SHA1_ROUND_BODY(SHA1_IN, r);
#undef SHA1_IN
}
#endif
/* see sph_sha1.h */
void
sph_sha1_init(void *cc)
{
sph_sha1_context *sc;
sc = cc;
memcpy(sc->val, IV, sizeof IV);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
#define RFUN sha1_round
#define HASH sha1
#define BE32 1
#include "md_helper.c"
/* see sph_sha1.h */
void
sph_sha1_close(void *cc, void *dst)
{
sha1_close(cc, dst, 5);
sph_sha1_init(cc);
}
/* see sph_sha1.h */
void
sph_sha1_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
sha1_addbits_and_close(cc, ub, n, dst, 5);
sph_sha1_init(cc);
}
/* see sph_sha1.h */
void
sph_sha1_comp(const sph_u32 msg[16], sph_u32 val[5])
{
#define SHA1_IN(x) msg[x]
SHA1_ROUND_BODY(SHA1_IN, val);
#undef SHA1_IN
}
void sph_sha1_full( void *hash, const void *msg, size_t len )
{
sph_sha1_context cc;
sph_sha1_init( &cc );
sph_sha1( &cc, msg, len );
sph_sha1_close( &cc, hash );
}
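As a quick sanity check of the one-shot wrapper, the FIPS 180 "abc" test vector can be verified like this (illustrative sketch, not part of the source; sha1_self_test is a hypothetical name):

   #include <assert.h>
   #include <stdint.h>
   #include <string.h>
   #include "sph_sha1.h"

   static void sha1_self_test( void )
   {
      // SHA-1("abc") = a9993e364706816aba3e25717850c26c9cd0d89d
      static const uint8_t expected[20] =
         { 0xa9, 0x99, 0x3e, 0x36, 0x47, 0x06, 0x81, 0x6a, 0xba, 0x3e,
           0x25, 0x71, 0x78, 0x50, 0xc2, 0x6c, 0x9c, 0xd0, 0xd8, 0x9d };
      uint8_t digest[20];
      sph_sha1_full( digest, "abc", 3 );
      assert( memcmp( digest, expected, sizeof expected ) == 0 );
   }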

algo/sha/sph_sha1.h Normal file

@@ -0,0 +1,133 @@
/* $Id: sph_sha1.h 216 2010-06-08 09:46:57Z tp $ */
/**
* SHA-1 interface.
*
* SHA-1 is described in FIPS 180-1 (now superseded by FIPS 180-2, but the
* description of SHA-1 is still included and has not changed). FIPS
* standards can be found at: http://csrc.nist.gov/publications/fips/
*
* @warning A theoretical collision attack against SHA-1, with work
* factor 2^63, has been published. SHA-1 should not be used in new
* protocol designs.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_sha1.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_SHA1_H__
#define SPH_SHA1_H__
#include <stddef.h>
#include "compat/sph_types.h"
/**
* Output size (in bits) for SHA-1.
*/
#define SPH_SIZE_sha1 160
/**
* This structure is a context for SHA-1 computations: it contains the
* intermediate values and some data from the last entered block. Once
* a SHA-1 computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running SHA-1 computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
sph_u32 val[5];
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_sha1_context;
/**
* Initialize a SHA-1 context. This process performs no memory allocation.
*
* @param cc the SHA-1 context (pointer to a <code>sph_sha1_context</code>)
*/
void sph_sha1_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHA-1 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_sha1(void *cc, const void *data, size_t len);
/**
* Terminate the current SHA-1 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
 * accommodate the result (20 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHA-1 context
* @param dst the destination buffer
*/
void sph_sha1_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
 * be wide enough to accommodate the result (20 bytes). If bit number i
 * in <code>ub</code> has value 2^i, then the extra bits are those
 * numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHA-1 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_sha1_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
/**
* Apply the SHA-1 compression function on the provided data. The
* <code>msg</code> parameter contains the 16 32-bit input blocks,
* as numerical values (hence after the big-endian decoding). The
* <code>val</code> parameter contains the 5 32-bit input blocks for
* the compression function; the output is written in place in this
* array.
*
* @param msg the message block (16 values)
* @param val the function 160-bit input and output
*/
void sph_sha1_comp(const sph_u32 msg[16], sph_u32 val[5]);
void sph_sha1_full( void *hash, const void *msg, size_t len );
#endif
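The incremental interface follows the usual sph pattern; a minimal usage sketch (output buffer must be at least 20 bytes):

   sph_sha1_context ctx;
   unsigned char digest[20];

   sph_sha1_init( &ctx );
   sph_sha1( &ctx, "hello ", 6 );   // data may be fed in arbitrary-size pieces
   sph_sha1( &ctx, "world", 5 );
   sph_sha1_close( &ctx, digest );  // writes 20 bytes; ctx is reinitialized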


@@ -60,7 +60,6 @@ static const sph_u32 IV512[] = {
 static void
 c512( sph_shavite_big_context *sc, const void *msg )
 {
-   const v128_t zero = v128_zero;
    v128_t p0, p1, p2, p3, x;
    v128_t k00, k01, k02, k03, k10, k11, k12, k13;
    v128_t *m = (v128_t*)msg;
@@ -76,39 +75,39 @@ c512( sph_shavite_big_context *sc, const void *msg )
    k00 = m[0];
    x = v128_xor( p1, k00 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
    k01 = m[1];
    x = v128_xor( x, k01 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
    k02 = m[2];
    x = v128_xor( x, k02 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
    k03 = m[3];
    x = v128_xor( x, k03 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
    p0 = v128_xor( p0, x );
    k10 = m[4];
    x = v128_xor( p3, k10 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
    k11 = m[5];
    x = v128_xor( x, k11 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
    k12 = m[6];
    x = v128_xor( x, k12 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
    k13 = m[7];
    x = v128_xor( x, k13 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
    p2 = v128_xor( p2, x );
    for ( r = 0; r < 3; r ++ )
    {
       // round 1, 5, 9
-      k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
+      k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
       k00 = v128_xor( k00, k13 );
       if ( r == 0 )
@@ -116,8 +115,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
                   ~sc->count3, sc->count2, sc->count1, sc->count0 ) );
       x = v128_xor( p0, k00 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
+      k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
       k01 = v128_xor( k01, k00 );
       if ( r == 1 )
@@ -125,32 +124,32 @@ c512( sph_shavite_big_context *sc, const void *msg )
                   ~sc->count0, sc->count1, sc->count2, sc->count3 ) );
       x = v128_xor( x, k01 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
+      k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
       k02 = v128_xor( k02, k01 );
       x = v128_xor( x, k02 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
+      k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
       k03 = v128_xor( k03, k02 );
       x = v128_xor( x, k03 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       p3 = v128_xor( p3, x );
-      k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
+      k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
       k10 = v128_xor( k10, k03 );
       x = v128_xor( p2, k10 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
+      k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
       k11 = v128_xor( k11, k10 );
       x = v128_xor( x, k11 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
+      k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
       k12 = v128_xor( k12, k11 );
       x = v128_xor( x, k12 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
+      k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
       k13 = v128_xor( k13, k12 );
       if ( r == 2 )
@@ -158,78 +157,78 @@ c512( sph_shavite_big_context *sc, const void *msg )
                   ~sc->count1, sc->count0, sc->count3, sc->count2 ) );
       x = v128_xor( x, k13 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       p1 = v128_xor( p1, x );
       // round 2, 6, 10
       k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
       x = v128_xor( p3, k00 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
       x = v128_xor( x, k01 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
       x = v128_xor( x, k02 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
       x = v128_xor( x, k03 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       p2 = v128_xor( p2, x );
       k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
       x = v128_xor( p1, k10 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
       x = v128_xor( x, k11 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
       x = v128_xor( x, k12 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
       x = v128_xor( x, k13 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       p0 = v128_xor( p0, x );
       // round 3, 7, 11
-      k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
+      k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
       k00 = v128_xor( k00, k13 );
       x = v128_xor( p2, k00 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
+      k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
       k01 = v128_xor( k01, k00 );
       x = v128_xor( x, k01 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
+      k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
       k02 = v128_xor( k02, k01 );
       x = v128_xor( x, k02 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
+      k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
       k03 = v128_xor( k03, k02 );
       x = v128_xor( x, k03 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       p1 = v128_xor( p1, x );
-      k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
+      k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
       k10 = v128_xor( k10, k03 );
       x = v128_xor( p0, k10 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
+      k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
       k11 = v128_xor( k11, k10 );
       x = v128_xor( x, k11 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
+      k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
       k12 = v128_xor( k12, k11 );
       x = v128_xor( x, k12 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
-      k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
+      k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
       k13 = v128_xor( k13, k12 );
       x = v128_xor( x, k13 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       p3 = v128_xor( p3, x );
@@ -237,73 +236,73 @@ c512( sph_shavite_big_context *sc, const void *msg )
       k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
       x = v128_xor( p1, k00 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
       x = v128_xor( x, k01 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
       x = v128_xor( x, k02 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
       x = v128_xor( x, k03 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       p0 = v128_xor( p0, x );
       k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
       x = v128_xor( p3, k10 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
       x = v128_xor( x, k11 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
       x = v128_xor( x, k12 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
       x = v128_xor( x, k13 );
-      x = v128_aesenc( x, zero );
+      x = v128_aesenc_nokey( x );
       p2 = v128_xor( p2, x );
    }
    // round 13
-   k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
+   k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
    k00 = v128_xor( k00, k13 );
    x = v128_xor( p0, k00 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
-   k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
+   k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
    k01 = v128_xor( k01, k00 );
    x = v128_xor( x, k01 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
-   k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
+   k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
    k02 = v128_xor( k02, k01 );
    x = v128_xor( x, k02 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
-   k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
+   k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
    k03 = v128_xor( k03, k02 );
    x = v128_xor( x, k03 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
    p3 = v128_xor( p3, x );
-   k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
+   k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
    k10 = v128_xor( k10, k03 );
    x = v128_xor( p2, k10 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
-   k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
+   k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
    k11 = v128_xor( k11, k10 );
    x = v128_xor( x, k11 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
-   k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
+   k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
    k12 = v128_xor( k12, v128_xor( k11, v128_set32(
                ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
    x = v128_xor( x, k12 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
-   k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
+   k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
    k13 = v128_xor( k13, k12 );
    x = v128_xor( x, k13 );
-   x = v128_aesenc( x, zero );
+   x = v128_aesenc_nokey( x );
    p1 = v128_xor( p1, x );
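Every call site now uses v128_aesenc_nokey instead of passing a dummy zero round key, so the zero constant no longer has to live in a register across the whole function. The wrapper itself is defined in simd-utils and is not part of this diff; a plausible sketch of what it maps to, for orientation only:

   // x86-64: AESENC with an all-zero round key is one keyless AES round.
   #define v128_aesenc_nokey(x)  _mm_aesenc_si128( x, _mm_setzero_si128() )

   // AArch64: AESE xors the round key before SubBytes/ShiftRows, so a zero
   // key folds away there too; AESMC supplies the MixColumns step.
   #define v128_aesenc_nokey(x)  vaesmcq_u8( vaeseq_u8( x, vdupq_n_u8( 0 ) ) )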


@@ -12,23 +12,8 @@ uint32_t SIMD_IV_512[] __attribute__((aligned(64))) =
    0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8,
    0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
    0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4,
-   0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 };
+   0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
+};
-#if defined(__x86_64__)
-#define SHUFXOR_1 0xb1          // rev64_32
-#define SHUFXOR_2 0x4e          // rev64
-#define SHUFXOR_3 0x1b          // rev32
-#elif defined(__aarch64__)
-#define SHUFXOR_1(x) vrev64q_u32(x)
-#define SHUFXOR_2(x) v128_rev64(x)
-#define SHUFXOR_3(x) v128_rev64( v128_qrev32(x) )
-#else
-#endif
 #define CAT(x, y) x##y
 #define XCAT(x,y) CAT(x,y)
@@ -89,8 +74,8 @@ uint32_t SIMD_IV_512[] __attribute__((aligned(64))) =
 #define SUM7_65 4
 #define SUM7_66 5
-#define PERM( z, d, a, shufxor ) \
-   XCAT( PERM_, XCAT( SUM7_ ## z, PERM_START ) )( d, a, shufxor )
+#define PERM( p, z, d, a, shufxor ) \
+   XCAT( PERM_, XCAT( SUM7_ ## z, p ) )( d, a, shufxor )
 #define PERM_0( d, a, shufxor ) /* XOR 1 */ \
 do { \
@@ -188,16 +173,22 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
 #if defined(__x86_64__)
-#define shufxor(x,s) _mm_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
+#define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1)
+#define SHUFXOR_2(x) _mm_shuffle_epi32(x,0x4e)
+#define SHUFXOR_3(x) _mm_shuffle_epi32(x,0x1b)
 #elif defined(__aarch64__)
-#define shufxor(x,s) XCAT( SHUFXOR_, s )(x)
+#define SHUFXOR_1(x) vrev64q_u32(x)
+#define SHUFXOR_2(x) v128_rev64(x)
+#define SHUFXOR_3(x) v128_rev64(v128_qrev32(x))
 #else
-//#warning __FILE__ "Unknown or unsupported CPU architecture"
+//unknown or unsupported architecture
 #endif
+#define shufxor(x,s) XCAT(SHUFXOR_,s)(x)
 #define REDUCE(x) \
    v128_sub16( v128_and( x, v128_64( \
       0x00ff00ff00ff00ff ) ), v128_sra16( x, 8 ) )
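REDUCE is a partial reduction modulo 257, the prime the SIMD FFT works over: since 256 = -1 (mod 257), splitting a lane as x = 256*hi + lo gives x = lo - hi (mod 257). A scalar equivalent of one 16-bit lane, for illustration only:

   static inline int16_t reduce257( int16_t x )
   {
      // (x & 0xff) - (x >> 8) == lo - hi, congruent to x mod 257
      return (int16_t)( ( x & 0xff ) - ( x >> 8 ) );
   }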
@@ -513,7 +504,7 @@ static void ROUNDS512( uint32_t *state, const uint8_t *msg, uint16_t *fft )
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l) #define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h) #define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \ #define STEP_1_( a,b,c,d,w,fun,r,s,z,p ) \
do { \ do { \
TTl = Fl( a,b,c,fun ); \ TTl = Fl( a,b,c,fun ); \
TTh = Fh( a,b,c,fun ); \ TTh = Fh( a,b,c,fun ); \
@@ -525,10 +516,10 @@ do { \
TTh = v128_add32( TTh, w##h ); \ TTh = v128_add32( TTh, w##h ); \
TTl = v128_rol32( TTl, s ); \ TTl = v128_rol32( TTl, s ); \
TTh = v128_rol32( TTh, s ); \ TTh = v128_rol32( TTh, s ); \
PERM( z,d,a, shufxor ); \ PERM( p, z,d,a, shufxor ); \
} while(0) } while(0)
#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z ) #define STEP_1( a,b,c,d,w,fun,r,s,z,p ) STEP_1_( a,b,c,d,w,fun,r,s,z,p )
#define STEP_2_( a,b,c,d,w,fun,r,s ) \ #define STEP_2_( a,b,c,d,w,fun,r,s ) \
do { \ do { \
@@ -538,10 +529,10 @@ do { \
#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s ) #define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s )
#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \ #define STEP( a,b,c,d,w1,w2,fun,r,s,z,p ) \
do { \ do { \
register v128u32_t TTl, TTh, Wl=w1, Wh=w2; \ register v128u32_t TTl, TTh, Wl=w1, Wh=w2; \
STEP_1( a,b,c,d,W,fun,r,s,z ); \ STEP_1( a,b,c,d,W,fun,r,s,z,p ); \
STEP_2( a,b,c,d,W,fun,r,s ); \ STEP_2( a,b,c,d,W,fun,r,s ); \
} while(0); } while(0);
@@ -558,63 +549,45 @@ do { \
w##h = v128_mul16( w##h, code[z].v128 ); \ w##h = v128_mul16( w##h, code[z].v128 ); \
} while(0) } while(0)
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \ #define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z,p ) \
do { \ do { \
register v128u32_t W0l, W1l, W2l, W3l, TTl; \ register v128u32_t W0l, W1l, W2l, W3l, TTl; \
register v128u32_t W0h, W1h, W2h, W3h, TTh; \ register v128u32_t W0h, W1h, W2h, W3h, TTh; \
MSG( W0, h0, l0, u0, z ); \ MSG( W0, h0, l0, u0, z ); \
STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \ STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0, p ); \
MSG( W1, h1, l1, u1, z ); \ MSG( W1, h1, l1, u1, z ); \
STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \ STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \ STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1, p ); \
MSG( W2,h2,l2,u2,z ); \ MSG( W2,h2,l2,u2,z ); \
STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \ STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \ STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2, p ); \
MSG( W3,h3,l3,u3,z ); \ MSG( W3,h3,l3,u3,z ); \
STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \ STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \ STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3, p ); \
STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \ STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
} while(0) } while(0)
// 4 rounds with code 185 // 4 rounds with code 185
#define PERM_START 0 ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0, 0);
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0); ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0, 4);
#undef PERM_START ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0, 1);
#define PERM_START 4 ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0, 5);
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
// 4 rounds with code 233 // 4 rounds with code 233
#define PERM_START 2 ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1, 2);
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1); ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1, 6);
#undef PERM_START ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1, 3);
#define PERM_START 6 ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1, 0);
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
// 1 round as feed-forward // 1 round as feed-forward
#define PERM_START 4 STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0, 4 );
STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0 ); STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1, 4 );
STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 ); STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2, 4 );
STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 ); STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3, 4 );
STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3 );
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h; S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h; S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef PERM_START
#undef STEP_1 #undef STEP_1
#undef STEP_1_ #undef STEP_1_
#undef STEP_2 #undef STEP_2
@@ -732,6 +705,9 @@ int simd512( void *hashval, const void *data, int datalen )
#undef REDUCE_FULL_S #undef REDUCE_FULL_S
#undef DO_REDUCE_FULL_S #undef DO_REDUCE_FULL_S
#undef c1_16 #undef c1_16
#undef SHUFXOR_1
#undef SHUFXOR_2
#undef SHUFXOR_3
#endif #endif
@@ -820,118 +796,12 @@ static const m256_v16 FFT256_Twiddle[] =
-30, 55, -58, -65, -95, -40, -98, 94 }} -30, 55, -58, -65, -95, -40, -98, 94 }}
}; };
#if 0
// generic
#define SHUFXOR_1 0xb1 // 0b10110001
#define SHUFXOR_2 0x4e // 0b01001110
#define SHUFXOR_3 0x1b // 0b00011011
#define CAT(x, y) x##y #define SHUFXOR_1(x) _mm256_shuffle_epi32(x,0xb1)
#define XCAT(x,y) CAT(x,y) #define SHUFXOR_2(x) _mm256_shuffle_epi32(x,0x4e)
#define SHUFXOR_3(x) _mm256_shuffle_epi32(x,0x1b)
#define SUM7_00 0 #define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x)
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a,shufxor) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a,shufxor)
#define PERM_0(d,a,shufxor) /* XOR 1 */ \
do { \
d##l = shufxor( a##l, 1 ); \
d##h = shufxor( a##h, 1 ); \
} while(0)
#define PERM_1(d,a,shufxor) /* XOR 6 */ \
do { \
d##l = shufxor( a##h, 2 ); \
d##h = shufxor( a##l, 2 ); \
} while(0)
#define PERM_2(d,a,shufxor) /* XOR 2 */ \
do { \
d##l = shufxor( a##l, 2 ); \
d##h = shufxor( a##h, 2 ); \
} while(0)
#define PERM_3(d,a,shufxor) /* XOR 3 */ \
do { \
d##l = shufxor( a##l, 3 ); \
d##h = shufxor( a##h, 3 ); \
} while(0)
#define PERM_4(d,a,shufxor) /* XOR 5 */ \
do { \
d##l = shufxor( a##h, 1 ); \
d##h = shufxor( a##l, 1 ); \
} while(0)
#define PERM_5(d,a,shufxor) /* XOR 7 */ \
do { \
d##l = shufxor( a##h, 3 ); \
d##h = shufxor( a##l, 3 ); \
} while(0)
#define PERM_6(d,a,shufxor) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#endif
#define shufxor2w(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
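The refactor above replaces the #define PERM_START / #undef churn with an explicit permutation selector p threaded through STEP and ROUND; XCAT expands its arguments before pasting, so a literal selector resolves to one concrete SHUFXOR_n shuffle during preprocessing. A minimal standalone sketch of that dispatch pattern (names here are illustrative, not the miner's):

#include <stdio.h>

#define CAT(x,y)  x##y
#define XCAT(x,y) CAT(x,y)          /* expand args, then paste */

#define OP_1(x) ((x) + 1)
#define OP_2(x) ((x) * 2)
#define dispatch(x,s) XCAT(OP_,s)(x)  /* literal s picks OP_s at compile time */

int main(void)
{
    printf( "%d %d\n", dispatch( 5, 1 ), dispatch( 5, 2 ) );  /* prints 6 10 */
    return 0;
}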
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
//TODO Enable for AVX10_256 //TODO Enable for AVX10_256
@@ -1262,7 +1132,7 @@ static void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l) #define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h) #define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \ #define STEP_1_(a,b,c,d,w,fun,r,s,z,p ) \
do { \ do { \
TTl = Fl( a,b,c,fun ); \ TTl = Fl( a,b,c,fun ); \
TTh = Fh( a,b,c,fun ); \ TTh = Fh( a,b,c,fun ); \
@@ -1274,10 +1144,10 @@ do { \
TTh = _mm256_add_epi32( TTh, w##h ); \ TTh = _mm256_add_epi32( TTh, w##h ); \
TTl = mm256_rol_32( TTl, s ); \ TTl = mm256_rol_32( TTl, s ); \
TTh = mm256_rol_32( TTh, s ); \ TTh = mm256_rol_32( TTh, s ); \
PERM( z,d,a, shufxor2w ); \ PERM( p,z,d,a, shufxor2w ); \
} while(0) } while(0)
#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z ) #define STEP_1( a,b,c,d,w,fun,r,s,z,p ) STEP_1_( a,b,c,d,w,fun,r,s,z,p )
#define STEP_2_( a,b,c,d,w,fun,r,s ) \ #define STEP_2_( a,b,c,d,w,fun,r,s ) \
do { \ do { \
@@ -1287,10 +1157,10 @@ do { \
#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s ) #define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s )
#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \ #define STEP( a,b,c,d,w1,w2,fun,r,s,z, p ) \
do { \ do { \
register __m256i TTl, TTh, Wl=w1, Wh=w2; \ register __m256i TTl, TTh, Wl=w1, Wh=w2; \
STEP_1( a,b,c,d,W,fun,r,s,z ); \ STEP_1( a,b,c,d,W,fun,r,s,z,p ); \
STEP_2( a,b,c,d,W,fun,r,s ); \ STEP_2( a,b,c,d,W,fun,r,s ); \
} while(0); } while(0);
@@ -1307,63 +1177,45 @@ do { \
w##h = _mm256_mullo_epi16( w##h, code[z].v256 ); \ w##h = _mm256_mullo_epi16( w##h, code[z].v256 ); \
} while(0) } while(0)
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \ #define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z,p ) \
do { \ do { \
register __m256i W0l, W1l, W2l, W3l, TTl; \ register __m256i W0l, W1l, W2l, W3l, TTl; \
register __m256i W0h, W1h, W2h, W3h, TTh; \ register __m256i W0h, W1h, W2h, W3h, TTh; \
MSG( W0, h0, l0, u0, z ); \ MSG( W0, h0, l0, u0, z ); \
STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \ STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0, p ); \
MSG( W1, h1, l1, u1, z ); \ MSG( W1, h1, l1, u1, z ); \
STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \ STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \ STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1, p ); \
MSG( W2,h2,l2,u2,z ); \ MSG( W2,h2,l2,u2,z ); \
STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \ STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \ STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2, p ); \
MSG( W3,h3,l3,u3,z ); \ MSG( W3,h3,l3,u3,z ); \
STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \ STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \ STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3, p ); \
STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \ STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
} while(0) } while(0)
// 4 rounds with code 185 // 4 rounds with code 185
#define PERM_START 0 ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0, 0);
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0); ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0, 4);
#undef PERM_START ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0, 1);
#define PERM_START 4 ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0, 5);
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
// 4 rounds with code 233 // 4 rounds with code 233
#define PERM_START 2 ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1, 2);
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1); ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1, 6);
#undef PERM_START ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1, 3);
#define PERM_START 6 ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1, 0);
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
// 1 round as feed-forward // 1 round as feed-forward
#define PERM_START 4 STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0, 4 );
STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0 ); STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1, 4 );
STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 ); STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2, 4 );
STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 ); STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3, 4 );
STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3 );
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h; S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h; S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef PERM_START
#undef STEP_1 #undef STEP_1
#undef STEP_1_ #undef STEP_1_
#undef STEP_2 #undef STEP_2
@@ -1642,6 +1494,10 @@ int simd512_2way( void *hashval, const void *data, int datalen )
return 0; return 0;
} }
#undef SHUFXOR_1
#undef SHUFXOR_2
#undef SHUFXOR_3
#endif #endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -1792,7 +1648,11 @@ static const m512_v16 FFT256_Twiddle4w[] =
-30, 55, -58, -65, -95, -40, -98, 94 }} -30, 55, -58, -65, -95, -40, -98, 94 }}
}; };
#define shufxor4w(x,s) _mm512_shuffle_epi32( x, XCAT( SHUFXOR_, s )) #define SHUFXOR_1(x) _mm512_shuffle_epi32(x,0xb1)
#define SHUFXOR_2(x) _mm512_shuffle_epi32(x,0x4e)
#define SHUFXOR_3(x) _mm512_shuffle_epi32(x,0x1b)
#define shufxor4w(x,s) XCAT(SHUFXOR_,s)(x)
#define REDUCE4w(x) \ #define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \ _mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
@@ -2114,7 +1974,7 @@ static void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
// targeted // targeted
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \ #define STEP_1_( a,b,c,d,w,fun,r,s,z,p ) \
do { \ do { \
TTl = Fl( a,b,c,fun ); \ TTl = Fl( a,b,c,fun ); \
TTh = Fh( a,b,c,fun ); \ TTh = Fh( a,b,c,fun ); \
@@ -2126,10 +1986,10 @@ do { \
TTh = _mm512_add_epi32( TTh, w##h ); \ TTh = _mm512_add_epi32( TTh, w##h ); \
TTl = mm512_rol_32( TTl, s ); \ TTl = mm512_rol_32( TTl, s ); \
TTh = mm512_rol_32( TTh, s ); \ TTh = mm512_rol_32( TTh, s ); \
PERM( z,d,a, shufxor4w ); \ PERM( p,z,d,a, shufxor4w ); \
} while(0) } while(0)
#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z ) #define STEP_1( a,b,c,d,w,fun,r,s,z,p ) STEP_1_( a,b,c,d,w,fun,r,s,z,p )
#define STEP_2_( a,b,c,d,w,fun,r,s ) \ #define STEP_2_( a,b,c,d,w,fun,r,s ) \
do { \ do { \
@@ -2139,10 +1999,10 @@ do { \
#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s ) #define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s )
#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \ #define STEP( a,b,c,d,w1,w2,fun,r,s,z,p ) \
do { \ do { \
register __m512i TTl, TTh, Wl=w1, Wh=w2; \ register __m512i TTl, TTh, Wl=w1, Wh=w2; \
STEP_1( a,b,c,d,W,fun,r,s,z ); \ STEP_1( a,b,c,d,W,fun,r,s,z,p ); \
STEP_2( a,b,c,d,W,fun,r,s ); \ STEP_2( a,b,c,d,W,fun,r,s ); \
} while(0); } while(0);
@@ -2159,63 +2019,45 @@ do { \
w##h = _mm512_mullo_epi16( w##h, code[z].v512 ); \ w##h = _mm512_mullo_epi16( w##h, code[z].v512 ); \
} while(0) } while(0)
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \ #define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z,p ) \
do { \ do { \
register __m512i W0l, W1l, W2l, W3l, TTl; \ register __m512i W0l, W1l, W2l, W3l, TTl; \
register __m512i W0h, W1h, W2h, W3h, TTh; \ register __m512i W0h, W1h, W2h, W3h, TTh; \
MSG( W0, h0, l0, u0, z ); \ MSG( W0, h0, l0, u0, z ); \
STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \ STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0,p ); \
MSG( W1, h1, l1, u1, z ); \ MSG( W1, h1, l1, u1, z ); \
STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \ STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \ STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1,p ); \
MSG( W2,h2,l2,u2,z ); \ MSG( W2,h2,l2,u2,z ); \
STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \ STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \ STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2,p ); \
MSG( W3,h3,l3,u3,z ); \ MSG( W3,h3,l3,u3,z ); \
STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \ STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \ STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3,p ); \
STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \ STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
} while(0) } while(0)
// 4 rounds with code 185 // 4 rounds with code 185
#define PERM_START 0 ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0, 0);
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0); ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0, 4);
#undef PERM_START ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0, 1);
#define PERM_START 4 ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0, 5);
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
// 4 rounds with code 233 // 4 rounds with code 233
#define PERM_START 2 ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1, 2);
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1); ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1, 6);
#undef PERM_START ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1, 3);
#define PERM_START 6 ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1, 0);
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
// 1 round as feed-forward // 1 round as feed-forward
#define PERM_START 4 STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0, 4 );
STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0 ); STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1, 4 );
STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 ); STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2, 4 );
STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 ); STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3, 4 );
STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3 );
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h; S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h; S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef PERM_START
#undef STEP_1 #undef STEP_1
#undef STEP_1_ #undef STEP_1_
#undef STEP_2 #undef STEP_2


@@ -951,7 +951,7 @@ union _x17_context_overlay
#else #else
sph_groestl512_context groestl; sph_groestl512_context groestl;
#endif #endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES) #if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo; hashState_echo echo;
#else #else
sph_echo512_context echo; sph_echo512_context echo;
@@ -1045,7 +1045,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_simd512_close( &ctx.simd, hash1 ); sph_simd512_close( &ctx.simd, hash1 );
#endif #endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES) #if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, hash0, 64 ); echo_full( &ctx.echo, hash0, 512, hash0, 64 );
echo_full( &ctx.echo, hash1, 512, hash1, 64 ); echo_full( &ctx.echo, hash1, 512, hash1, 64 );
#else #else

api.c

@@ -8,6 +8,7 @@
* Software Foundation; either version 2 of the License, or (at your option) * Software Foundation; either version 2 of the License, or (at your option)
* any later version. See COPYING for more details. * any later version. See COPYING for more details.
*/ */
#define APIVERSION "1.0" #define APIVERSION "1.0"
#ifdef WIN32 #ifdef WIN32
@@ -27,9 +28,9 @@
#include <math.h> #include <math.h>
#include <stdarg.h> #include <stdarg.h>
#include <assert.h> #include <assert.h>
#include <openssl/sha.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include "algo/sha/sha1-hash.h"
#include "miner.h" #include "miner.h"
#include "sysinfos.c" #include "sysinfos.c"
@@ -208,7 +209,7 @@ static char *remote_seturl(char *params)
return buffer; return buffer;
} }
/** /*-hash*
* Ask the miner to quit * Ask the miner to quit
*/ */
static char *remote_quit(char *params) static char *remote_quit(char *params)
@@ -336,7 +337,6 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
char inpkey[128] = { 0 }; char inpkey[128] = { 0 };
char seckey[64]; char seckey[64];
uchar sha1[20]; uchar sha1[20];
// SHA_CTX ctx;
if (opt_protocol) if (opt_protocol)
applog(LOG_DEBUG, "clientkey: %s", clientkey); applog(LOG_DEBUG, "clientkey: %s", clientkey);
@@ -346,11 +346,7 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
// SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo=" // SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo="
//sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11"); //sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11");
SHA1( inpkey, strlen(inpkey), sha1 ); sph_sha1_full( sha1, inpkey, strlen(inpkey) );
// Deprecated in openssl-3
// SHA1_Init(&ctx);
// SHA1_Update(&ctx, inpkey, strlen(inpkey));
// SHA1_Final(sha1, &ctx);
base64_encode(sha1, 20, seckey, sizeof(seckey)); base64_encode(sha1, 20, seckey, sizeof(seckey));
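Taken together, this hunk computes the standard RFC 6455 accept key with the in-tree SHA-1 in place of OpenSSL. A hedged sketch of the full computation, wrapping the calls shown above (the wrapper function itself is illustrative; sph_sha1_full and base64_encode are the codebase calls, and the surrounding api.c includes are assumed):

static void ws_accept_key( char *seckey, size_t seckey_len,
                           const char *clientkey )
{
    char inpkey[128] = { 0 };
    unsigned char sha1[20];
    /* client key concatenated with the fixed RFC 6455 GUID */
    snprintf( inpkey, sizeof(inpkey), "%s%s", clientkey,
              "258EAFA5-E914-47DA-95CA-C5AB0DC85B11" );
    sph_sha1_full( sha1, inpkey, strlen(inpkey) );   /* was OpenSSL SHA1() */
    base64_encode( sha1, 20, seckey, seckey_len );
    /* RFC 6455 test vector: client key "dGhlIHNhbXBsZSBub25jZQ=="
       must yield "s3pPLMBiTxaQ9kYGzzhZRbK+xOo=" */
}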
@@ -733,3 +729,4 @@ void *api_thread(void *userdata)
return NULL; return NULL;
} }


@@ -40,4 +40,3 @@ rm -f config.status
CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl
make -j $nproc make -j $nproc
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,830 +0,0 @@
/*
* Copyright 2011-2012, 2014 pooler@litecoinpool.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <cpuminer-config.h>
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#if defined(USE_ASM) && defined(__i386__)
.macro scrypt_shuffle src, so, dest, do
movl \so+60(\src), %eax
movl \so+44(\src), %ebx
movl \so+28(\src), %ecx
movl \so+12(\src), %edx
movl %eax, \do+12(\dest)
movl %ebx, \do+28(\dest)
movl %ecx, \do+44(\dest)
movl %edx, \do+60(\dest)
movl \so+40(\src), %eax
movl \so+8(\src), %ebx
movl \so+48(\src), %ecx
movl \so+16(\src), %edx
movl %eax, \do+8(\dest)
movl %ebx, \do+40(\dest)
movl %ecx, \do+16(\dest)
movl %edx, \do+48(\dest)
movl \so+20(\src), %eax
movl \so+4(\src), %ebx
movl \so+52(\src), %ecx
movl \so+36(\src), %edx
movl %eax, \do+4(\dest)
movl %ebx, \do+20(\dest)
movl %ecx, \do+36(\dest)
movl %edx, \do+52(\dest)
movl \so+0(\src), %eax
movl \so+24(\src), %ebx
movl \so+32(\src), %ecx
movl \so+56(\src), %edx
movl %eax, \do+0(\dest)
movl %ebx, \do+24(\dest)
movl %ecx, \do+32(\dest)
movl %edx, \do+56(\dest)
.endm
.macro salsa8_core_gen_quadround
movl 52(%esp), %ecx
movl 4(%esp), %edx
movl 20(%esp), %ebx
movl 8(%esp), %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 4(%esp)
movl 36(%esp), %edi
leal (%edx, %ebx), %ebp
roll $9, %ebp
xorl %ebp, %edi
movl 24(%esp), %ebp
movl %edi, 8(%esp)
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 40(%esp), %ebx
movl %ecx, 20(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 24(%esp)
movl 56(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 36(%esp)
movl 28(%esp), %ecx
movl %edx, 28(%esp)
movl 44(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 60(%esp), %ebx
movl %esi, 40(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 44(%esp)
movl 12(%esp), %edi
xorl %esi, %ebp
leal (%edx, %ebx), %esi
roll $9, %esi
xorl %esi, %edi
movl %edi, 12(%esp)
movl 48(%esp), %esi
movl %ebp, 48(%esp)
movl 64(%esp), %ebp
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl 32(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 32(%esp)
movl %ebx, %ecx
movl %edx, 52(%esp)
movl 28(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 40(%esp), %ebx
movl %esi, 28(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 40(%esp)
movl 12(%esp), %edi
xorl %esi, %ebp
leal (%edx, %ebx), %esi
roll $9, %esi
xorl %esi, %edi
movl %edi, 12(%esp)
movl 4(%esp), %esi
movl %ebp, 4(%esp)
movl 48(%esp), %ebp
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 48(%esp)
movl 32(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 32(%esp)
movl 24(%esp), %ecx
movl %edx, 24(%esp)
movl 52(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 28(%esp), %ebx
movl %esi, 28(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 52(%esp)
movl 8(%esp), %edi
xorl %esi, %ebp
leal (%edx, %ebx), %esi
roll $9, %esi
xorl %esi, %edi
movl %edi, 8(%esp)
movl 44(%esp), %esi
movl %ebp, 44(%esp)
movl 4(%esp), %ebp
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 20(%esp), %ebx
movl %ecx, 4(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl 36(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 20(%esp)
movl %ebx, %ecx
movl %edx, 36(%esp)
movl 24(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 28(%esp), %ebx
movl %esi, 24(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 28(%esp)
xorl %esi, %ebp
movl 8(%esp), %esi
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl 40(%esp), %edi
movl %ebp, 8(%esp)
movl 44(%esp), %ebp
movl %esi, 40(%esp)
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 4(%esp), %ebx
movl %ecx, 44(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 4(%esp)
movl 20(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 56(%esp)
movl 48(%esp), %ecx
movl %edx, 20(%esp)
movl 36(%esp), %edx
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %edi
movl 24(%esp), %ebx
movl %edi, 24(%esp)
addl %esi, %edi
roll $18, %edi
leal (%ecx, %edx), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 60(%esp)
movl 12(%esp), %esi
xorl %edi, %ebp
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl %esi, 12(%esp)
movl 52(%esp), %edi
movl %ebp, 36(%esp)
movl 8(%esp), %ebp
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl 32(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 32(%esp)
movl %ebx, %ecx
movl %edx, 48(%esp)
movl 20(%esp), %edx
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %edi
movl 24(%esp), %ebx
movl %edi, 20(%esp)
addl %esi, %edi
roll $18, %edi
leal (%ecx, %edx), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 8(%esp)
movl 12(%esp), %esi
xorl %edi, %ebp
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl %esi, 12(%esp)
movl 28(%esp), %edi
movl %ebp, 52(%esp)
movl 36(%esp), %ebp
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 28(%esp)
movl 32(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 32(%esp)
movl 4(%esp), %ecx
movl %edx, 4(%esp)
movl 48(%esp), %edx
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %edi
movl 20(%esp), %ebx
movl %edi, 20(%esp)
addl %esi, %edi
roll $18, %edi
leal (%ecx, %edx), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 48(%esp)
movl 40(%esp), %esi
xorl %edi, %ebp
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl %esi, 36(%esp)
movl 60(%esp), %edi
movl %ebp, 24(%esp)
movl 52(%esp), %ebp
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 44(%esp), %ebx
movl %ecx, 40(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 52(%esp)
movl 56(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 56(%esp)
addl %esi, %ebx
movl %edx, 44(%esp)
roll $13, %ebx
xorl %ebx, %edi
movl %edi, 60(%esp)
addl %esi, %edi
roll $18, %edi
xorl %edi, %ebp
movl %ebp, 64(%esp)
.endm
.text
.p2align 5
salsa8_core_gen:
salsa8_core_gen_quadround
salsa8_core_gen_quadround
ret
.text
.p2align 5
.globl scrypt_core
.globl _scrypt_core
scrypt_core:
_scrypt_core:
pushl %ebx
pushl %ebp
pushl %edi
pushl %esi
/* Check for SSE2 availability */
movl $1, %eax
cpuid
andl $0x04000000, %edx
jnz scrypt_core_sse2
scrypt_core_gen:
movl 20(%esp), %edi
movl 24(%esp), %esi
movl 28(%esp), %ecx
subl $72, %esp
.macro scrypt_core_macro1a p, q
movl \p(%edi), %eax
movl \q(%edi), %edx
movl %eax, \p(%esi)
movl %edx, \q(%esi)
xorl %edx, %eax
movl %eax, \p(%edi)
movl %eax, \p(%esp)
.endm
.macro scrypt_core_macro1b p, q
movl \p(%edi), %eax
xorl \p(%esi, %edx), %eax
movl \q(%edi), %ebx
xorl \q(%esi, %edx), %ebx
movl %ebx, \q(%edi)
xorl %ebx, %eax
movl %eax, \p(%edi)
movl %eax, \p(%esp)
.endm
.macro scrypt_core_macro2 p, q
movl \p(%esp), %eax
addl \p(%edi), %eax
movl %eax, \p(%edi)
xorl \q(%edi), %eax
movl %eax, \q(%edi)
movl %eax, \p(%esp)
.endm
.macro scrypt_core_macro3 p, q
movl \p(%esp), %eax
addl \q(%edi), %eax
movl %eax, \q(%edi)
.endm
shll $7, %ecx
addl %esi, %ecx
scrypt_core_gen_loop1:
movl %esi, 64(%esp)
movl %ecx, 68(%esp)
scrypt_core_macro1a 0, 64
scrypt_core_macro1a 4, 68
scrypt_core_macro1a 8, 72
scrypt_core_macro1a 12, 76
scrypt_core_macro1a 16, 80
scrypt_core_macro1a 20, 84
scrypt_core_macro1a 24, 88
scrypt_core_macro1a 28, 92
scrypt_core_macro1a 32, 96
scrypt_core_macro1a 36, 100
scrypt_core_macro1a 40, 104
scrypt_core_macro1a 44, 108
scrypt_core_macro1a 48, 112
scrypt_core_macro1a 52, 116
scrypt_core_macro1a 56, 120
scrypt_core_macro1a 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
scrypt_core_macro2 0, 64
scrypt_core_macro2 4, 68
scrypt_core_macro2 8, 72
scrypt_core_macro2 12, 76
scrypt_core_macro2 16, 80
scrypt_core_macro2 20, 84
scrypt_core_macro2 24, 88
scrypt_core_macro2 28, 92
scrypt_core_macro2 32, 96
scrypt_core_macro2 36, 100
scrypt_core_macro2 40, 104
scrypt_core_macro2 44, 108
scrypt_core_macro2 48, 112
scrypt_core_macro2 52, 116
scrypt_core_macro2 56, 120
scrypt_core_macro2 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
scrypt_core_macro3 0, 64
scrypt_core_macro3 4, 68
scrypt_core_macro3 8, 72
scrypt_core_macro3 12, 76
scrypt_core_macro3 16, 80
scrypt_core_macro3 20, 84
scrypt_core_macro3 24, 88
scrypt_core_macro3 28, 92
scrypt_core_macro3 32, 96
scrypt_core_macro3 36, 100
scrypt_core_macro3 40, 104
scrypt_core_macro3 44, 108
scrypt_core_macro3 48, 112
scrypt_core_macro3 52, 116
scrypt_core_macro3 56, 120
scrypt_core_macro3 60, 124
movl 64(%esp), %esi
movl 68(%esp), %ecx
addl $128, %esi
cmpl %ecx, %esi
jne scrypt_core_gen_loop1
movl 96(%esp), %esi
movl 100(%esp), %ecx
movl %ecx, %eax
subl $1, %eax
movl %eax, 100(%esp)
scrypt_core_gen_loop2:
movl %ecx, 68(%esp)
movl 64(%edi), %edx
andl 100(%esp), %edx
shll $7, %edx
scrypt_core_macro1b 0, 64
scrypt_core_macro1b 4, 68
scrypt_core_macro1b 8, 72
scrypt_core_macro1b 12, 76
scrypt_core_macro1b 16, 80
scrypt_core_macro1b 20, 84
scrypt_core_macro1b 24, 88
scrypt_core_macro1b 28, 92
scrypt_core_macro1b 32, 96
scrypt_core_macro1b 36, 100
scrypt_core_macro1b 40, 104
scrypt_core_macro1b 44, 108
scrypt_core_macro1b 48, 112
scrypt_core_macro1b 52, 116
scrypt_core_macro1b 56, 120
scrypt_core_macro1b 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
scrypt_core_macro2 0, 64
scrypt_core_macro2 4, 68
scrypt_core_macro2 8, 72
scrypt_core_macro2 12, 76
scrypt_core_macro2 16, 80
scrypt_core_macro2 20, 84
scrypt_core_macro2 24, 88
scrypt_core_macro2 28, 92
scrypt_core_macro2 32, 96
scrypt_core_macro2 36, 100
scrypt_core_macro2 40, 104
scrypt_core_macro2 44, 108
scrypt_core_macro2 48, 112
scrypt_core_macro2 52, 116
scrypt_core_macro2 56, 120
scrypt_core_macro2 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
movl 96(%esp), %esi
scrypt_core_macro3 0, 64
scrypt_core_macro3 4, 68
scrypt_core_macro3 8, 72
scrypt_core_macro3 12, 76
scrypt_core_macro3 16, 80
scrypt_core_macro3 20, 84
scrypt_core_macro3 24, 88
scrypt_core_macro3 28, 92
scrypt_core_macro3 32, 96
scrypt_core_macro3 36, 100
scrypt_core_macro3 40, 104
scrypt_core_macro3 44, 108
scrypt_core_macro3 48, 112
scrypt_core_macro3 52, 116
scrypt_core_macro3 56, 120
scrypt_core_macro3 60, 124
movl 68(%esp), %ecx
subl $1, %ecx
ja scrypt_core_gen_loop2
addl $72, %esp
popl %esi
popl %edi
popl %ebp
popl %ebx
ret
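The generic path above is the classic scrypt ROMix core: loop1 fills the scratchpad, loop2 reads it back at a data-dependent index (the andl against N-1 on the word at byte offset 64, i.e. X[16]). A scalar C sketch under those assumptions, with a hypothetical xor_salsa8( uint32_t B[16], const uint32_t Bx[16] ) helper standing in for the salsa8_core_gen calls (assumes <stdint.h> and <string.h>):

static void scrypt_core_sketch( uint32_t X[32], uint32_t *V, uint32_t N )
{
    for ( uint32_t i = 0; i < N; i++ )
    {
        memcpy( &V[ i*32 ], X, 128 );     /* loop1: fill the scratchpad */
        xor_salsa8( &X[ 0], &X[16] );     /* salsa8 mix, first half     */
        xor_salsa8( &X[16], &X[ 0] );     /* salsa8 mix, second half    */
    }
    for ( uint32_t i = 0; i < N; i++ )
    {
        uint32_t j = X[16] & ( N - 1 );   /* loop2: data-dependent read */
        for ( int k = 0; k < 32; k++ ) X[k] ^= V[ j*32 + k ];
        xor_salsa8( &X[ 0], &X[16] );
        xor_salsa8( &X[16], &X[ 0] );
    }
}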
.macro salsa8_core_sse2_doubleround
movdqa %xmm1, %xmm4
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm3
movdqa %xmm0, %xmm4
pxor %xmm5, %xmm3
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm3, %xmm4
pxor %xmm5, %xmm2
pshufd $0x93, %xmm3, %xmm3
paddd %xmm2, %xmm4
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm1
movdqa %xmm2, %xmm4
pxor %xmm5, %xmm1
pshufd $0x4e, %xmm2, %xmm2
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
movdqa %xmm3, %xmm4
pxor %xmm5, %xmm0
pshufd $0x39, %xmm1, %xmm1
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm1
movdqa %xmm0, %xmm4
pxor %xmm5, %xmm1
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm1, %xmm4
pxor %xmm5, %xmm2
pshufd $0x93, %xmm1, %xmm1
paddd %xmm2, %xmm4
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm3
movdqa %xmm2, %xmm4
pxor %xmm5, %xmm3
pshufd $0x4e, %xmm2, %xmm2
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
pshufd $0x39, %xmm3, %xmm3
pxor %xmm5, %xmm0
.endm
.macro salsa8_core_sse2
salsa8_core_sse2_doubleround
salsa8_core_sse2_doubleround
salsa8_core_sse2_doubleround
salsa8_core_sse2_doubleround
.endm
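Each pslld/psrld/pxor triple in the double-round macro is one rotate-and-xor step of Salsa20's quarter-round; the shift pairs (7/25, 9/23, 13/19, 18/14) are the four rotation counts. For orientation, the scalar form of one column quarter-round from the Salsa20 reference (x is a uint32_t[16] state):

#define R(a,b) ( ((a) << (b)) | ((a) >> (32 - (b))) )
x[ 4] ^= R( x[ 0] + x[12],  7 );
x[ 8] ^= R( x[ 4] + x[ 0],  9 );
x[12] ^= R( x[ 8] + x[ 4], 13 );
x[ 0] ^= R( x[12] + x[ 8], 18 );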
.p2align 5
scrypt_core_sse2:
movl 20(%esp), %edi
movl 24(%esp), %esi
movl %esp, %ebp
subl $128, %esp
andl $-16, %esp
scrypt_shuffle %edi, 0, %esp, 0
scrypt_shuffle %edi, 64, %esp, 64
movdqa 96(%esp), %xmm6
movdqa 112(%esp), %xmm7
movl %esi, %edx
movl 28(%ebp), %ecx
shll $7, %ecx
addl %esi, %ecx
scrypt_core_sse2_loop1:
movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1
movdqa 32(%esp), %xmm2
movdqa 48(%esp), %xmm3
movdqa 64(%esp), %xmm4
movdqa 80(%esp), %xmm5
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
movdqa %xmm0, 0(%edx)
movdqa %xmm1, 16(%edx)
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm2, 32(%edx)
movdqa %xmm3, 48(%edx)
movdqa %xmm4, 64(%edx)
movdqa %xmm5, 80(%edx)
movdqa %xmm6, 96(%edx)
movdqa %xmm7, 112(%edx)
salsa8_core_sse2
paddd 0(%edx), %xmm0
paddd 16(%edx), %xmm1
paddd 32(%edx), %xmm2
paddd 48(%edx), %xmm3
movdqa %xmm0, 0(%esp)
movdqa %xmm1, 16(%esp)
movdqa %xmm2, 32(%esp)
movdqa %xmm3, 48(%esp)
pxor 64(%esp), %xmm0
pxor 80(%esp), %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
movdqa %xmm2, %xmm6
movdqa %xmm3, %xmm7
salsa8_core_sse2
paddd 64(%esp), %xmm0
paddd 80(%esp), %xmm1
paddd %xmm2, %xmm6
paddd %xmm3, %xmm7
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
addl $128, %edx
cmpl %ecx, %edx
jne scrypt_core_sse2_loop1
movdqa 64(%esp), %xmm4
movdqa 80(%esp), %xmm5
movl 28(%ebp), %ecx
movl %ecx, %eax
subl $1, %eax
scrypt_core_sse2_loop2:
movd %xmm4, %edx
movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1
movdqa 32(%esp), %xmm2
movdqa 48(%esp), %xmm3
andl %eax, %edx
shll $7, %edx
pxor 0(%esi, %edx), %xmm0
pxor 16(%esi, %edx), %xmm1
pxor 32(%esi, %edx), %xmm2
pxor 48(%esi, %edx), %xmm3
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
movdqa %xmm0, 0(%esp)
movdqa %xmm1, 16(%esp)
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm2, 32(%esp)
movdqa %xmm3, 48(%esp)
salsa8_core_sse2
paddd 0(%esp), %xmm0
paddd 16(%esp), %xmm1
paddd 32(%esp), %xmm2
paddd 48(%esp), %xmm3
movdqa %xmm0, 0(%esp)
movdqa %xmm1, 16(%esp)
movdqa %xmm2, 32(%esp)
movdqa %xmm3, 48(%esp)
pxor 64(%esi, %edx), %xmm0
pxor 80(%esi, %edx), %xmm1
pxor 96(%esi, %edx), %xmm2
pxor 112(%esi, %edx), %xmm3
pxor 64(%esp), %xmm0
pxor 80(%esp), %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
movdqa %xmm2, %xmm6
movdqa %xmm3, %xmm7
salsa8_core_sse2
paddd 64(%esp), %xmm0
paddd 80(%esp), %xmm1
paddd %xmm2, %xmm6
paddd %xmm3, %xmm7
movdqa %xmm0, %xmm4
movdqa %xmm1, %xmm5
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
subl $1, %ecx
ja scrypt_core_sse2_loop2
movdqa %xmm6, 96(%esp)
movdqa %xmm7, 112(%esp)
scrypt_shuffle %esp, 0, %edi, 0
scrypt_shuffle %esp, 64, %edi, 64
movl %ebp, %esp
popl %esi
popl %edi
popl %ebp
popl %ebx
ret
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

configure (vendored)

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.7. # Generated by GNU Autoconf 2.71 for cpuminer-opt 23.8.
# #
# #
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='23.7' PACKAGE_VERSION='23.8'
PACKAGE_STRING='cpuminer-opt 23.7' PACKAGE_STRING='cpuminer-opt 23.8'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 23.7 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 23.8 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 23.7:";; short | recursive ) echo "Configuration of cpuminer-opt 23.8:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 23.7 cpuminer-opt configure 23.8
generated by GNU Autoconf 2.71 generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc. Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 23.7, which was It was created by cpuminer-opt $as_me 23.8, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='23.7' VERSION='23.8'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 23.7, which was This file was extended by cpuminer-opt $as_me 23.8, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped' ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 23.7 cpuminer-opt config.status 23.8
configured by $0, generated by GNU Autoconf 2.71, configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"


@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [23.7]) AC_INIT([cpuminer-opt], [23.8])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

configure~

File diff suppressed because it is too large


@@ -36,7 +36,7 @@
#include <memory.h> #include <memory.h>
#include <curl/curl.h> #include <curl/curl.h>
#include <jansson.h> #include <jansson.h>
#include <openssl/sha.h> //#include <openssl/sha.h>
//#include <mm_malloc.h> //#include <mm_malloc.h>
#include "sysinfos.c" #include "sysinfos.c"
#include "algo/sha/sha256d.h" #include "algo/sha/sha256d.h"
@@ -1967,18 +1967,6 @@ void sha256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
sha256d( merkle_root, merkle_root, 64 ); sha256d( merkle_root, merkle_root, 64 );
} }
} }
/*
// OpenSSL single sha256, deprecated
void SHA256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
{
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
for ( int i = 0; i < sctx->job.merkle_count; i++ )
{
memcpy( merkle_root + 32, sctx->job.merkle[i], 32 );
sha256d( merkle_root, merkle_root, 64 );
}
}
*/
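The surviving sha256_gen_merkle_root keeps the same fold, just seeded without OpenSSL. A hedged sketch of that shape: only the sha256d fold is confirmed by the hunk above; sha256_full is an illustrative stand-in for an in-tree single-SHA256 helper replacing the removed SHA256() call.

sha256_full( merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size );
for ( int i = 0; i < sctx->job.merkle_count; i++ )
{
    memcpy( merkle_root + 32, sctx->job.merkle[i], 32 );  /* append branch */
    sha256d( merkle_root, merkle_root, 64 );              /* double-SHA256 fold */
}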
// Default is do_nothing (assumed LE) // Default is do_nothing (assumed LE)
void set_work_data_big_endian( struct work *work ) void set_work_data_big_endian( struct work *work )
@@ -2212,8 +2200,8 @@ static void *miner_thread( void *userdata )
// int64_t max64 = 1000; // int64_t max64 = 1000;
int nonce_found = 0; int nonce_found = 0;
if ( likely( algo_gate.do_this_thread( thr_id ) ) ) // if ( likely( algo_gate.do_this_thread( thr_id ) ) )
{ // {
if ( have_stratum ) if ( have_stratum )
{ {
while ( unlikely( stratum_down ) ) while ( unlikely( stratum_down ) )
@@ -2262,8 +2250,8 @@ static void *miner_thread( void *userdata )
pthread_rwlock_unlock( &g_work_lock ); pthread_rwlock_unlock( &g_work_lock );
} // do_this_thread // } // do_this_thread
algo_gate.resync_threads( thr_id, &work ); // algo_gate.resync_threads( thr_id, &work );
// conditional mining // conditional mining
if ( unlikely( !wanna_mine( thr_id ) ) ) if ( unlikely( !wanna_mine( thr_id ) ) )
@@ -3685,8 +3673,8 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0);
#include "simd-utils.h" #include "simd-utils.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/sph_hamsi.h" #include "compat/aes_helper.c"
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {

miner.h

@@ -1,38 +1,41 @@
#ifndef __MINER_H__ #ifndef MINER_H__
#define __MINER_H__ #define MINER_H__
#include <cpuminer-config.h> #include <cpuminer-config.h>
#if defined(__x86_64__) #if defined(__x86_64__)
#define USER_AGENT_ARCH "x64" #define USER_AGENT_ARCH "x64" // Intel, AMD x86_64
#elif defined(__aarch64__) #elif defined(__aarch64__)
#define USER_AGENT_ARCH "arm" #define USER_AGENT_ARCH "arm" // AArch64
//#elif
// #define USER_AGENT_ARCH "R5" // RISC-V
#else #else
#define USER_AGENT_ARCH #define USER_AGENT_ARCH
#endif #endif
#if defined(__linux) #if defined(__linux)
#define USER_AGENT_OS "L" #define USER_AGENT_OS "L" // GNU Linux
#elif defined(WIN32) #elif defined(WIN32)
#define USER_AGENT_OS "W" #define USER_AGENT_OS "W" // MS Windows
#elif defined(__APPLE__)
#define USER_AGENT_OS "M" // Apple MacOS
// is there a generic BSD macro?
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#define USER_AGENT_OS "U" // BSD unix
#else #else
#define USER_AGENT_OS #define USER_AGENT_OS
#endif #endif
#define USER_AGENT PACKAGE_NAME "-" PACKAGE_VERSION "-" USER_AGENT_ARCH USER_AGENT_OS #define USER_AGENT PACKAGE_NAME "-" PACKAGE_VERSION "-" USER_AGENT_ARCH USER_AGENT_OS
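With PACKAGE_VERSION 23.8 from configure.ac, the chain concatenates name, version, arch tag and OS tag into a compact string; for example on x86_64 Linux:

/* illustrative expansion on x86_64 Linux */
#define USER_AGENT "cpuminer-opt" "-" "23.8" "-" "x64" "L"
/* i.e. the literal string "cpuminer-opt-23.8-x64L" */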
//#define MAX_CPUS 128 /*
#ifdef _MSC_VER #ifdef _MSC_VER
#undef USE_ASM /* to fix */ #undef USE_ASM
#ifdef NOASM #ifdef NOASM
#undef USE_ASM #undef USE_ASM
#endif #endif
/* missing arch defines for msvc */
#if defined(_M_X64) #if defined(_M_X64)
#define __i386__ 1 #define __i386__ 1
#define __x86_64__ 1 #define __x86_64__ 1
@@ -40,8 +43,8 @@
#define __i386__ 1 #define __i386__ 1
#endif #endif
#endif /* _MSC_VER */ #endif
*/
#include <stdbool.h> #include <stdbool.h>
#include <inttypes.h> #include <inttypes.h>
@@ -75,7 +78,7 @@
#endif #endif
//TODO for windows
static inline bool is_root() static inline bool is_root()
{ {
#if defined(WIN32) #if defined(WIN32)
@@ -607,7 +610,6 @@ enum algos {
ALGO_GROESTL, ALGO_GROESTL,
ALGO_HEX, ALGO_HEX,
ALGO_HMQ1725, ALGO_HMQ1725,
ALGO_HODL,
ALGO_JHA, ALGO_JHA,
ALGO_KECCAK, ALGO_KECCAK,
ALGO_KECCAKC, ALGO_KECCAKC,
@@ -703,7 +705,6 @@ static const char* const algo_names[] = {
"groestl", "groestl",
"hex", "hex",
"hmq1725", "hmq1725",
"hodl",
"jha", "jha",
"keccak", "keccak",
"keccakc", "keccakc",
@@ -865,7 +866,6 @@ Options:\n\
groestl Groestl coin\n\ groestl Groestl coin\n\
hex x16r-hex\n\ hex x16r-hex\n\
hmq1725 Espers\n\ hmq1725 Espers\n\
hodl Hodlcoin\n\
jha jackppot (Jackpotcoin)\n\ jha jackppot (Jackpotcoin)\n\
keccak Maxcoin\n\ keccak Maxcoin\n\
keccakc Creative Coin\n\ keccakc Creative Coin\n\


@@ -153,10 +153,16 @@
#define v128_unpackhi8 _mm_unpackhi_epi8 #define v128_unpackhi8 _mm_unpackhi_epi8
// AES // AES
// Nokey means nothing on x86_64 but it saves an instruction and a register
// on ARM.
#define v128_aesenc _mm_aesenc_si128 #define v128_aesenc _mm_aesenc_si128
#define v128_aesenc_nokey(v) _mm_aesenc_si128( v, v128_zero )
#define v128_aesenclast _mm_aesenclast_si128 #define v128_aesenclast _mm_aesenclast_si128
#define v128_aesenclast_nokey(v) _mm_aesenclast_si128( v, v128_zero )
#define v128_aesdec _mm_aesdec_si128 #define v128_aesdec _mm_aesdec_si128
#define v128_aesdec_nokey(v) _mm_aesdec_si128( v, v128_zero )
#define v128_aesdeclast _mm_aesdeclast_si128 #define v128_aesdeclast _mm_aesdeclast_si128
#define v128_aesdeclast_nokey(v) _mm_aesdeclast_si128( v, v128_zero )
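The _nokey variants are about instruction semantics: x86 AESENC always takes a round-key operand, so a keyless round must materialize a zero vector, while AArch64's AESE adds the round key before SubBytes/ShiftRows, letting a zero key fold away. A hedged sketch of the two sides (function names are illustrative):

#include <wmmintrin.h>                     /* x86 AES-NI */
static inline __m128i aesenc_nokey_x86( __m128i v )
{   /* the zero key costs a register plus the implicit final xor */
    return _mm_aesenc_si128( v, _mm_setzero_si128() );
}
/* AArch64 equivalent (requires <arm_neon.h> and __ARM_FEATURE_AES):
   vaesmcq_u8( vaeseq_u8( v, vdupq_n_u8(0) ) ) -- AESE xors the key in
   first, so the zero folds into the instruction stream. */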
// Used instead of casting. // Used instead of casting.
typedef union typedef union
@@ -499,73 +505,141 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// //
// Bit rotations // Bit rotations
// Slow bit rotation, used as last resort #define v128_shuffle16( v, c ) \
#define mm128_ror_64_sse2( v, c ) \ _mm_shufflehi_epi16( _mm_shufflelo_epi16( v, c ), c )
#define v128_qrev32(v) _mm_shuffle_epi32( v, 0xb1 )
#define v128_swap64_32(v) _mm_shuffle_epi32( v, 0xb1 ) // grandfathered
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )
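Worked example of the shuffle immediates: 0xb1 encodes lane order 1,0,3,2, so v128_lrev16 swaps the 16-bit halves of every 32-bit lane, the shuffle form of a 16-bit rotate:

__m128i v = _mm_set_epi16( 7, 6, 5, 4, 3, 2, 1, 0 );   /* lanes 0..7 */
__m128i r = v128_lrev16( v );            /* lanes become 1,0,3,2,5,4,7,6 */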
// These should never be called from application code; use rol/ror.
#define v128_ror64_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ) _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
#define mm128_rol_64_sse2( v, c ) \ #define v128_rol64_sse2( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ) _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
#define mm128_ror_32_sse2( v, c ) \ #define v128_ror32_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ) _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
#define mm128_rol_32_sse2( v, c ) \ #define v128_rol32_sse2( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
#define mm128_ror_64 _mm_ror_epi64 // AVX512: fastest for all rotations.
#define mm128_rol_64 _mm_rol_epi64 #define mm128_ror_64 _mm_ror_epi64
#define mm128_ror_32 _mm_ror_epi32 #define mm128_rol_64 _mm_rol_epi64
#define mm128_rol_32 _mm_rol_epi32 #define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
// ror/rol will always find the fastest but these names may fit better with
// application code performing shuffles rather than bit rotations.
#define v128_shuflr64_8( v) _mm_ror_epi64( v, 8 )
#define v128_shufll64_8( v) _mm_rol_epi64( v, 8 )
#define v128_shuflr64_16(v) _mm_ror_epi64( v, 16 )
#define v128_shufll64_16(v) _mm_rol_epi64( v, 16 )
#define v128_shuflr64_24(v) _mm_ror_epi64( v, 24 )
#define v128_shufll64_24(v) _mm_rol_epi64( v, 24 )
#define v128_shuflr32_8( v) _mm_ror_epi32( v, 8 )
#define v128_shufll32_8( v) _mm_rol_epi32( v, 8 )
#define v128_shuflr32_16(v) _mm_ror_epi32( v, 16 )
#define v128_shufll32_16(v) _mm_rol_epi32( v, 16 )
// optimized byte wise rotation
#elif defined(__SSSE3__) #elif defined(__SSSE3__)
// SSSE3: fastest 32 bit, very fast 16, fast 8
#define v128_shuflr64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
#define v128_shufll64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
#define v128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#define v128_shufll64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
#define v128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#define v128_shufll32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0f0a09080b, 0x0605040702010003 ) )
#define mm128_ror_64( v, c ) \ #define mm128_ror_64( v, c ) \
( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \ ( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 24 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \ : ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) \ : ( (c) == 24 ) ? v128_shuflr64_24( v ) \
: ( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \ : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) \ : ( (c) == 40 ) ? v128_shufll64_24( v ) \
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \ : ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) \ : ( (c) == 56 ) ? v128_shufll64_8( v ) \
: mm128_ror_64_sse2( v, c ) : v128_ror64_sse2( v, c )
#define mm128_rol_64( v, c ) \ #define mm128_rol_64( v, c ) \
( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \ ( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 24 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \ : ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) \ : ( (c) == 24 ) ? v128_shufll64_24( v ) \
: ( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \ : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) \ : ( (c) == 40 ) ? v128_shuflr64_24( v ) \
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \ : ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) \ : ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: mm128_rol_64_sse2( v, c ) : v128_rol64_sse2( v, c )
#define mm128_ror_32( v, c ) \ #define mm128_ror_32( v, c ) \
( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \ ( (c) == 8 ) ? v128_shuflr32_8( v ) \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) \ : ( (c) == 16 ) ? v128_lrev16( v ) \
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \ : ( (c) == 24 ) ? v128_shufll32_8( v ) \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) \ : v128_ror32_sse2( v, c )
: mm128_ror_32_sse2( v, c )
#define mm128_rol_32( v, c ) \ #define mm128_rol_32( v, c ) \
( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \ ( (c) == 8 ) ? v128_shufll32_8( v ) \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) \ : ( (c) == 16 ) ? v128_lrev16( v ) \
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \ : ( (c) == 24 ) ? v128_shuflr32_8( v ) \
0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) \ : v128_rol32_sse2( v, c )
: mm128_rol_32_sse2( v, c )
#elif defined(__SSE2__)
// SSE2: fastest 32 bit, very fast 16
#define mm128_ror_64( v, c ) \
( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
: v128_ror64_sse2( v, c )
#define mm128_rol_64( v, c ) \
( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
: v128_rol64_sse2( v, c )
#define mm128_ror_32( v, c ) \
( (c) == 16 ) ? v128_lrev16( v ) \
: v128_ror32_sse2( v, c )
#define mm128_rol_32( v, c ) \
( (c) == 16 ) ? v128_lrev16( v ) \
: v128_rol32_sse2( v, c )
#else #else
#define mm128_ror_64 mm128_ror_64_sse2 #define mm128_ror_64 v128_ror64_sse2
#define mm128_rol_64 mm128_rol_64_sse2 #define mm128_rol_64 v128_rol64_sse2
#define mm128_ror_32 mm128_ror_32_sse2 #define mm128_ror_32 v128_ror32_sse2
#define mm128_rol_32 mm128_rol_32_sse2 #define mm128_rol_32 v128_rol32_sse2
#endif #endif
// Architecturally agnostic naming // Generic names for portable code
#define v128_ror64 mm128_ror_64 #define v128_ror64 mm128_ror_64
#define v128_rol64 mm128_rol_64 #define v128_rol64 mm128_rol_64
#define v128_ror32 mm128_ror_32 #define v128_ror32 mm128_ror_32
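Because rotation counts are nearly always literals, the ternary chains above cost nothing at run time: the compiler folds them to a single shuffle, leaving the shift/or pair only for counts with no shuffle form. Illustrative usage:

__m128i x = _mm_set1_epi64x( 0x0123456789abcdefULL );
__m128i a = v128_ror64( x, 32 );  /* folds to _mm_shuffle_epi32( x, 0xb1 ) */
__m128i b = v128_ror64( x, 17 );  /* no shuffle form, falls through to
                                     v128_ror64_sse2's shift/or pair */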
@@ -669,9 +743,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Rotate vector elements across all lanes // Rotate vector elements across all lanes
#define v128_shuffle16( v, c ) \
_mm_or_si128( _mm_shufflehi_epi16( v, c ), _mm_shufflelo_epi16( v, c ) )
// reverse elements in vector // reverse elements in vector
#define v128_swap64(v) _mm_shuffle_epi32( v, 0x4e ) // grandfathered #define v128_swap64(v) _mm_shuffle_epi32( v, 0x4e ) // grandfathered
#define v128_rev64(v) _mm_shuffle_epi32( v, 0x4e ) // preferred #define v128_rev64(v) _mm_shuffle_epi32( v, 0x4e ) // preferred
@@ -685,24 +756,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 ) #define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 ) #define v128_shufll16(v) v128_shuffle16( v, 0x93 )
// Some sub-vector shuffles are identical to bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage of these versions
// are context sensitive.
// reverse elements in vector lanes
#define v128_qrev32(v) v128_ror64( v, 32 )
#define v128_swap64_32(v) v128_ror64( v, 32 ) // grandfathered
#define v128_qrev16(v) \
_mm_or_si128( _mm_shufflehi_epi16( v, v128u16( 0x1b ) ) \
_mm_shufflelo_epi16( v, v128u16( 0x1b ) ) )
#define v128_lrev16(v) v128_ror32( v, 16 )
//TODO fix this
// alias bswap // alias bswap
#define v128_qrev8(v) _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) ) //#define v128_qrev8(v) _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) )
#define v128_lrev8(v) _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) ) //#define v128_lrev8(v) _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) )
#define v128_wrev8(v) _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 1,0 ) ) //#define v128_wrev8(v) _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 1,0 ) )
// reverse bits, can it be done? // reverse bits, can it be done?
//#define v128_bitrev8( v ) vrbitq_u8 //#define v128_bitrev8( v ) vrbitq_u8
@@ -790,6 +849,16 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_block_bswap32_256 mm128_block_bswap_32 #define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32 #define v128_block_bswap32_256 mm128_block_bswap_32
#define mm128_block_bswap32_128( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
}
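Hypothetical usage of the new macro: byte-swap a 64-byte buffer (four __m128i lanes) in place, e.g. to convert a big-endian header segment to host order:

uint32_t blk[16] __attribute__((aligned(16)));   /* 64 bytes */
/* ... fill blk with big-endian words ... */
mm128_block_bswap32_128( blk, blk );             /* now host order */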
#define v128_block_bswap32_512( d, s ) \ #define v128_block_bswap32_512( d, s ) \
{ \ { \
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \


@@ -218,7 +218,29 @@ static inline __m256i mm256_not( const __m256i v )
// //
// Bit rotations. // Bit rotations.
// Slow version, used as last resort #define mm256_shuffle16( v, c ) \
_mm256_shufflehi_epi16( _mm256_shufflelo_epi16( v, c ), c )
#define mm256_qrev32(v) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_swap64_32 mm256_qrev32 // grandfathered
#define mm256_qrev16(v) mm256_shuffle16( v, 0x1b )
#define mm256_qrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
#define mm256_lrev16(v) mm256_shuffle16( v, 0xb1 )
#define mm256_lrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
#define mm256_wrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
// These should never be called directly by applications.
#define mm256_ror_64_avx2( v, c ) \ #define mm256_ror_64_avx2( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \ _mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) ) _mm256_slli_epi64( v, 64-(c) ) )
@@ -242,40 +264,76 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_ror_32 _mm256_ror_epi32 #define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32 #define mm256_rol_32 _mm256_rol_epi32
// Redundant but naming may be a better fit in some applications.
#define mm256_shuflr64_8( v) _mm256_ror_epi64( v, 8 )
#define mm256_shufll64_8( v) _mm256_rol_epi64( v, 8 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_24(v) _mm256_ror_epi64( v, 24 )
#define mm256_shufll64_24(v) _mm256_rol_epi64( v, 24 )
#define mm256_shuflr32_8( v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8( v) _mm256_rol_epi32( v, 8 )
#define mm256_shuflr32_16(v) _mm256_ror_epi32( v, 16 )
#define mm256_shufll32_16(v) _mm256_rol_epi32( v, 16 )
#else #else
// ROR & ROL will always find the fastest but these names may be a better fit
// in some applications.
#define mm256_shuflr64_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) )
#define mm256_shufll64_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) )
#define mm256_shuflr64_24( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) )
#define mm256_shufll64_24( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) )
#define mm256_shuflr32_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) )
#define mm256_shufll32_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) )
#define mm256_ror_64( v, c ) \ #define mm256_ror_64( v, c ) \
( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \ ( (c) == 8 ) ? mm256_shuflr64_8( v ) \
: ( (c) == 24 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \ : ( (c) == 16 ) ? mm256_shuffle16( v, 0x39 ) \
_mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) ) \ : ( (c) == 24 ) ? mm256_shuflr64_24( v ) \
: ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \ : ( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
_mm_set_epi64x( 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) ) \ : ( (c) == 40 ) ? mm256_shufll64_24( v ) \
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \ : ( (c) == 48 ) ? mm256_shuffle16( v, 0x93 ) \
_mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) ) \ : ( (c) == 56 ) ? mm256_shufll64_8( v ) \
: mm256_ror_64_avx2( v, c ) : mm256_ror_64_avx2( v, c )
#define mm256_rol_64( v, c ) \ #define mm256_rol_64( v, c ) \
( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \ ( (c) == 8 ) ? mm256_shufll64_8( v ) \
: ( (c) == 24 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \ : ( (c) == 16 ) ? mm256_shuffle16( v, 0x93 ) \
_mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) ) \ : ( (c) == 24 ) ? mm256_shufll64_24( v ) \
: ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \ : ( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
_mm_set_epi64x( 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) ) \ : ( (c) == 40 ) ? mm256_shuflr64_24( v ) \
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \ : ( (c) == 48 ) ? mm256_shuffle16( v, 0x39 ) \
_mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) ) \ : ( (c) == 56 ) ? mm256_shuflr64_8( v ) \
: mm256_rol_64_avx2( v, c ) : mm256_rol_64_avx2( v, c )
#define mm256_ror_32( v, c ) \ #define mm256_ror_32( v, c ) \
( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \ ( (c) == 8 ) ? mm256_shuflr32_8( v ) \
_mm_set_epi64x( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) )\ : ( (c) == 16 ) ? mm256_lrev16( v ) \
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \ : ( (c) == 24 ) ? mm256_shufll32_8( v ) \
_mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) ) \
: mm256_ror_32_avx2( v, c ) : mm256_ror_32_avx2( v, c )
#define mm256_rol_32( v, c ) \ #define mm256_rol_32( v, c ) \
( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \ ( (c) == 8 ) ? mm256_shufll32_8( v ) \
_mm_set_epi64x( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) ) \ : ( (c) == 16 ) ? mm256_lrev16( v ) \
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \ : ( (c) == 24 ) ? mm256_shuflr32_8( v ) \
_mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) ) \
: mm256_rol_32_avx2( v, c ) : mm256_rol_32_avx2( v, c )
#endif #endif
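
When the rotate count is a compile-time constant multiple of 8, the branches
above resolve to a single shuffle; any other count falls through to the
generic mm256_ror_64_avx2 shift-or form. A standalone sketch of that fallback,
checked against a scalar rotate (assumes only AVX2; ROR64_AVX2 and the test
harness are illustrative, not this file's code):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Generic 64-bit lane rotate right: two shifts and an OR, the form the
// *_avx2 fallback macros expand to when no shuffle shortcut applies.
#define ROR64_AVX2( v, c ) \
   _mm256_or_si256( _mm256_srli_epi64( v, c ), _mm256_slli_epi64( v, 64-(c) ) )

int main(void)
{
   uint64_t x = 0x0123456789abcdefULL;
   __m256i v = ROR64_AVX2( _mm256_set1_epi64x( (long long)x ), 17 );
   uint64_t lane0 = (uint64_t)_mm256_extract_epi64( v, 0 );
   uint64_t ref = ( x >> 17 ) | ( x << 47 );   // scalar rotate right by 17
   printf( "%s\n", lane0 == ref ? "match" : "mismatch" );
   return 0;
}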
@@ -400,25 +458,19 @@ static inline __m256i mm256_not( const __m256i v )
/* Not used /* Not used
// Rotate 256 bit vector by one 32 bit element. // Rotate 256 bit vector by one 32 bit element.
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
static inline __m256i mm256_shuflr_32( const __m256i v ) static inline __m256i mm256_shuflr_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); } { return _mm256_alignr_epi32( v, v, 1 ); }
static inline __m256i mm256_shufll_32( const __m256i v ) static inline __m256i mm256_shufll_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 15 ); } { return _mm256_alignr_epi32( v, v, 15 ); }
#else #else
#define mm256_shuflr_32( v ) \ #define mm256_shuflr_32( v ) \
_mm256_permutevar8x32_epi32( v, \ _mm256_permutevar8x32_epi32( v, \
_mm256_set_epi64x( 0x0000000000000007, 0x0000000600000005, \ _mm256_set_epi64x( 0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 ) ) 0x0000000400000003, 0x0000000200000001 ) )
#define mm256_shufll_32( v ) \ #define mm256_shufll_32( v ) \
_mm256_permutevar8x32_epi32( v, \ _mm256_permutevar8x32_epi32( v, \
_mm256_set_epi64x( 0x0000000600000005, 0x0000000400000003, \ _mm256_set_epi64x( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ) ) 0x0000000200000001, 0x0000000000000007 ) )
#endif #endif
*/ */
@@ -450,21 +502,6 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); } { return _mm256_alignr_epi8( v, v, c ); }
*/ */
// Same as bit rotation but logically used as byte/word rotation.
#define mm256_swap64_32( v ) mm256_ror_64( v, 32 ) // grandfathered
#define mm256_rev64_32( v ) mm256_ror_64( v, 32 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_8(v) _mm256_ror_epi64( v, 8 )
#define mm256_shufll64_8(v) _mm256_rol_epi64( v, 8 )
#define mm256_rev32_16( v ) mm256_ror_32( v, 16 )
#define mm256_shuflr32_8(v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8(v) _mm256_rol_epi32( v, 8 )
// Reverse byte order in elements, endian bswap. // Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \ #define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \ _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \

View File

@@ -1,6 +1,9 @@
#if !defined(SIMD_INT_H__) #if !defined(SIMD_INT_H__)
#define SIMD_INT_H__ 1 #define SIMD_INT_H__ 1
// TODO: compile-time test for byte order so that
// be64 etc. can use the HW bswap directly.
//
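
One way to settle that TODO at compile time, sketched here under the
assumption that the gcc/clang predefined byte-order macros are available
(be64 is the name the TODO suggests, not an existing macro in this file):

#if defined(__BYTE_ORDER__) && ( __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ )
  #define be64( x )   ( x )                    // already big endian, no-op
#else
  #define be64( x )   __builtin_bswap64( x )   // little endian: one HW bswap
#endif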
// Endian byte swap // Endian byte swap
#if defined(__x86_64__) #if defined(__x86_64__)
@@ -9,8 +12,6 @@
#elif defined(__aarch64__) #elif defined(__aarch64__)
//#pragma message "aarch64 fast bswap"
static inline uint64_t bswap_64( uint64_t a ) static inline uint64_t bswap_64( uint64_t a )
{ {
uint64_t b; uint64_t b;

View File

@@ -81,7 +81,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_cmpeq16 vceqq_u16 #define v128_cmpeq16 vceqq_u16
#define v128_cmpeq8 vceqq_u8 #define v128_cmpeq8 vceqq_u8
#define v128_cmpeq0 vceqzq_u64 #define v128_iszero vceqzq_u64
// Not yet needed // Not yet needed
//#define v128_cmpeq1 //#define v128_cmpeq1
@@ -174,12 +174,31 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
// AES // AES
// consistent with Intel AES, break up for optimizing // consistent with Intel AES intrinsics, break up for optimizing
#define v128_aesenc( v, k ) vaesmcq_u8( vaeseq_u8( v, k ) ) #define v128_aesenc( v, k ) \
#define v128_aesenclast( v, k ) vaeseq_u8( v, k ) v128_xor( k, vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) )
#define v128_aesenc_nokey( v ) \
vaesmcq_u8( vaeseq_u8( v, v128_zero ) )
#define v128_aesenclast( v, k ) \
v128_xor( k, vaeseq_u8( v, v128_zero ) )
#define v128_aesenclast_nokey( v ) \
vaeseq_u8( v, v128_zero )
#define v128_aesdec( v, k ) \
v128_xor( k, vaesimcq_u8( vaesdq_u8( v, v128_zero ) ) )
#define v128_aesdec_nokey( v ) \
vaesimcq_u8( vaesdq_u8( v, v128_zero ) )
#define v128_aesdeclast( v, k ) \
v128_xor( k, vaesdq_u8( v, v128_zero ) )
#define v128_aesdeclast_nokey( v ) \
vaesdq_u8( v, v128_zero )
#define v128_aesdec( v, k ) vaesimcq_u8( vaesdq_u8( v, k ) )
#define v128_aesdeclast( v, k ) vaesdq_u8( v, k )
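
The zero key and trailing XOR in the new definitions are what line these up
with the Intel model: ARM's AESE XORs the round key in before
SubBytes/ShiftRows, while x86_64 AESENC applies it after MixColumns, so
feeding AESE a zero key and XORing k last reproduces the Intel round order
(the _nokey variants simply omit that final XOR). A minimal sketch of one
Intel-style encrypt round on NEON (assumes armv8-a+crypto; the function name
is illustrative, not this file's API):

#include <arm_neon.h>

// AESE(x, 0) = ShiftRows(SubBytes(x)); AESMC = MixColumns; the round key is
// XORed last, matching x86_64 AESENC rather than ARM's key-first order.
static inline uint8x16_t aesenc_like_x86( uint8x16_t state, uint8x16_t rkey )
{
   return veorq_u8( rkey, vaesmcq_u8( vaeseq_u8( state, vdupq_n_u8( 0 ) ) ) );
}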
typedef union typedef union
{ {
@@ -189,7 +208,7 @@ typedef union
} __attribute__ ((aligned (16))) v128_ovly; } __attribute__ ((aligned (16))) v128_ovly;
// Broadcast lane 0 to all lanes // Broadcast lane 0 to all lanes, consistent with x86_64 broadcast
#define v128_bcast64(v) vdupq_laneq_u64( v, 0 ) #define v128_bcast64(v) vdupq_laneq_u64( v, 0 )
#define v128_bcast32(v) vdupq_laneq_u32( v, 0 ) #define v128_bcast32(v) vdupq_laneq_u32( v, 0 )
#define v128_bcast16(v) vdupq_laneq_u16( v, 0 ) #define v128_bcast16(v) vdupq_laneq_u16( v, 0 )

View File

@@ -15,7 +15,7 @@
#include <string.h> #include <string.h>
#include "miner.h" #include "miner.h"
#if defined(__aarch64__) #if defined(__aarch64__) && !defined(__APPLE__)
// for arm's "cpuid" // for arm's "cpuid"
#include <sys/auxv.h> #include <sys/auxv.h>
#include <asm/hwcap.h> #include <asm/hwcap.h>
@@ -141,26 +141,13 @@ static inline void linux_cpu_hilo_freq( float *lo, float *hi )
*lo = (float)lo_freq; *lo = (float)lo_freq;
} }
#else /* WIN32 */
static inline float win32_cputemp( int core )
{
// todo
return 0.0;
}
#endif /* !WIN32 */ #endif /* !WIN32 */
/* exports */
static inline float cpu_temp( int core ) static inline float cpu_temp( int core )
{ {
#ifdef WIN32 #ifdef WIN32
return win32_cputemp( core ); return 0.;
#else #else
return linux_cputemp( core ); return linux_cputemp( core );
#endif #endif
@@ -321,7 +308,7 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
#endif #endif
} }
#elif defined(__aarch64__) #elif defined(__aarch64__) && !defined(__APPLE__)
static inline void cpuid( unsigned int leaf, unsigned int subleaf, static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] ) unsigned int output[4] )
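
The body of this ARM "cpuid" lies outside the hunk; on Linux/aarch64 such a
wrapper is typically built on getauxval, roughly as in this sketch (an
assumption about the implementation, not this file's exact code):

#include <sys/auxv.h>

// Fill output[] from the kernel hwcap words; leaf/subleaf are ignored on ARM.
static inline void cpuid_arm_sketch( unsigned int leaf, unsigned int subleaf,
                                     unsigned int output[4] )
{
   (void)leaf; (void)subleaf;
   output[0] = (unsigned int)getauxval( AT_HWCAP );   // HWCAP_AES, HWCAP_SHA2, ...
   output[1] = (unsigned int)getauxval( AT_HWCAP2 );
   output[2] = output[3] = 0;
}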
@@ -495,11 +482,9 @@ static inline bool cpu_arch_aarch64()
static inline bool has_sse() static inline bool has_sse()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info ); cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE_Flag; return cpu_info[ EDX_Reg ] & SSE_Flag;
#else #else
return false; return false;
#endif #endif
@@ -508,11 +493,9 @@ static inline bool has_sse()
static inline bool has_sse2() static inline bool has_sse2()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info ); cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE2_Flag; return cpu_info[ EDX_Reg ] & SSE2_Flag;
#else #else
return false; return false;
#endif #endif
@@ -521,11 +504,9 @@ static inline bool has_sse2()
static inline bool has_ssse3() static inline bool has_ssse3()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info ); cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & SSSE3_Flag; return cpu_info[ ECX_Reg ] & SSSE3_Flag;
#else #else
return false; return false;
#endif #endif
@@ -534,11 +515,9 @@ static inline bool has_ssse3()
static inline bool has_sse41() static inline bool has_sse41()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info ); cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & SSE41_Flag; return cpu_info[ ECX_Reg ] & SSE41_Flag;
#else #else
return false; return false;
#endif #endif
@@ -547,11 +526,9 @@ static inline bool has_sse41()
static inline bool has_sse42() static inline bool has_sse42()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info ); cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & SSE42_Flag; return cpu_info[ ECX_Reg ] & SSE42_Flag;
#else #else
return false; return false;
#endif #endif
@@ -559,7 +536,7 @@ static inline bool has_sse42()
static inline bool has_neon() static inline bool has_neon()
{ {
#if defined(__aarch64__) #if defined(__aarch64__) && !defined(__APPLE__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info ); cpuid( 0, 0, cpu_info );
return cpu_info[0]; return cpu_info[0];
#else #else
@@ -570,7 +547,6 @@ static inline bool has_neon()
static inline bool has_aes_ni() static inline bool has_aes_ni()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_sse2() ) if ( has_sse2() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
@@ -578,9 +554,7 @@ static inline bool has_aes_ni()
return cpu_info[ ECX_Reg ] & AES_NI_Flag; return cpu_info[ ECX_Reg ] & AES_NI_Flag;
} }
return false; return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
#elif defined(__aarch64__)
if ( has_neon() ) if ( has_neon() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
@@ -588,7 +562,6 @@ static inline bool has_aes_ni()
return cpu_info[0] & HWCAP_AES; return cpu_info[0] & HWCAP_AES;
} }
return false; return false;
#else #else
return false; return false;
#endif #endif
@@ -597,11 +570,9 @@ static inline bool has_aes_ni()
static inline bool has_avx() static inline bool has_avx()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info ); cpuid( CPU_INFO, 0, cpu_info );
return ( ( cpu_info[ ECX_Reg ] & AVX_mask ) == AVX_mask ); return ( ( cpu_info[ ECX_Reg ] & AVX_mask ) == AVX_mask );
#else #else
return false; return false;
#endif #endif
@@ -610,11 +581,9 @@ static inline bool has_avx()
static inline bool has_avx2() static inline bool has_avx2()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info ); cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX2_Flag; return cpu_info[ EBX_Reg ] & AVX2_Flag;
#else #else
return false; return false;
#endif #endif
@@ -623,7 +592,6 @@ static inline bool has_avx2()
static inline bool has_sha() static inline bool has_sha()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_avx() ) if ( has_avx() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
@@ -631,9 +599,7 @@ static inline bool has_sha()
return cpu_info[ EBX_Reg ] & SHA_Flag; return cpu_info[ EBX_Reg ] & SHA_Flag;
} }
return false; return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
#elif defined(__aarch64__)
if ( has_neon() ) if ( has_neon() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
@@ -641,7 +607,6 @@ static inline bool has_sha()
return cpu_info[0] & HWCAP_SHA2; return cpu_info[0] & HWCAP_SHA2;
} }
return false; return false;
#else #else
return false; return false;
#endif #endif
@@ -650,7 +615,6 @@ static inline bool has_sha()
static inline bool has_sha512() static inline bool has_sha512()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_avx2() ) if ( has_avx2() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
@@ -658,9 +622,7 @@ static inline bool has_sha512()
return cpu_info[ EAX_Reg ] & SHA512_Flag; return cpu_info[ EAX_Reg ] & SHA512_Flag;
} }
return false; return false;
#elif defined(__aarch64__) && !defined(__APPLE__)
#elif defined(__aarch64__)
if ( has_neon() ) if ( has_neon() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
@@ -668,7 +630,6 @@ static inline bool has_sha512()
return cpu_info[0] & HWCAP_SHA3; return cpu_info[0] & HWCAP_SHA3;
} }
return false; return false;
#else #else
return false; return false;
#endif #endif
@@ -677,7 +638,6 @@ static inline bool has_sha512()
static inline bool has_avx512f() static inline bool has_avx512f()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info ); cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_F_Flag; return cpu_info[ EBX_Reg ] & AVX512_F_Flag;
@@ -689,7 +649,6 @@ static inline bool has_avx512f()
static inline bool has_avx512dq() static inline bool has_avx512dq()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info ); cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_DQ_Flag; return cpu_info[ EBX_Reg ] & AVX512_DQ_Flag;
@@ -701,7 +660,6 @@ static inline bool has_avx512dq()
static inline bool has_avx512bw() static inline bool has_avx512bw()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info ); cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_BW_Flag; return cpu_info[ EBX_Reg ] & AVX512_BW_Flag;
@@ -713,7 +671,6 @@ static inline bool has_avx512bw()
static inline bool has_avx512vl() static inline bool has_avx512vl()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info ); cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_VL_Flag; return cpu_info[ EBX_Reg ] & AVX512_VL_Flag;
@@ -722,14 +679,13 @@ static inline bool has_avx512vl()
#endif #endif
} }
// baseline for usability
static inline bool has_avx512() static inline bool has_avx512()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info ); cpuid( EXTENDED_FEATURES, 0, cpu_info );
return ( ( cpu_info[ EBX_Reg ] & AVX512_mask ) == AVX512_mask ); return ( ( cpu_info[ EBX_Reg ] & AVX512_mask ) == AVX512_mask );
#else #else
return false; return false;
#endif #endif
@@ -738,7 +694,6 @@ static inline bool has_avx512()
static inline bool has_vaes() static inline bool has_vaes()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_avx2() ) if ( has_avx2() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
@@ -754,11 +709,9 @@ static inline bool has_vaes()
static inline bool has_vbmi() static inline bool has_vbmi()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info ); cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag; return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
#else #else
return false; return false;
#endif #endif
@@ -767,7 +720,6 @@ static inline bool has_vbmi()
static inline bool has_vbmi2() static inline bool has_vbmi2()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info ); cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag; return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
@@ -780,7 +732,6 @@ static inline bool has_vbmi2()
static inline bool has_xop() static inline bool has_xop()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_CPU_INFO, 0, cpu_info ); cpuid( EXTENDED_CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & XOP_Flag; return cpu_info[ ECX_Reg ] & XOP_Flag;
@@ -792,11 +743,9 @@ static inline bool has_xop()
static inline bool has_fma3() static inline bool has_fma3()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info ); cpuid( CPU_INFO, 0, cpu_info );
return ( ( cpu_info[ ECX_Reg ] & FMA3_mask ) == FMA3_mask ); return ( ( cpu_info[ ECX_Reg ] & FMA3_mask ) == FMA3_mask );
#else #else
return false; return false;
#endif #endif
@@ -805,24 +754,21 @@ static inline bool has_fma3()
static inline bool has_apx_f() static inline bool has_apx_f()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info ); cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EDX_Reg ] & APX_F_Flag; return cpu_info[ EDX_Reg ] & APX_F_Flag;
#else #else
return false; return false;
#endif #endif
} }
// Not much use on its own
static inline bool has_avx10() static inline bool has_avx10()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info ); cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EDX_Reg ] & AVX10_Flag; return cpu_info[ EDX_Reg ] & AVX10_Flag;
#else #else
return false; return false;
#endif #endif
@@ -831,7 +777,6 @@ static inline bool has_avx10()
static inline unsigned int avx10_version() static inline unsigned int avx10_version()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_avx10() ) if ( has_avx10() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
@@ -839,7 +784,6 @@ static inline unsigned int avx10_version()
return cpu_info[ EBX_Reg ] & AVX10_VERSION_mask; return cpu_info[ EBX_Reg ] & AVX10_VERSION_mask;
} }
return 0; return 0;
#else #else
return 0; return 0;
#endif #endif
@@ -849,7 +793,6 @@ static inline unsigned int avx10_version()
static inline bool has_avx10_512() static inline bool has_avx10_512()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_avx10() ) if ( has_avx10() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
@@ -857,17 +800,15 @@ static inline bool has_avx10_512()
return cpu_info[ EBX_Reg ] & AVX10_512_Flag; return cpu_info[ EBX_Reg ] & AVX10_512_Flag;
} }
return false; return false;
#else #else
return false; return false;
#endif #endif
} }
// may not include 512 // Includes 128 but may not include 512
static inline bool has_avx10_256() static inline bool has_avx10_256()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_avx10() ) if ( has_avx10() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
@@ -875,7 +816,6 @@ static inline bool has_avx10_256()
return cpu_info[ EBX_Reg ] & AVX10_256_Flag; return cpu_info[ EBX_Reg ] & AVX10_256_Flag;
} }
return false; return false;
#else #else
return false; return false;
#endif #endif
@@ -885,7 +825,6 @@ static inline bool has_avx10_256()
static inline unsigned int avx10_vector_length() static inline unsigned int avx10_vector_length()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_avx10() ) if ( has_avx10() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
@@ -894,16 +833,12 @@ static inline unsigned int avx10_vector_length()
: ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 ); : ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 );
} }
return 0; return 0;
#else #else
return 0; return 0;
#endif #endif
} }
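
These probes are typically consumed once at startup to bind a function
pointer; a sketch (has_avx2 is the real helper above, hash_8way/hash_ref are
hypothetical stand-ins for vectorized and reference kernels):

#include <stdio.h>

static void hash_ref ( void ) { puts( "scalar path" ); }
static void hash_8way( void ) { puts( "AVX2 8-lane path" ); }
static void (*hash_fn)( void );

int main(void)
{
   hash_fn = has_avx2() ? hash_8way : hash_ref;   // dispatch once, not per call
   hash_fn();
   return 0;
}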
static inline uint32_t cpuid_get_highest_function_number() static inline uint32_t cpuid_get_highest_function_number()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
@@ -922,7 +857,7 @@ static inline void cpuid_get_highest_function( char* s )
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
uint32_t fn = cpuid_get_highest_function_number(); uint32_t fn = cpuid_get_highest_function_number();
switch (fn) switch (fn)
{ {
case 0x16: case 0x16:

View File

@@ -10,12 +10,14 @@
# define some local variables # define some local variables
export LOCAL_LIB="$HOME/usr/lib" export LOCAL_LIB="$HOME/usr/lib"
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32" export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --host=x86_64-w64-mingw32"
#export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
export MINGW_LIB="/usr/x86_64-w64-mingw32/lib" export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
# set correct gcc version # set correct gcc version
export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32" export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC # used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs"
#export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
# Support for Windows 7 CPU groups, AES sometimes not included in -march # Support for Windows 7 CPU groups, AES sometimes not included in -march
# CPU groups disabled due to incompatibilities between Intel and AMD CPUs. # CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
#export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601" #export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
@@ -38,7 +40,7 @@ cp $MINGW_LIB/zlib1.dll release/
cp $MINGW_LIB/libwinpthread-1.dll release/ cp $MINGW_LIB/libwinpthread-1.dll release/
cp $GCC_MINGW_LIB/libstdc++-6.dll release/ cp $GCC_MINGW_LIB/libstdc++-6.dll release/
cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/ cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
cp ./../libcrypto-1_1-x64.dll release/ #cp ./../libcrypto-1_1-x64.dll release/
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
# Start building... # Start building...