Mirror of https://github.com/JayDDee/cpuminer-opt.git
Synced 2025-09-17 23:44:27 +00:00

Compare commits (10 commits)

8e91bfbe19
47e24b50e8
c47c4a8885
042d13d1e1
4f930574cc
9d3a46c355
4e3f1b926f
045b42babf
fc696dbbe5
f3fde95f27
@@ -16,6 +16,7 @@ bin_PROGRAMS = cpuminer
 dist_man_MANS = cpuminer.1
 
 cpuminer_SOURCES = \
+  dummy.cpp \
   cpu-miner.c \
   util.c \
   api.c \
@@ -113,7 +114,6 @@ cpuminer_SOURCES = \
   algo/lyra2/phi2-4way.c \
   algo/lyra2/phi2.c \
   algo/m7m/m7m.c \
-  algo/m7m/magimath.cpp \
   algo/nist5/nist5-gate.c \
   algo/nist5/nist5-4way.c \
   algo/nist5/nist5.c \
@@ -166,7 +166,6 @@ cpuminer_SOURCES = \
   algo/shavite/sph-shavite-aesni.c \
   algo/shavite/shavite-hash-2way.c \
   algo/shavite/shavite-hash-4way.c \
-  algo/shavite/shavite.c \
   algo/simd/nist.c \
   algo/simd/vector.c \
   algo/simd/sph_simd.c \
@@ -250,6 +249,7 @@ cpuminer_SOURCES = \
   algo/x16/x16rt.c \
   algo/x16/x16rt-4way.c \
   algo/x16/hex.c \
+  algo/x16/x20r.c \
   algo/x16/x21s-4way.c \
   algo/x16/x21s.c \
   algo/x16/minotaur.c \
@@ -288,7 +288,7 @@ if HAVE_WINDOWS
 endif
 
 cpuminer_LDFLAGS = @LDFLAGS@
 cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
 cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
 cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
 
@@ -87,7 +87,6 @@ Supported Algorithms
   groestl       Groestl coin
   hex           x16r-hex
   hmq1725
-  hodl          Hodlcoin
   jha           Jackpotcoin
   keccak        Maxcoin
   keccakc       Creative coin
@@ -115,9 +114,11 @@ Supported Algorithms
   scrypt:N      scrypt(N, 1, 1)
   scryptn2      scrypt(1048576, 1, 1)
   sha256d       Double SHA-256
+  sha256dt
   sha256q       Quad SHA-256
   sha256t       Triple SHA-256
   sha3d         Double keccak256 (BSHA3)
+  sha512256d
   skein         Skein+Sha (Skeincoin)
   skein2        Double Skein (Woodcoin)
   skunk         Signatum (SIGT)
@@ -145,6 +146,7 @@ Supported Algorithms
   x16rt-veil    veil
   x16s
   x17
+  x20r
   x21s
   x22i
   x25x
@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
 Requirements
 ------------
 
-Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
-supported.
+- A x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
+- Arm CPU supporting AArch64 and NEON.
 
-64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
-are not supported. FreeBSD YMMV.
+32 bit CPUs are not supported.
 
-ARM requirements (Beta):
+Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.
 
-CPU: Armv8 and NEON, SHA2 & AES are optional
-OS: Linux distribution built for AArch64.
-Packages: source code only.
+Mining on mobile devices that meet the requirements is not recommended due to the risk of
+overheating and damaging the battery. Mining has unlimited demand, it will push any device
+to or beyond its limits. There is also a fire risk with overheated lithium batteries.
 
+Beware of apps claiming "mobile only mining". There is no such thing, they aren't miners.
+If a mobile CPU can mine it any CPU can.
 
 See wiki for details.
 
@@ -73,6 +75,77 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v24.5
+
+Fix MinGW compile error after MSys2 upgrade to GCC-14.2.
+#427: GBT: Improved handling of new work.
+Removed shavite3 algo.
+
+v24.4
+
+x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
+x86_64: fixed a bug in alignr macros for SSE2.
+ARM: CPU feature reporting enhancements.
+Some code cleanup.
+
+v24.3
+
+ARM: CPU feature detection and reporting is now working.
+ARM: Verthash is now working.
+ARM: Small speedup for yescrypt, yespower & argon2d.
+Code cleanup.
+
+v24.2
+
+x86_64: Fixed blakes2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
+x86_64: Initial support for CPUs with AVX10, needs GCC-14.
+ARM NEON: Various code optimisations.
+
+v24.1
+
+#414: fix bug in merkle error handling.
+#416: change $nproc to $(nproc) in build scripts.
+#420: change some inline function definitions to static inline.
+#413: Fix formatting error for share result log when using no-color.
+Faster 2 way interleaving.
+Cleanup sha256 architecture targetting.
+
+v23.15
+
+Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
+ARM: Fugue AES optimizations enabled.
+ARM: quark, qubit, x11gost algos optimized with NEON & AES.
+
+v23.14
+
+ARM: Groestl AES optimizations enabled.
+All: Small optimization to Shabal 4way.
+x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
+All: deleted some unused files.
+
+v23.13
+
+Added x20r algo.
+Eliminated redundant hash order calculations for x16r family.
+
+v23.12
+
+Several bugs fixes and speed improvements for x16r family for all CPU architectures.
+
+v23.11
+
+This is a release candidate for full AArch64 support, marking the end of the Beta phase.
+Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
+Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.
+
+v23.10
+
+x86_64: Fixed scrypt, scryptn2 algos SSE2.
+Fixed sha512256d algo AVX2, SSE2, NEON.
+Fixed a bug in Skein N-way that reduced performance.
+ARM: Skein optimized for NEON, SHA2 & SSE2.
+Skein2 algo 2-way optimized for NEON & SSE2.
+
 v23.9
 
 x86_64: fixed minotaurx crash, broken in 23.7.
@@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 //int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
 //                 uint64_t *hashes_done, struct thr_info *mythr )
@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
   gate->build_block_header   = (void*)&std_build_block_header;
   gate->build_extraheader    = (void*)&std_build_extraheader;
   gate->set_work_data_endian = (void*)&do_nothing;
-  gate->resync_threads       = (void*)&do_nothing;
-  gate->do_this_thread       = (void*)&return_true;
+//  gate->resync_threads       = (void*)&do_nothing;
+//  gate->do_this_thread       = (void*)&return_true;
   gate->longpoll_rpc_call    = (void*)&std_longpoll_rpc_call;
   gate->get_work_data_size   = (void*)&std_get_work_data_size;
   gate->optimizations        = EMPTY_SET;
@@ -340,7 +340,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_SHA256T:      rc = register_sha256t_algo     ( gate ); break;
     case ALGO_SHA3D:        rc = register_sha3d_algo       ( gate ); break;
     case ALGO_SHA512256D:   rc = register_sha512256d_algo  ( gate ); break;
-    case ALGO_SHAVITE3:     rc = register_shavite_algo     ( gate ); break;
     case ALGO_SKEIN:        rc = register_skein_algo       ( gate ); break;
     case ALGO_SKEIN2:       rc = register_skein2_algo      ( gate ); break;
     case ALGO_SKUNK:        rc = register_skunk_algo       ( gate ); break;
@@ -368,6 +367,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_X16RT_VEIL:   rc = register_x16rt_veil_algo  ( gate ); break;
     case ALGO_X16S:         rc = register_x16s_algo        ( gate ); break;
     case ALGO_X17:          rc = register_x17_algo         ( gate ); break;
+    case ALGO_X20R:         rc = register_x20r_algo        ( gate ); break;
     case ALGO_X21S:         rc = register_x21s_algo        ( gate ); break;
     case ALGO_X22I:         rc = register_x22i_algo        ( gate ); break;
     case ALGO_X25X:         rc = register_x25x_algo        ( gate ); break;
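Note: the registration cases above all fill the gate the same way. A hedged sketch of what the newly referenced register_x20r_algo presumably does; the entry-point names and the feature mask below are hypothetical illustrations, not code from this change set:

// Hypothetical sketch only; the real function is in the x20r source added
// by this change, and its entry points and optimizations mask may differ.
bool register_x20r_algo( algo_gate_t* gate )
{
   gate->scanhash      = (void*)&scanhash_x20r;             // hypothetical
   gate->hash          = (void*)&x20r_hash;                 // hypothetical
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;  // illustrative
   return true;
}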
@@ -98,25 +98,27 @@ typedef uint32_t set_t;
 #define AVX512_OPT  1 << 6   // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
 #define AES_OPT     1 << 7   // Intel Westmere, AArch64
 #define VAES_OPT    1 << 8   // Icelake, Zen3
-#define SHA_OPT     1 << 9   // Zen1, Icelake, AArch64
-#define SHA512_OPT  1 << 10  // AArch64
+#define SHA256_OPT  1 << 9   // Zen1, Icelake, AArch64
+#define SHA512_OPT  1 << 10  // Intel Arrow Lake, AArch64
 #define NEON_OPT    1 << 11  // AArch64
+#define AVX10_256   1 << 12
+#define AVX10_512   1 << 13
 
 // AVX10 does not have explicit algo features:
 // AVX10_512 is compatible with AVX512 + VAES
 // AVX10_256 is compatible with AVX2 + VAES
 
 // return set containing all elements from sets a & b
-inline set_t set_union ( set_t a, set_t b ) { return a | b; }
+static inline set_t set_union ( set_t a, set_t b ) { return a | b; }
 
 // return set contained common elements from sets a & b
-inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
+static inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
 
 // all elements in set a are included in set b
-inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
+static inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
 
 // no elements in set a are included in set b
-inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
+static inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
 
 typedef struct
 {
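Note: the set_t helpers above are plain bitmask operations. A small self-contained illustration of how they compose with a feature mask; the AVX2_OPT bit position is an assumption made only for this example, since this hunk shows only AVX512_OPT's position (1 << 6):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t set_t;
#define AVX2_OPT   (1 << 4)   // assumed bit position, for illustration only
#define AVX512_OPT (1 << 6)   // matches the definition above

static inline set_t set_union ( set_t a, set_t b ) { return a | b; }
static inline bool  set_incl  ( set_t a, set_t b ) { return (a & b) == a; }

int main(void)
{
   set_t cpu  = set_union( AVX2_OPT, AVX512_OPT );  // features the CPU reports
   set_t algo = AVX2_OPT;                           // features an algo can use
   printf( "algo fully supported: %s\n", set_incl( algo, cpu ) ? "yes" : "no" );
   return 0;
}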
@@ -163,10 +165,10 @@ char* ( *malloc_txs_request ) ( struct work* );
 void ( *set_work_data_endian ) ( struct work* );
 
 // Diverge mining threads
-bool ( *do_this_thread ) ( int );
+//bool ( *do_this_thread ) ( int );
 
 // After do_this_thread
-void ( *resync_threads ) ( int, struct work* );
+//void ( *resync_threads ) ( int, struct work* );
 
 json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
 
@@ -246,7 +248,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 //int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
 //                 uint64_t *hashes_done, struct thr_info *mythr );
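Note: this hunk and many below collapse the four-macro AVX-512 feature test into a single SIMD512 macro. Its definition is not part of this comparison; a plausible shape, inferred from the pattern being replaced (an assumption, the real macro likely lives in simd-utils.h):

// Assumed definition, inferred from the replaced pattern; not shown in
// this change set.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SIMD512 1
#endif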
@@ -35,7 +35,7 @@
  * @pre all block pointers must be valid
  */
 
-#if defined(__AVX512F__)
+#if defined(SIMD512)
 
 static inline __m512i blamka( __m512i x, __m512i y )
 {
@@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance,
    uint64_t pseudo_rand, ref_index, ref_lane;
    uint32_t prev_offset, curr_offset;
    uint32_t starting_index, i;
-#if defined(__AVX512F__)
+#if defined(SIMD512)
    __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
 #elif defined(__AVX2__)
    __m256i state[ARGON2_HWORDS_IN_BLOCK];
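Note: blamka is Argon2's multiply-hardened mixing primitive; per 64-bit lane it computes x + y + 2*lo32(x)*lo32(y). A minimal AVX-512 sketch consistent with the declaration above (an assumption, the body in this tree may be written differently):

#include <immintrin.h>

// Sketch of fBlaMka(x, y) = x + y + 2 * lo32(x) * lo32(y) per 64-bit lane.
static inline __m512i blamka_sketch( __m512i x, __m512i y )
{
   const __m512i xy = _mm512_mul_epu32( x, y );   // lo32(x) * lo32(y)
   return _mm512_add_epi64( _mm512_add_epi64( x, y ),
                            _mm512_add_epi64( xy, xy ) );
}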
@@ -21,7 +21,7 @@
 #include "blake2-impl.h"
 #include "simd-utils.h"
 
-#if !defined(__AVX512F__)
+#if !defined(SIMD512)
 
 #if !defined(__AVX2__)
 
@@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
    blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
 
    do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
 
       blakehash_4way( hash, vdata );
 
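Note: mm128_bswap_32 to v128_bswap32 is part of the portable v128 naming sweep. On x86 the helper presumably reduces to one byte shuffle; the control constant is visible in the hunks that follow (a sketch, assuming SSSE3, the real definition is in simd-utils):

#include <immintrin.h>

// Assumed x86 shape of v128_bswap32, using the shuf_bswap32 constant that
// appears later in this comparison.
static inline __m128i v128_bswap32_sketch( __m128i x )
{
   const __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
   return _mm_shuffle_epi8( x, ctl );   // byte-swap each 32-bit lane
}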
@@ -429,7 +429,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
 #define BLAKE256_4X32_BLOCK_BSWAP32 \
 { \
    v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
                                      0x0405060700010203 ); \
    M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
    M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
    M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
@@ -931,14 +931,14 @@ blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
    const v128_t shuf_bswap32 =
                 v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
 
-   H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
-   H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
-   H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
-   H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
-   H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
-   H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
-   H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
-   H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
+   H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
+   H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
+   H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
+   H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
+   H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
+   H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
+   H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
+   H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
 
 #else
 
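Note: v128_xor3 folds a three-input XOR into one helper. An equivalent SSE2 fallback (an assumed shape; AVX-512VL targets could instead use a single ternary-logic instruction):

#include <emmintrin.h>

// Assumed SSE2 fallback for v128_xor3; the real helper is in simd-utils.
static inline __m128i v128_xor3_sketch( __m128i a, __m128i b, __m128i c )
{
   return _mm_xor_si128( a, _mm_xor_si128( b, c ) );
}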
@@ -1611,7 +1611,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ///////////////////////////////////////
 //
@@ -2617,7 +2617,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
 #endif
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 //Blake-256 16 way AVX512
 
@@ -147,7 +147,7 @@ void blake256r8_8way_close(void *cc, void *dst);
 #define blake256r8_8x32_update blake256r14_8way_update
 #define blake256r8_8x32_close blake256r14_8way_close
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ///////////////////////////////////
 //
@@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
 #define Mx_(n) Mx__(n)
 #define Mx__(n) M ## n
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define B2B8W_G(a, b, c, d, x, y) \
 { \
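Note: the B2B8W_G body in the next hunk rotates with mm512_ror_64, presumably a thin wrapper over the native AVX-512 64-bit rotate (an assumption, the actual definition is in simd-utils):

#include <immintrin.h>

// Assumed wrapper; AVX-512F provides a native 64-bit rotate-right.
#define mm512_ror_64_sketch( v, c )  _mm512_ror_epi64( v, c )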
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
    v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
 }
 
-static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
+static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
 {
    __m512i v[16], m[16];
 
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
    ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
 }
 
-int blake2b_8way_init( blake2b_8way_ctx *ctx )
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
 {
    size_t i;
 
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
 }
 
 
-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                           size_t inlen )
 {
    __m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
          ctx->t[0] += ctx->c;
          if ( ctx->t[0] < ctx->c )
             ctx->t[1]++;
-         blake2b_8way_compress( ctx, 0 );
+         blake2b_8x64_compress( ctx, 0 );
          ctx->c = 0;
       }
       ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
    }
 }
 
-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
 {
    size_t c;
    c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
       ctx->c += 8;
    }
 
-   blake2b_8way_compress( ctx, 1 );   // final block flag = 1
+   blake2b_8x64_compress( ctx, 1 );   // final block flag = 1
 
    casti_m512i( out, 0 ) = ctx->h[0];
   casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
 };
 */
 
-static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
+static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
 {
    __m256i v[16], m[16];
 
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
    ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
 }
 
-int blake2b_4way_init( blake2b_4way_ctx *ctx )
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
 {
    size_t i;
 
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
    return 0;
 }
 
-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                           size_t inlen )
 {
    __m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
          ctx->t[0] += ctx->c;
         if ( ctx->t[0] < ctx->c )
            ctx->t[1]++;
-         blake2b_4way_compress( ctx, 0 );
+         blake2b_4x64_compress( ctx, 0 );
          ctx->c = 0;
      }
      ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
    }
 }
 
-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
 {
    size_t c;
    c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
    ctx->c += 8;
    }
 
-   blake2b_4way_compress( ctx, 1 );   // final block flag = 1
+   blake2b_4x64_compress( ctx, 1 );   // final block flag = 1
 
   casti_m256i( out, 0 ) = ctx->h[0];
   casti_m256i( out, 1 ) = ctx->h[1];
@@ -1,6 +1,6 @@
 #pragma once
-#ifndef __BLAKE2B_HASH_4WAY_H__
-#define __BLAKE2B_HASH_4WAY_H__
+#ifndef BLAKE2B_HASH_4WAY_H__
+#define BLAKE2B_HASH_4WAY_H__
 
 #include "simd-utils.h"
 #include <stddef.h>
@@ -15,7 +15,7 @@
 #endif
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct ALIGN( 64 ) {
    __m512i b[16];    // input buffer
@@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) {
    uint64_t t[2];    // total number of bytes
    size_t c;         // pointer for b[]
    size_t outlen;    // digest size
-} blake2b_8way_ctx;
+} blake2b_8x64_ctx;
 
-int blake2b_8way_init( blake2b_8way_ctx *ctx );
-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                           size_t inlen );
-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
 
+#define blake2b_8way_ctx    blake2b_8x64_ctx
+#define blake2b_8way_init   blake2b_8x64_init
+#define blake2b_8way_update blake2b_8x64_update
+#define blake2b_8way_final  blake2b_8x64_final
+
 #endif
 
@@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) {
    uint64_t t[2];    // total number of bytes
    size_t c;         // pointer for b[]
    size_t outlen;    // digest size
-} blake2b_4way_ctx;
+} blake2b_4x64_ctx;
 
-int blake2b_4way_init( blake2b_4way_ctx *ctx );
-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                           size_t inlen );
-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
 
+#define blake2b_4way_ctx    blake2b_4x64_ctx
+#define blake2b_4way_init   blake2b_4x64_init
+#define blake2b_4way_update blake2b_4x64_update
+#define blake2b_4way_final  blake2b_4x64_final
+
 #endif
 
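Note: the compatibility #defines added above keep old call sites compiling while the canonical names move to the lane-width convention (8x64 means eight 64-bit lanes). An illustrative caller, not taken from this change set:

// Old-style code still compiles through the aliases:
blake2b_8way_ctx ctx;         // expands to blake2b_8x64_ctx
blake2b_8way_init( &ctx );    // expands to blake2b_8x64_init( &ctx )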
@@ -3,7 +3,7 @@
 #include <stdint.h>
 #include "blake2b-hash.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define BLAKE2B_8WAY
 #elif defined(__AVX2__)
   #define BLAKE2B_4WAY
@@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // Blake2s-256 16 way
 
@@ -11,8 +11,8 @@
  * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
  */
 //#pragma once
-#ifndef __BLAKE2S_HASH_4WAY_H__
-#define __BLAKE2S_HASH_4WAY_H__ 1
+#ifndef BLAKE2S_HASH_4WAY_H__
+#define BLAKE2S_HASH_4WAY_H__ 1
 
 #if defined(__SSE2__) || defined(__ARM_NEON)
 
@@ -29,20 +29,20 @@
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif
 
 typedef struct __blake2s_nway_param
 {
    uint8_t  digest_length; // 1
    uint8_t  key_length;    // 2
    uint8_t  fanout;        // 3
    uint8_t  depth;         // 4
    uint32_t leaf_length;   // 8
    uint8_t  node_offset[6];// 14
    uint8_t  node_depth;    // 15
    uint8_t  inner_length;  // 16
    // uint8_t  reserved[0];
    uint8_t  salt[8];       // 24
    uint8_t  personal[8];   // 32
 } blake2s_nway_param;
 
 typedef struct ALIGN( 64 ) __blake2s_4way_state
 {
@@ -67,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
 typedef struct ALIGN( 64 ) __blake2s_8way_state
 {
    __m256i h[8];
-   uint8_t buf[ 32 * 8 ];
+   uint8_t buf[ 64 * 8 ];
    uint32_t t[2];
    uint32_t f[2];
    size_t buflen;
@@ -83,12 +83,12 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct ALIGN( 64 ) __blake2s_16way_state
 {
    __m512i h[8];
-   uint8_t buf[ 32 * 16 ];
+   uint8_t buf[ 64 * 16 ];
    uint32_t t[2];
    uint32_t f[2];
    size_t buflen;
@@ -3,7 +3,7 @@
 #include <string.h>
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define BLAKE2S_16WAY
 #elif defined(__AVX2__)
   #define BLAKE2S_8WAY
@@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
    Va = v128_add64( Va, v128_add64( Vb, \
                     v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
                                 CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
+   Vd = v128_ror64xor( Vd, Va, 32 ); \
    Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
+   Vb = v128_ror64xor( Vb, Vc, 25 ); \
 \
    Va = v128_add64( Va, v128_add64( Vb, \
                     v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
                                 CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
+   Vd = v128_ror64xor( Vd, Va, 16 ); \
    Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
+   Vb = v128_ror64xor( Vb, Vc, 11 ); \
 }
 
 #define BLAKE512_ROUND( R ) \
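Note: v128_ror64xor fuses the XOR-then-rotate idiom replaced throughout these hunks. A definition equivalent to the old code (the real helper is in simd-utils, where AArch64 builds can exploit fused shift instructions):

// Equivalent to the replaced pattern, shown here for reference.
#define v128_ror64xor_sketch( a, b, c )  v128_ror64( v128_xor( (a), (b) ), (c) )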
@@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ////////////////////////////////////
 //
@@ -1887,13 +1887,13 @@ blake512_4x64_close(void *cc, void *dst)
 #define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
 { \
    a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
-   d = v128_ror64( v128_xor( d, a ), 32 ); \
+   d = v128_ror64xor( d, a, 32 ); \
    c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 25 ); \
+   b = v128_ror64xor( b, c, 25 ); \
    a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
-   d = v128_ror64( v128_xor( d, a ), 16 ); \
+   d = v128_ror64xor( d, a, 16 ); \
    c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 11 ); \
+   b = v128_ror64xor( b, c, 11 ); \
 }
 
 #define ROUND_B_2X64(r) \
@@ -2054,9 +2054,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
    // G4 skip nonce
    V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
                     V0 );
-   VF = v128_ror64( v128_xor( VF, V0 ), 32 );
+   VF = v128_ror64xor( VF, V0, 32 );
    VA = v128_add64( VA, VF );
-   V5 = v128_ror64( v128_xor( V5, VA ), 25 );
+   V5 = v128_ror64xor( V5, VA, 25 );
    V0 = v128_add64( V0, V5 );
 
    GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
@@ -2137,9 +2137,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
 
    // finish round 0, with the nonce now available
    V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
-   VF = v128_ror64( v128_xor( VF, V0 ), 16 );
+   VF = v128_ror64xor( VF, V0, 16 );
    VA = v128_add64( VA, VF );
-   V5 = v128_ror64( v128_xor( V5, VA ), 11 );
+   V5 = v128_ror64xor( V5, VA, 11 );
 
    // Round 1
   // G0
@@ -2147,34 +2147,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
 
    // G1
    V1 = v128_add64( V1, V5 );
-   VD = v128_ror64( v128_xor( VD, V1 ), 32 );
+   VD = v128_ror64xor( VD, V1, 32 );
    V9 = v128_add64( V9, VD );
-   V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
+   V5 = v128_ror64xor( V5, V9, 25 );
    V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
                     V5 ) );
-   VD = v128_ror64( v128_xor( VD, V1 ), 16 );
+   VD = v128_ror64xor( VD, V1, 16 );
    V9 = v128_add64( V9, VD );
-   V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
+   V5 = v128_ror64xor( V5, V9, 11 );
 
    // G2
    V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
-   VE = v128_ror64( v128_xor( VE, V2 ), 32 );
+   VE = v128_ror64xor( VE, V2, 32 );
    VA = v128_add64( VA, VE );
-   V6 = v128_ror64( v128_xor( V6, VA ), 25 );
+   V6 = v128_ror64xor( V6, VA, 25 );
    V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
-   VE = v128_ror64( v128_xor( VE, V2 ), 16 );
+   VE = v128_ror64xor( VE, V2, 16 );
    VA = v128_add64( VA, VE );
-   V6 = v128_ror64( v128_xor( V6, VA ), 11 );
+   V6 = v128_ror64xor( V6, VA, 11 );
 
    // G3
-   VF = v128_ror64( v128_xor( VF, V3 ), 32 );
+   VF = v128_ror64xor( VF, V3, 32 );
    VB = v128_add64( VB, VF );
-   V7 = v128_ror64( v128_xor( V7, VB ), 25 );
+   V7 = v128_ror64xor( V7, VB, 25 );
    V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
                     V7 ) );
-   VF = v128_ror64( v128_xor( VF, V3 ), 16 );
+   VF = v128_ror64xor( VF, V3, 16 );
    VB = v128_add64( VB, VF );
-   V7 = v128_ror64( v128_xor( V7, VB ), 11 );
+   V7 = v128_ror64xor( V7, VB, 11 );
 
    // G4, G5, G6, G7
    GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
@@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
 #define blake512_4way_prehash_le blake512_4x64_prehash_le
 #define blake512_4way_final_le   blake512_4x64_final_le
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ////////////////////////////
 //
@@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
    blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
 
    do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
      blakecoin_4way_hash( hash, vdata );
 
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define BLAKECOIN_16WAY
 #elif defined(__AVX2__)
   #define BLAKECOIN_8WAY
@@ -101,15 +101,15 @@
 { \
    Va = v128_add64( Va, v128_add64( Vb, \
                v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
+   Vd = v128_ror64xor( Vd, Va, 32 ); \
    Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
+   Vb = v128_ror64xor( Vb, Vc, 24 ); \
 \
    Va = v128_add64( Va, v128_add64( Vb, \
                v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
+   Vd = v128_ror64xor( Vd, Va, 16 ); \
    Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
+   Vb = v128_ror64xor( Vb, Vc, 63 ); \
 }
 
 #define BLAKE2B_ROUND( R ) \
@@ -131,47 +131,7 @@
    V[7] = v128_alignr64( V6, V7, 1 ); \
 }
 
-/*
-#elif defined(__SSE2__)
-// always true
-
-#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
-{ \
-   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
-               _mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
-   Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
-   Vc = _mm_add_epi64( Vc, Vd ); \
-   Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
-\
-   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
-               _mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
-   Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
-   Vc = _mm_add_epi64( Vc, Vd ); \
-   Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
-}
-
-#define BLAKE2B_ROUND( R ) \
-{ \
-   v128_t *V = (v128_t*)v; \
-   v128_t V2, V3, V6, V7; \
-   const uint8_t *sigmaR = sigma[R]; \
-   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
-   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
-   V2 = mm128_alignr_64( V[3], V[2], 1 ); \
-   V3 = mm128_alignr_64( V[2], V[3], 1 ); \
-   V6 = mm128_alignr_64( V[6], V[7], 1 ); \
-   V7 = mm128_alignr_64( V[7], V[6], 1 ); \
-   BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
-   BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
-   V[2] = mm128_alignr_64( V2, V3, 1 ); \
-   V[3] = mm128_alignr_64( V3, V2, 1 ); \
-   V[6] = mm128_alignr_64( V7, V6, 1 ); \
-   V[7] = mm128_alignr_64( V6, V7, 1 ); \
-}
-*/
-
 #else
-// never used, SSE2 is always available
 
 #ifndef ROTR64
 #define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
@@ -87,7 +87,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-256 16 way 32
 
@@ -157,7 +157,7 @@ void bmw512_4way_addbits_and_close(
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-512 64 bit 8 way
 typedef struct
@@ -62,78 +62,78 @@ static const uint32_t IV256[] = {
 */
 
 #define ss0(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
-                                 _mm_slli_epi32( (x), 3) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 4), \
-                                 mm128_rol_32( (x), 19) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 1), \
+                       v128_sl32( (x), 3) ), \
+             v128_xor( v128_rol32( (x), 4), \
+                       v128_rol32( (x), 19) ) )
 
 #define ss1(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
-                                 _mm_slli_epi32( (x), 2) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 8), \
-                                 mm128_rol_32( (x), 23) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 1), \
+                       v128_sl32( (x), 2) ), \
+             v128_xor( v128_rol32( (x), 8), \
+                       v128_rol32( (x), 23) ) )
 
 #define ss2(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
-                                 _mm_slli_epi32( (x), 1) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 12), \
-                                 mm128_rol_32( (x), 25) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 2), \
+                       v128_sl32( (x), 1) ), \
+             v128_xor( v128_rol32( (x), 12), \
+                       v128_rol32( (x), 25) ) )
 
 #define ss3(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
-                                 _mm_slli_epi32( (x), 2) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 15), \
-                                 mm128_rol_32( (x), 29) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 2), \
+                       v128_sl32( (x), 2) ), \
+             v128_xor( v128_rol32( (x), 15), \
+                       v128_rol32( (x), 29) ) )
 
 #define ss4(x) \
-   _mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
+   v128_xor( (x), v128_sr32( (x), 1 ) )
 
 #define ss5(x) \
-   _mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
+   v128_xor( (x), v128_sr32( (x), 2 ) )
 
-#define rs1(x) mm128_rol_32( x, 3 )
-#define rs2(x) mm128_rol_32( x, 7 )
-#define rs3(x) mm128_rol_32( x, 13 )
-#define rs4(x) mm128_rol_32( x, 16 )
-#define rs5(x) mm128_rol_32( x, 19 )
-#define rs6(x) mm128_rol_32( x, 23 )
-#define rs7(x) mm128_rol_32( x, 27 )
+#define rs1(x) v128_rol32( x, 3 )
+#define rs2(x) v128_rol32( x, 7 )
+#define rs3(x) v128_rol32( x, 13 )
+#define rs4(x) v128_rol32( x, 16 )
+#define rs5(x) v128_rol32( x, 19 )
+#define rs6(x) v128_rol32( x, 23 )
+#define rs7(x) v128_rol32( x, 27 )
 
 #define rol_off_32( M, j, off ) \
-   mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
+   v128_rol32( M[ ( (j) + (off) ) & 0xF ] , \
               ( ( (j) + (off) ) & 0xF ) + 1 )
 
 #define add_elt_s( M, H, j ) \
-   _mm_xor_si128( \
-      _mm_add_epi32( \
-         _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
+   v128_xor( \
+      v128_add32( \
+         v128_sub32( v128_add32( rol_off_32( M, j, 0 ), \
                                  rol_off_32( M, j, 3 ) ), \
                      rol_off_32( M, j, 10 ) ), \
-         _mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
+         v128_32( ( (j)+16 ) * 0x05555555UL ) ), \
       H[ ( (j)+7 ) & 0xF ] )
 
 
 #define expand1s( qt, M, H, i ) \
-   _mm_add_epi32( mm128_add4_32( \
-      mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
+   v128_add32( v128_add4_32( \
+      v128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
                     ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
-      mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
+      v128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
                     ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
-      mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
+      v128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
                     ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
-      mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
+      v128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
                     ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
      add_elt_s( M, H, (i)-16 ) )
 
 #define expand2s( qt, M, H, i) \
-   _mm_add_epi32( mm128_add4_32( \
-      mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
+   v128_add32( v128_add4_32( \
+      v128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
                     qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
-      mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
+      v128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
                     qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
-      mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
+      v128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
                     qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
-      mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
+      v128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
                     ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
      add_elt_s( M, H, (i)-16 ) )
 
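Note: the v128_* names abstract SSE2 and NEON behind one API. SSE2 has no native 32-bit vector rotate, so v128_rol32 presumably expands to the usual shift-or pair (assumed x86 shapes shown here; NEON builds map these names to native intrinsics instead, and the real definitions are in simd-utils):

#include <emmintrin.h>

// Assumed SSE2 fallbacks, for illustration only.
#define v128_sl32( v, c )  _mm_slli_epi32( v, c )
#define v128_sr32( v, c )  _mm_srli_epi32( v, c )
#define v128_rol32_sketch( v, c ) \
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32 - (c) ) )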
@@ -141,169 +141,169 @@ static const uint32_t IV256[] = {
|
|||||||
// resulting in some sign changes compared to the reference code.
|
// resulting in some sign changes compared to the reference code.
|
||||||
|
|
||||||
#define Ws0 \
|
#define Ws0 \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
|
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
|
||||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
v128_xor( M[ 7], H[ 7] ) ), \
|
||||||
_mm_xor_si128( M[10], H[10] ) ), \
|
v128_xor( M[10], H[10] ) ), \
|
||||||
_mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
|
v128_add32( v128_xor( M[13], H[13] ), \
|
||||||
_mm_xor_si128( M[14], H[14] ) ) )
|
v128_xor( M[14], H[14] ) ) )
|
||||||
|
|
||||||
#define Ws1 \
|
#define Ws1 \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
|
v128_sub32( v128_xor( M[ 6], H[ 6] ), \
|
||||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
v128_xor( M[ 8], H[ 8] ) ), \
|
||||||
_mm_xor_si128( M[11], H[11] ) ), \
|
v128_xor( M[11], H[11] ) ), \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
|
v128_sub32( v128_xor( M[14], H[14] ), \
|
||||||
_mm_xor_si128( M[15], H[15] ) ) )
|
v128_xor( M[15], H[15] ) ) )
|
||||||
|
|
||||||
#define Ws2 \
|
#define Ws2 \
|
||||||
_mm_sub_epi32( \
|
v128_sub32( \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
v128_add32( v128_xor( M[ 0], H[ 0] ), \
|
||||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
v128_xor( M[ 7], H[ 7] ) ), \
|
||||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
v128_xor( M[ 9], H[ 9] ) ), \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
v128_sub32( v128_xor( M[12], H[12] ), \
|
||||||
_mm_xor_si128( M[15], H[15] ) ) )
|
v128_xor( M[15], H[15] ) ) )
|
||||||
|
|
||||||
#define Ws3 \
|
#define Ws3 \
|
||||||
_mm_sub_epi32( \
|
v128_sub32( \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
|
||||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
v128_xor( M[ 1], H[ 1] ) ), \
|
||||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
v128_xor( M[ 8], H[ 8] ) ), \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
|
v128_sub32( v128_xor( M[10], H[10] ), \
|
||||||
_mm_xor_si128( M[13], H[13] ) ) )
|
v128_xor( M[13], H[13] ) ) )
|
||||||
|
|
||||||
#define Ws4 \
|
#define Ws4 \
|
||||||
_mm_sub_epi32( \
|
v128_sub32( \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
v128_add32( v128_xor( M[ 1], H[ 1] ), \
|
||||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
v128_xor( M[ 2], H[ 2] ) ), \
|
||||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
v128_xor( M[ 9], H[ 9] ) ), \
|
||||||
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
|
+      v128_add32( v128_xor( M[11], H[11] ), \
-      _mm_xor_si128( M[14], H[14] ) ) )
+      v128_xor( M[14], H[14] ) ) )
 
 #define Ws5 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_add_epi32( \
+      v128_add32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
+         v128_sub32( v128_xor( M[ 3], H[ 3] ), \
-                        _mm_xor_si128( M[ 2], H[ 2] ) ), \
+                     v128_xor( M[ 2], H[ 2] ) ), \
-         _mm_xor_si128( M[10], H[10] ) ), \
+         v128_xor( M[10], H[10] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
+      v128_sub32( v128_xor( M[12], H[12] ), \
-                     _mm_xor_si128( M[15], H[15] ) ) )
+                  v128_xor( M[15], H[15] ) ) )
 
 #define Ws6 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
+         v128_sub32( v128_xor( M[ 4], H[ 4] ), \
-                        _mm_xor_si128( M[ 0], H[ 0] ) ), \
+                     v128_xor( M[ 0], H[ 0] ) ), \
-         _mm_xor_si128( M[ 3], H[ 3] ) ), \
+         v128_xor( M[ 3], H[ 3] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
+      v128_sub32( v128_xor( M[11], H[11] ), \
-                     _mm_xor_si128( M[13], H[13] ) ) )
+                  v128_xor( M[13], H[13] ) ) )
 
 #define Ws7 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+         v128_sub32( v128_xor( M[ 1], H[ 1] ), \
-                        _mm_xor_si128( M[ 4], H[ 4] ) ), \
+                     v128_xor( M[ 4], H[ 4] ) ), \
-         _mm_xor_si128( M[ 5], H[ 5] ) ), \
+         v128_xor( M[ 5], H[ 5] ) ), \
-      _mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
+      v128_add32( v128_xor( M[12], H[12] ), \
-                     _mm_xor_si128( M[14], H[14] ) ) )
+                  v128_xor( M[14], H[14] ) ) )
 
 #define Ws8 \
-   _mm_add_epi32( \
+   v128_add32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
+         v128_sub32( v128_xor( M[ 2], H[ 2] ), \
-                        _mm_xor_si128( M[ 5], H[ 5] ) ), \
+                     v128_xor( M[ 5], H[ 5] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
+      v128_sub32( v128_xor( M[13], H[13] ), \
-                     _mm_xor_si128( M[15], H[15] ) ) )
+                  v128_xor( M[15], H[15] ) ) )
 #define Ws9 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_add_epi32( \
+      v128_add32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
+         v128_sub32( v128_xor( M[ 0], H[ 0] ), \
-                        _mm_xor_si128( M[ 3], H[ 3] ) ), \
+                     v128_xor( M[ 3], H[ 3] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
+      v128_sub32( v128_xor( M[ 7], H[ 7] ), \
-                     _mm_xor_si128( M[14], H[14] ) ) )
+                  v128_xor( M[14], H[14] ) ) )
 
 #define Ws10 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
+         v128_sub32( v128_xor( M[ 8], H[ 8] ), \
-                        _mm_xor_si128( M[ 1], H[ 1] ) ), \
+                     v128_xor( M[ 1], H[ 1] ) ), \
-         _mm_xor_si128( M[ 4], H[ 4] ) ), \
+         v128_xor( M[ 4], H[ 4] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
+      v128_sub32( v128_xor( M[ 7], H[ 7] ), \
-                     _mm_xor_si128( M[15], H[15] ) ) )
+                  v128_xor( M[15], H[15] ) ) )
 
 #define Ws11 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
+         v128_sub32( v128_xor( M[ 8], H[ 8] ), \
-                        _mm_xor_si128( M[ 0], H[ 0] ) ), \
+                     v128_xor( M[ 0], H[ 0] ) ), \
-         _mm_xor_si128( M[ 2], H[ 2] ) ), \
+         v128_xor( M[ 2], H[ 2] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
+      v128_sub32( v128_xor( M[ 5], H[ 5] ), \
-                     _mm_xor_si128( M[ 9], H[ 9] ) ) )
+                  v128_xor( M[ 9], H[ 9] ) ) )
 
 #define Ws12 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+         v128_add32( v128_xor( M[ 1], H[ 1] ), \
-                        _mm_xor_si128( M[ 3], H[ 3] ) ), \
+                     v128_xor( M[ 3], H[ 3] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
+      v128_sub32( v128_xor( M[ 9], H[ 9] ), \
-                     _mm_xor_si128( M[10], H[10] ) ) )
+                  v128_xor( M[10], H[10] ) ) )
 
 #define Ws13 \
-   _mm_add_epi32( \
+   v128_add32( \
-      _mm_add_epi32( \
+      v128_add32( \
-         _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
+         v128_add32( v128_xor( M[ 2], H[ 2] ), \
-                        _mm_xor_si128( M[ 4], H[ 4] ) ), \
+                     v128_xor( M[ 4], H[ 4] ) ), \
-         _mm_xor_si128( M[ 7], H[ 7] ) ), \
+         v128_xor( M[ 7], H[ 7] ) ), \
-      _mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
+      v128_add32( v128_xor( M[10], H[10] ), \
-                     _mm_xor_si128( M[11], H[11] ) ) )
+                  v128_xor( M[11], H[11] ) ) )
 
 #define Ws14 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_add_epi32( \
+      v128_add32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
+         v128_sub32( v128_xor( M[ 3], H[ 3] ), \
-                        _mm_xor_si128( M[ 5], H[ 5] ) ), \
+                     v128_xor( M[ 5], H[ 5] ) ), \
-         _mm_xor_si128( M[ 8], H[ 8] ) ), \
+         v128_xor( M[ 8], H[ 8] ) ), \
-      _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
+      v128_add32( v128_xor( M[11], H[11] ), \
-                     _mm_xor_si128( M[12], H[12] ) ) )
+                  v128_xor( M[12], H[12] ) ) )
 
 #define Ws15 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
+         v128_sub32( v128_xor( M[12], H[12] ), \
-                        _mm_xor_si128( M[ 4], H[4] ) ), \
+                     v128_xor( M[ 4], H[4] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
+      v128_sub32( v128_xor( M[ 9], H[ 9] ), \
-                     _mm_xor_si128( M[13], H[13] ) ) )
+                  v128_xor( M[13], H[13] ) ) )
 
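Note: the Ws0-Ws15 macros above are the BMW-256 message pre-expansion, ported from raw SSE2 intrinsics to the repo's portable v128 wrappers so the same source builds for x86 SSE2 and ARM NEON. The real definitions live in this repo's simd-utils.h; the sketch below only illustrates the kind of one-line mapping involved, and every name in it is an assumption for illustration, not the project's actual header.

/* Illustrative sketch only -- not the repo's simd-utils.h. It shows how a
   portable v128 layer can wrap the three ops the Ws macros use. */
#if defined(__SSE2__)
  #include <emmintrin.h>
  typedef __m128i v128_sketch_t;
  #define v128_add32_sketch( a, b )  _mm_add_epi32( a, b )   // 4x32-bit add
  #define v128_sub32_sketch( a, b )  _mm_sub_epi32( a, b )   // 4x32-bit subtract
  #define v128_xor_sketch( a, b )    _mm_xor_si128( a, b )   // bitwise xor
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  typedef uint32x4_t v128_sketch_t;
  #define v128_add32_sketch( a, b )  vaddq_u32( a, b )
  #define v128_sub32_sketch( a, b )  vsubq_u32( a, b )
  #define v128_xor_sketch( a, b )    veorq_u32( a, b )
#endif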
-void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
+void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
 {
-   __m128i qt[32], xl, xh; \
+   v128u64_t qt[32], xl, xh; \
 
-   qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
+   qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
-   qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
+   qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
-   qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
+   qt[ 2] = v128_add32( ss2( Ws2 ), H[ 3] );
-   qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
+   qt[ 3] = v128_add32( ss3( Ws3 ), H[ 4] );
-   qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
+   qt[ 4] = v128_add32( ss4( Ws4 ), H[ 5] );
-   qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
+   qt[ 5] = v128_add32( ss0( Ws5 ), H[ 6] );
-   qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
+   qt[ 6] = v128_add32( ss1( Ws6 ), H[ 7] );
-   qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
+   qt[ 7] = v128_add32( ss2( Ws7 ), H[ 8] );
-   qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
+   qt[ 8] = v128_add32( ss3( Ws8 ), H[ 9] );
-   qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
+   qt[ 9] = v128_add32( ss4( Ws9 ), H[10] );
-   qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
+   qt[10] = v128_add32( ss0( Ws10), H[11] );
-   qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
+   qt[11] = v128_add32( ss1( Ws11), H[12] );
-   qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
+   qt[12] = v128_add32( ss2( Ws12), H[13] );
-   qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
+   qt[13] = v128_add32( ss3( Ws13), H[14] );
-   qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
+   qt[14] = v128_add32( ss4( Ws14), H[15] );
-   qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
+   qt[15] = v128_add32( ss0( Ws15), H[ 0] );
    qt[16] = expand1s( qt, M, H, 16 );
    qt[17] = expand1s( qt, M, H, 17 );
    qt[18] = expand2s( qt, M, H, 18 );
@@ -321,92 +321,92 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
    qt[30] = expand2s( qt, M, H, 30 );
    qt[31] = expand2s( qt, M, H, 31 );
 
-   xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
+   xl = v128_xor( v128_xor4( qt[16], qt[17], qt[18], qt[19] ),
-                       mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
+                  v128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
-   xh = _mm_xor_si128( xl, _mm_xor_si128(
+   xh = v128_xor( xl, v128_xor(
-                       mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
+                  v128_xor4( qt[24], qt[25], qt[26], qt[27] ),
-                       mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+                  v128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
 
-   dH[ 0] = _mm_add_epi32(
+   dH[ 0] = v128_add32(
-                 _mm_xor_si128( M[0],
+                 v128_xor( M[0],
-                      _mm_xor_si128( _mm_slli_epi32( xh, 5 ),
+                      v128_xor( v128_sl32( xh, 5 ),
-                           _mm_srli_epi32( qt[16], 5 ) ) ),
+                           v128_sr32( qt[16], 5 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
+                 v128_xor( v128_xor( xl, qt[24] ), qt[ 0] ));
-   dH[ 1] = _mm_add_epi32(
+   dH[ 1] = v128_add32(
-                 _mm_xor_si128( M[1],
+                 v128_xor( M[1],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 7 ),
+                      v128_xor( v128_sr32( xh, 7 ),
-                           _mm_slli_epi32( qt[17], 8 ) ) ),
+                           v128_sl32( qt[17], 8 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
+                 v128_xor( v128_xor( xl, qt[25] ), qt[ 1] ));
-   dH[ 2] = _mm_add_epi32(
+   dH[ 2] = v128_add32(
-                 _mm_xor_si128( M[2],
+                 v128_xor( M[2],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 5 ),
+                      v128_xor( v128_sr32( xh, 5 ),
-                           _mm_slli_epi32( qt[18], 5 ) ) ),
+                           v128_sl32( qt[18], 5 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
+                 v128_xor( v128_xor( xl, qt[26] ), qt[ 2] ));
-   dH[ 3] = _mm_add_epi32(
+   dH[ 3] = v128_add32(
-                 _mm_xor_si128( M[3],
+                 v128_xor( M[3],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 1 ),
+                      v128_xor( v128_sr32( xh, 1 ),
-                           _mm_slli_epi32( qt[19], 5 ) ) ),
+                           v128_sl32( qt[19], 5 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
+                 v128_xor( v128_xor( xl, qt[27] ), qt[ 3] ));
-   dH[ 4] = _mm_add_epi32(
+   dH[ 4] = v128_add32(
-                 _mm_xor_si128( M[4],
+                 v128_xor( M[4],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 3 ),
+                      v128_xor( v128_sr32( xh, 3 ),
-                           _mm_slli_epi32( qt[20], 0 ) ) ),
+                           v128_sl32( qt[20], 0 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
+                 v128_xor( v128_xor( xl, qt[28] ), qt[ 4] ));
-   dH[ 5] = _mm_add_epi32(
+   dH[ 5] = v128_add32(
-                 _mm_xor_si128( M[5],
+                 v128_xor( M[5],
-                      _mm_xor_si128( _mm_slli_epi32( xh, 6 ),
+                      v128_xor( v128_sl32( xh, 6 ),
-                           _mm_srli_epi32( qt[21], 6 ) ) ),
+                           v128_sr32( qt[21], 6 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
+                 v128_xor( v128_xor( xl, qt[29] ), qt[ 5] ));
-   dH[ 6] = _mm_add_epi32(
+   dH[ 6] = v128_add32(
-                 _mm_xor_si128( M[6],
+                 v128_xor( M[6],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 4 ),
+                      v128_xor( v128_sr32( xh, 4 ),
-                           _mm_slli_epi32( qt[22], 6 ) ) ),
+                           v128_sl32( qt[22], 6 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
+                 v128_xor( v128_xor( xl, qt[30] ), qt[ 6] ));
-   dH[ 7] = _mm_add_epi32(
+   dH[ 7] = v128_add32(
-                 _mm_xor_si128( M[7],
+                 v128_xor( M[7],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 11 ),
+                      v128_xor( v128_sr32( xh, 11 ),
-                           _mm_slli_epi32( qt[23], 2 ) ) ),
+                           v128_sl32( qt[23], 2 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
+                 v128_xor( v128_xor( xl, qt[31] ), qt[ 7] ));
-   dH[ 8] = _mm_add_epi32( _mm_add_epi32(
+   dH[ 8] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[4], 9 ),
+                 v128_rol32( dH[4], 9 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
+                 v128_xor( v128_xor( xh, qt[24] ), M[ 8] )),
-                 _mm_xor_si128( _mm_slli_epi32( xl, 8 ),
+                 v128_xor( v128_sl32( xl, 8 ),
-                 _mm_xor_si128( qt[23], qt[ 8] ) ) );
+                 v128_xor( qt[23], qt[ 8] ) ) );
-   dH[ 9] = _mm_add_epi32( _mm_add_epi32(
+   dH[ 9] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[5], 10 ),
+                 v128_rol32( dH[5], 10 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
+                 v128_xor( v128_xor( xh, qt[25] ), M[ 9] )),
-                 _mm_xor_si128( _mm_srli_epi32( xl, 6 ),
+                 v128_xor( v128_sr32( xl, 6 ),
-                 _mm_xor_si128( qt[16], qt[ 9] ) ) );
+                 v128_xor( qt[16], qt[ 9] ) ) );
-   dH[10] = _mm_add_epi32( _mm_add_epi32(
+   dH[10] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[6], 11 ),
+                 v128_rol32( dH[6], 11 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
+                 v128_xor( v128_xor( xh, qt[26] ), M[10] )),
-                 _mm_xor_si128( _mm_slli_epi32( xl, 6 ),
+                 v128_xor( v128_sl32( xl, 6 ),
-                 _mm_xor_si128( qt[17], qt[10] ) ) );
+                 v128_xor( qt[17], qt[10] ) ) );
-   dH[11] = _mm_add_epi32( _mm_add_epi32(
+   dH[11] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[7], 12 ),
+                 v128_rol32( dH[7], 12 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
+                 v128_xor( v128_xor( xh, qt[27] ), M[11] )),
-                 _mm_xor_si128( _mm_slli_epi32( xl, 4 ),
+                 v128_xor( v128_sl32( xl, 4 ),
-                 _mm_xor_si128( qt[18], qt[11] ) ) );
+                 v128_xor( qt[18], qt[11] ) ) );
-   dH[12] = _mm_add_epi32( _mm_add_epi32(
+   dH[12] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[0], 13 ),
+                 v128_rol32( dH[0], 13 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
+                 v128_xor( v128_xor( xh, qt[28] ), M[12] )),
-                 _mm_xor_si128( _mm_srli_epi32( xl, 3 ),
+                 v128_xor( v128_sr32( xl, 3 ),
-                 _mm_xor_si128( qt[19], qt[12] ) ) );
+                 v128_xor( qt[19], qt[12] ) ) );
-   dH[13] = _mm_add_epi32( _mm_add_epi32(
+   dH[13] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[1], 14 ),
+                 v128_rol32( dH[1], 14 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
+                 v128_xor( v128_xor( xh, qt[29] ), M[13] )),
-                 _mm_xor_si128( _mm_srli_epi32( xl, 4 ),
+                 v128_xor( v128_sr32( xl, 4 ),
-                 _mm_xor_si128( qt[20], qt[13] ) ) );
+                 v128_xor( qt[20], qt[13] ) ) );
-   dH[14] = _mm_add_epi32( _mm_add_epi32(
+   dH[14] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[2], 15 ),
+                 v128_rol32( dH[2], 15 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
+                 v128_xor( v128_xor( xh, qt[30] ), M[14] )),
-                 _mm_xor_si128( _mm_srli_epi32( xl, 7 ),
+                 v128_xor( v128_sr32( xl, 7 ),
-                 _mm_xor_si128( qt[21], qt[14] ) ) );
+                 v128_xor( qt[21], qt[14] ) ) );
-   dH[15] = _mm_add_epi32( _mm_add_epi32(
+   dH[15] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[3], 16 ),
+                 v128_rol32( dH[3], 16 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
+                 v128_xor( v128_xor( xh, qt[31] ), M[15] )),
-                 _mm_xor_si128( _mm_srli_epi32( xl, 2 ),
+                 v128_xor( v128_sr32( xl, 2 ),
-                 _mm_xor_si128( qt[22], qt[15] ) ) );
+                 v128_xor( qt[22], qt[15] ) ) );
 }
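Note: the output feedback in dH[8]..dH[15] relies on 32-bit lane rotates (mm128_rol_32 before, v128_rol32 after). Neither SSE2 nor NEON has a vector rotate instruction, so a portable layer typically synthesises one from two shifts and an OR. A minimal NEON sketch under that assumption (names are illustrative, not the repo's):

#if defined(__ARM_NEON)
#include <arm_neon.h>
/* Sketch of a NEON 32-bit lane rotate; c must be a compile-time constant
   in 1..31 because vshlq_n/vshrq_n require immediate shift counts. */
#define v128_rol32_sketch( x, c ) \
    vorrq_u32( vshlq_n_u32( (x), (c) ), vshrq_n_u32( (x), 32 - (c) ) )
#endif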
 
 static const uint32_t final_s[16][4] =
@@ -429,7 +429,7 @@ static const uint32_t final_s[16][4] =
    { 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
 };
 /*
-static const __m128i final_s[16] =
+static const v128u64_t final_s[16] =
 {
    { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
    { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
@@ -451,26 +451,26 @@ static const __m128i final_s[16] =
 */
 void bmw256_4way_init( bmw256_4way_context *ctx )
 {
-   ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
+   ctx->H[ 0] = v128_64( 0x4041424340414243 );
-   ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
+   ctx->H[ 1] = v128_64( 0x4445464744454647 );
-   ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
+   ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
-   ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
+   ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
-   ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
+   ctx->H[ 4] = v128_64( 0x5051525350515253 );
-   ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
+   ctx->H[ 5] = v128_64( 0x5455565754555657 );
-   ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
+   ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
-   ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
+   ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
-   ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
+   ctx->H[ 8] = v128_64( 0x6061626360616263 );
-   ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
+   ctx->H[ 9] = v128_64( 0x6465666764656667 );
-   ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
+   ctx->H[10] = v128_64( 0x68696A6B68696A6B );
-   ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
+   ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
-   ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
+   ctx->H[12] = v128_64( 0x7071727370717273 );
-   ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
+   ctx->H[13] = v128_64( 0x7475767774757677 );
-   ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
+   ctx->H[14] = v128_64( 0x78797A7B78797A7B );
-   ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
+   ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
 
 
    // for ( int i = 0; i < 16; i++ )
-   //    sc->H[i] = _mm_set1_epi32( iv[i] );
+   //    sc->H[i] = v128_32( iv[i] );
    ctx->ptr = 0;
    ctx->bit_count = 0;
 }
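Note: v128_64 broadcasts one 64-bit constant (two packed 32-bit IV words) across the vector, so all four lanes start from the same IV. A plausible pair of definitions, assumed for illustration (the real one is in simd-utils.h):

#if defined(__SSE2__)
  #include <emmintrin.h>
  #define v128_64_sketch( x )  _mm_set1_epi64x( x )   // both 64-bit halves = x
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  #define v128_64_sketch( x )  vreinterpretq_u32_u64( vdupq_n_u64( x ) )
#endif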
@@ -478,10 +478,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
 static void
 bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
 {
-   __m128i *vdata = (__m128i*)data;
+   v128u64_t *vdata = (v128u64_t*)data;
-   __m128i *buf;
+   v128u64_t *buf;
-   __m128i htmp[16];
+   v128u64_t htmp[16];
-   __m128i *h1, *h2;
+   v128u64_t *h1, *h2;
    size_t ptr;
    const int buf_size = 64;  // bytes of one lane, compatible with len
 
@@ -497,13 +497,13 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
       clen = buf_size - ptr;
       if ( clen > len )
          clen = len;
-      memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
+      v128_memcpy( buf + (ptr>>2), vdata, clen >> 2 );
       vdata += ( clen >> 2 );
       len -= clen;
       ptr += clen;
       if ( ptr == buf_size )
       {
-         __m128i *ht;
+         v128u64_t *ht;
         compress_small( buf, h1, h2 );
         ht = h1;
         h1 = h2;
@@ -513,46 +513,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
      }
    sc->ptr = ptr;
 
 
    if ( h1 != sc->H )
-        memcpy_128( sc->H, h1, 16 );
+        v128_memcpy( sc->H, h1, 16 );
 }
 
 static void
 bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
                  void *dst, size_t out_size_w32)
 {
-   __m128i *buf;
+   v128u64_t *buf;
-   __m128i h1[16], h2[16], *h;
+   v128u64_t h1[16], h2[16], *h;
    size_t ptr, u, v;
    const int buf_size = 64;  // bytes of one lane, compatible with len
 
    buf = sc->buf;
    ptr = sc->ptr;
-   buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
+   buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
    ptr += 4;
    h = sc->H;
 
    // assume bit_count fits in 32 bits
    if ( ptr > buf_size - 4 )
    {
-      memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
+      v128_memset_zero( buf + (ptr>>2), (buf_size - ptr) >> 2 );
       compress_small( buf, h, h1 );
       ptr = 0;
       h = h1;
    }
-   memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
+   v128_memset_zero( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
-   buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
+   buf[ (buf_size - 8) >> 2 ] = v128_32( sc->bit_count + n );
-   buf[ (buf_size - 4) >> 2 ] = m128_zero;
+   buf[ (buf_size - 4) >> 2 ] = v128_zero;
    compress_small( buf, h, h2 );
 
    for ( u = 0; u < 16; u ++ )
       buf[u] = h2[u];
 
-   compress_small( buf, (__m128i*)final_s, h1 );
+   compress_small( buf, (v128u64_t*)final_s, h1 );
 
    for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
-      casti_m128i( dst, u ) = h1[v];
+      casti_v128( dst, u ) = h1[v];
 }
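Note: the close routine pads each 64-byte lane the usual Merkle-Damgard way: a 0x80 marker byte, zero fill, then the message length in bits in the last eight bytes, followed by one extra compression against the 0xaaaaaaa0.. final_s constant block. A scalar model of one lane's padding, for illustration only (bmw_pad_lane is not a repo function; assumes ptr <= 55):

#include <stdint.h>
#include <string.h>

static void bmw_pad_lane( uint8_t buf[64], size_t ptr, uint64_t bit_count )
{
    buf[ptr++] = 0x80;                   // padding marker
    memset( buf + ptr, 0, 56 - ptr );    // zero fill up to the length field
    memcpy( buf + 56, &bit_count, 8 );   // bit length, little-endian
}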
 
 /*
@@ -1058,7 +1057,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-256 16 way 32
 
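Note: SIMD512 consolidates the four-flag AVX-512 feature test that used to be repeated at every guard; the change is purely cosmetic. Its definition is presumably a one-time version of the same test, along these lines (the exact location in simd-utils.h is an assumption):

/* Sketch of the consolidated feature test. */
#if defined(__AVX512F__) && defined(__AVX512VL__) && \
    defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SIMD512 1
#endif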
@@ -2,12 +2,11 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
-//#include "sph_keccak.h"
 #include "bmw-hash-4way.h"
 
 #if defined(BMW512_8WAY)
 
-void bmw512hash_8way(void *state, const void *input)
+void bmw512hash_8way( void *state, const void *input )
 {
    bmw512_8way_context ctx;
    bmw512_8way_init( &ctx );
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    const uint32_t last_nonce = max_nonce - 8;
-   __m512i *noncev = (__m512i*)vdata + 9;   // aligned
+   __m512i *noncev = (__m512i*)vdata + 9;
    const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;
+   const int thr_id = mythr->id;
 
    mm512_bswap32_intrlv80_8x64( vdata, pdata );
    do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
    if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
    {
       extr_lane_8x64( lane_hash, hash, lane, 256 );
-      if ( fulltest( lane_hash, ptarget ) )
+      if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
      {
         pdata[19] = n + lane;
         submit_solution( work, lane_hash, mythr );
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
 
 #elif defined(BMW512_4WAY)
 
-//#ifdef BMW512_4WAY
+void bmw512hash_4way( void *state, const void *input )
 
-void bmw512hash_4way(void *state, const void *input)
 {
    bmw512_4way_context ctx;
    bmw512_4way_init( &ctx );
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    const uint32_t last_nonce = max_nonce - 4;
-   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
+   __m256i *noncev = (__m256i*)vdata + 9;
    const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;
 
    mm256_bswap32_intrlv80_4x64( vdata, pdata );
    do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
    if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
    {
       extr_lane_4x64( lane_hash, hash, lane, 256 );
-      if ( fulltest( lane_hash, ptarget ) )
+      if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
      {
        pdata[19] = n + lane;
        submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
    return 0;
 }
 
+#elif defined(BMW512_2WAY)
+
+void bmw512hash_2x64( void *state, const void *input )
+{
+   bmw512_2x64_context ctx;
+   bmw512_2x64_init( &ctx );
+   bmw512_2x64_update( &ctx, input, 80 );
+   bmw512_2x64_close( &ctx, state );
+}
+
+int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
+   uint32_t hash[16*2] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[13]);   // 3*4+1
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 2;
+   v128_t *noncev = (v128_t*)vdata + 9;
+   const uint32_t Htarg = ptarget[7];
+   const int thr_id = mythr->id;
+
+   v128_bswap32_intrlv80_2x64( vdata, pdata );
+   do {
+      *noncev = v128_intrlv_blend_32( v128_bswap32(
+                                v128_set32( n+1, 0, n, 0 ) ), *noncev );
+
+      bmw512hash_2x64( hash, vdata );
+
+      for ( int lane = 0; lane < 2; lane++ )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
+      {
+         extr_lane_2x64( lane_hash, hash, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
+         {
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      n += 2;
+
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
 #endif
 
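Note: in the new 2x64 path the two lanes' 80-byte block headers are interleaved 64 bits at a time, so vector row 9 holds header words 18 and 19 of each lane, and word 19 is the nonce. The v128_set32 / v128_bswap32 / v128_intrlv_blend_32 sequence above therefore rewrites only the two nonce words in place each iteration. A plausible SSE4.1 shape for the blend, assumed for illustration (the repo's own definition may differ):

#if defined(__SSE4_1__)
#include <smmintrin.h>
/* Take the odd 32-bit words from a and the even words from b: imm 0x33
   selects 16-bit lanes 0,1 and 4,5 (i.e. 32-bit words 0 and 2) from b. */
#define v128_intrlv_blend_32_sketch( a, b )  _mm_blend_epi16( a, b, 0x33 )
#endif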
@@ -2,7 +2,7 @@
 
 bool register_bmw512_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
   opt_target_factor = 256.0;
 #if defined (BMW512_8WAY)
   gate->scanhash = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
 #elif defined (BMW512_4WAY)
   gate->scanhash = (void*)&scanhash_bmw512_4way;
   gate->hash = (void*)&bmw512hash_4way;
+#elif defined (BMW512_2WAY)
+  gate->scanhash = (void*)&scanhash_bmw512_2x64;
+  gate->hash = (void*)&bmw512hash_2x64;
 #else
   gate->scanhash = (void*)&scanhash_bmw512;
   gate->hash = (void*)&bmw512hash;
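Note: register_bmw512_algo fills in an algo_gate_t, the per-algorithm dispatch table the miner core calls through; the compile-time WAY macros decide which scanhash/hash pair gets wired in. A trimmed model of the relevant fields (a sketch only; the full struct lives in algo-gate-api.h and has many more members):

#include <stdint.h>

struct work;       // defined elsewhere in the miner core
struct thr_info;

typedef struct
{
   int   (*scanhash)( struct work*, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info* );
   void  (*hash)( void *output, const void *input );
   uint32_t optimizations;   // e.g. SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT
} algo_gate_sketch_t;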
@@ -4,23 +4,31 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define BMW512_8WAY 1
 #elif defined(__AVX2__)
   #define BMW512_4WAY 1
+#elif defined(__SSE2__) || defined(__ARM_NEON)
+  #define BMW512_2WAY 1
 #endif
 
 #if defined(BMW512_8WAY)
 
 void bmw512hash_8way( void *state, const void *input );
 int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
 
 #elif defined(BMW512_4WAY)
 
 void bmw512hash_4way( void *state, const void *input );
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
 
+#elif defined(BMW512_2WAY)
+
+void bmw512hash_2x64( void *state, const void *input );
+int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+
 #else
 
@@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-512 8 WAY
 
@@ -26,7 +26,7 @@ static const uint64_t IV512[] =
    0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
 };
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // 4 way 128 is handy to avoid reinterleaving in many algos.
 // If reinterleaving is necessary it may be more efficient to use
@@ -6,7 +6,7 @@
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 struct _cube_4way_context
 {
@@ -13,7 +13,7 @@ static void transform( cubehashParam *sp )
    int r;
    const int rounds = sp->rounds;
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
    register __m512i x0, x1;
 
@@ -236,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
 }
 
+HashReturn init_echo( hashState_echo *ctx, int nHashSize )
 
-HashReturn init_echo(hashState_echo *ctx, int nHashSize)
 {
    int i, j;
 
@@ -280,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
    return SUCCESS;
 }
 
-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
+HashReturn update_echo( hashState_echo *state, const void *data,
+                        uint32_t databitlen )
 {
    unsigned int uByteLength, uBlockCount, uRemainingBytes;
 
@@ -330,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
    return SUCCESS;
 }
 
-HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
+HashReturn final_echo( hashState_echo *state, void *hashval)
 {
    v128_t remainingbits;
 
@@ -407,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
    return SUCCESS;
 }
 
-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
+HashReturn update_final_echo( hashState_echo *state, void *hashval,
-                              const BitSequence *data, DataLength databitlen )
+                              const void *data, uint32_t databitlen )
 {
    unsigned int uByteLength, uBlockCount, uRemainingBytes;
 
@@ -530,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
    return SUCCESS;
 }
 
-HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
+HashReturn echo_full( hashState_echo *state, void *hashval,
-                      int nHashSize, const BitSequence *data, DataLength datalen )
+                      int nHashSize, const void *data, uint32_t datalen )
 {
    int i, j;
 
@@ -578,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
    {
       // Fill the buffer
       memcpy( state->buffer + state->uBufferBytes,
-              (void*)data, state->uBlockLength - state->uBufferBytes );
+              data, state->uBlockLength - state->uBufferBytes );
 
       // Process buffer
       Compress( state, state->buffer, 1 );
@@ -601,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
    }
 
    if( uRemainingBytes > 0 )
-      memcpy(state->buffer, (void*)data, uRemainingBytes);
+      memcpy(state->buffer, data, uRemainingBytes);
 
    state->uBufferBytes = uRemainingBytes;
 }
@@ -689,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
 }
 
 
+#if 0
 HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
 {
    HashReturn hRet;
@@ -746,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
 
    return SUCCESS;
 }
+#endif
 
 #endif
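Note: the echo changes above swap the legacy NIST SHA-3 competition API types for plain void pointers and fixed-width integers, which keeps the vendor typedefs out of every caller. Those legacy types are conventionally defined as below (for context; the exact vendor header may differ):

/* Conventional definitions from the NIST SHA-3 reference API. */
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;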
@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);
 
 HashReturn reinit_echo(hashState_echo *state);
 
-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
+HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);
 
-HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
+HashReturn final_echo(hashState_echo *state, void *hashval);
 
-HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
+HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);
 
-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
+HashReturn update_final_echo( hashState_echo *state, void *hashval,
-                              const BitSequence *data, DataLength databitlen );
+                              const void *data, uint32_t databitlen );
-HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
+HashReturn echo_full( hashState_echo *state, void *hashval,
-                      int nHashSize, const BitSequence *data, DataLength databitlen );
+                      int nHashSize, const void *data, uint32_t databitlen );
 
 #endif // HASH_API_H
 
@@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
 };
 */
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define ECHO_SUBBYTES4(state, j) \
    state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
@@ -5,7 +5,7 @@
 
 #include "simd-utils.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct
 {
@@ -36,7 +36,6 @@
 
 #include "sph_echo.h"
 
-#if !defined(__AES__)
 
 #ifdef __cplusplus
 extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #ifdef __cplusplus
 }
 #endif
-#endif // !AES
@@ -36,8 +36,6 @@
 #ifndef SPH_ECHO_H__
 #define SPH_ECHO_H__
 
-#if !defined(__AES__)
 
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
 #ifdef __cplusplus
 }
 #endif
-#endif // !AES
 #endif
@@ -15,237 +15,176 @@
  *
  */
 
-#if defined(__AES__)
+#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
 
-#include <x86intrin.h>
 
 #include <memory.h>
 #include "fugue-aesni.h"
 
-MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
-MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
-MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
-MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
-MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
-MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
-MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
-MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
-MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
-MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
-//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
-//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
-//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
-//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
-//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
-MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
-MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
+static const v128u64_t _supermix1a __attribute__ ((aligned (16))) =
+   { 0x0202010807020100, 0x0a05000f06010c0b };
+
+static const v128u64_t _supermix1b __attribute__ ((aligned (16))) =
+   { 0x0b0d080703060504, 0x0e0a090c050e0f0a };
+
+static const v128u64_t _supermix1c __attribute__ ((aligned (16))) =
+   { 0x0402060c070d0003, 0x090a060580808080 };
+
+static const v128u64_t _supermix1d __attribute__ ((aligned (16))) =
+   { 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
+
+static const v128u64_t _supermix2a __attribute__ ((aligned (16))) =
+   { 0x07020d0880808080, 0x0b06010c050e0f0a };
+
+static const v128u64_t _supermix4a __attribute__ ((aligned (16))) =
+   { 0x000f0a050c0b0601, 0x0302020404030e09 };
+
+static const v128u64_t _supermix4b __attribute__ ((aligned (16))) =
+   { 0x07020d08080e0d0d, 0x07070908050e0f0a };
+
+static const v128u64_t _supermix4c __attribute__ ((aligned (16))) =
+   { 0x0706050403020000, 0x0302000007060504 };
+
+static const v128u64_t _supermix7a __attribute__ ((aligned (16))) =
+   { 0x010c0b060d080702, 0x0904030e03000104 };
+
+static const v128u64_t _supermix7b __attribute__ ((aligned (16))) =
+   { 0x8080808080808080, 0x0504070605040f06 };
+
+static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
+   { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
+
+static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
+   { 0x000000001b1b0000, 0x0000000000000000 };
+
+static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
+   { 0x000000002d361b00, 0x0000000000000000 };
+
+static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
+   { 0x0303030303030303, 0x0303030303030303 };
 
-MYALIGN const unsigned int _IV512[] = {
-   0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
+static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
+   { 0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
    0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
    0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
    0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
    0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
-   0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
+   0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
+};
 
-#if defined(__SSE4_1__)
+#if defined(__ARM_NEON)
 
-#define PACK_S0(s0, s1, t1)\
-   s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
+#define mask_1000(v) v128_put32( v, 0, 3 )
 
-#define UNPACK_S0(s0, s1, t1)\
-   s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
-   s0 = mm128_mask_32( s0, 8 )
+static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
+   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };
 
-#define CMIX(s1, s2, r1, r2, t1, t2)\
-   t1 = s1;\
-   t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
-   r1 = _mm_xor_si128(r1, t1);\
-   r2 = _mm_xor_si128(r2, t1);
+static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
+   { 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };
 
-#else // SSE2
+static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
+   { 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };
 
-#define PACK_S0(s0, s1, t1)\
-   t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
-   s0 = _mm_xor_si128(s0, t1);
+static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
+   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };
 
-#define UNPACK_S0(s0, s1, t1)\
-   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
-   s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
-   s0 = mm128_mask_32( s0, 8 )
+#define shuffle_3303(v) vqtbl1q_u8( v, MASK_3303 )
+#define shuffle_0321(v) vqtbl1q_u8( v, MASK_0321 )
 
-#define CMIX(s1, s2, r1, r2, t1, t2)\
-   t1 = _mm_shuffle_epi32(s1, 0xf9);\
-   t2 = _mm_shuffle_epi32(s2, 0xcf);\
-   t1 = _mm_xor_si128(t1, t2);\
-   r1 = _mm_xor_si128(r1, t1);\
-   r2 = _mm_xor_si128(r2, t1)
+#define CMIX( s1, s2, r1, r2, t1, t2 ) \
+   t1 = vqtbl1q_u8( s1, MASK_3321 ); \
+   t2 = vqtbl1q_u8( s2, MASK_3033 ); \
+   t1 = v128_xor( t1, t2 ); \
+   r1 = v128_xor( r1, t1 ); \
+   r2 = v128_xor( r2, t1 );
 
+#elif defined(__SSE4_1__)
+
+#define mask_1000(v) v128_mask32( v, 8 )
+
+#define shuffle_3303(v) _mm_shuffle_epi32( v, 0xf3 )
+#define shuffle_0321(v) _mm_shuffle_epi32( v, 0x39 )
+
+#define CMIX( s1, s2, r1, r2, t1, t2 ) \
+   t1 = s1; \
+   t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
+   r1 = v128_xor( r1, t1 ); \
+   r2 = v128_xor( r2, t1 );
 
 #endif
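Note: the NEON branch replaces _mm_shuffle_epi32 word selects with vqtbl1q_u8 byte-table lookups: each selected 32-bit word becomes four consecutive byte indices in the mask, which is why MASK_3321 corresponds to the old 0xf9 (words 1,2,3,3) select in CMIX. A standalone sketch of that equivalence (AArch64 only; the function name is illustrative):

#if defined(__ARM_NEON)
#include <arm_neon.h>

/* Equivalent of _mm_shuffle_epi32( v, 0xf9 ): pick 32-bit words 1,2,3,3. */
static inline uint8x16_t shuffle_words_1233_sketch( uint8x16_t v )
{
    static const uint8_t idx[16] =
        { 4,5,6,7, 8,9,10,11, 12,13,14,15, 12,13,14,15 };
    return vqtbl1q_u8( v, vld1q_u8( idx ) );
}
#endif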
 
-#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
-   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-   s10 = _mm_xor_si128(s10, t1);\
-   t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-   s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-   t1 = _mm_slli_si128(t1, 8);\
-   s8 = _mm_xor_si128(s8, t1);\
-   t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
-   s0 = _mm_xor_si128(s0, t1)
+#define PACK_S0( s0, s1, t1 ) \
+   s0 = v128_movlane32( s0, 3, s1, 0 )
 
-#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
-   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-   s16 = _mm_xor_si128(s16, t1);\
-   t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-   s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-   t1 = _mm_slli_si128(t1, 8);\
-   s8 = _mm_xor_si128(s8, t1);\
-   t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
-   s0 = _mm_xor_si128(s0, t1);\
-   t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
-   s4 = _mm_xor_si128(s4, t1)
+#define UNPACK_S0( s0, s1, t1 ) \
+   s1 = v128_movlane32( s1, 0, s0, 3 ); \
+   s0 = mask_1000( s0 )
 
 #define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
-   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
+   t1 = shuffle_3303( s0 ); \
-   s22 = _mm_xor_si128(s22, t1);\
+   s22 = v128_xor(s22, t1);\
-   t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
+   t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
-   s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
+   s0 = v128_movlane32( s0, 0, t1, 0 ); \
-   t1 = _mm_slli_si128(t1, 8);\
+   t1 = v128_alignr64( t1, v128_zero, 1 ); \
-   s8 = _mm_xor_si128(s8, t1);\
+   s8 = v128_xor(s8, t1);\
-   t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
+   t1 = shuffle_3303( s24 ); \
-   s0 = _mm_xor_si128(s0, t1);\
+   s0 = v128_xor(s0, t1);\
-   t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
+   t1 = shuffle_3303( s27 ); \
-   s4 = _mm_xor_si128(s4, t1);\
+   s4 = v128_xor(s4, t1);\
-   t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
+   t1 = shuffle_3303( s30 ); \
-   s7 = _mm_xor_si128(s7, t1)
+   s7 = v128_xor(s7, t1)
 
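Note: the new PACK_S0/UNPACK_S0 are built on v128_movlane32, a copy of one 32-bit lane between vectors, which both ISAs can do in a single instruction (insert_ps on SSE4.1, a lane copy on NEON). A sketch of a plausible NEON definition (macro name and argument order are assumptions for illustration):

#if defined(__ARM_NEON)
#include <arm_neon.h>
/* Copy lane sl of src into lane dl of dst; other dst lanes unchanged. */
#define v128_movlane32_sketch( dst, dl, src, sl ) \
    vcopyq_laneq_u32( dst, dl, src, sl )
#endif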
-#define PRESUPERMIX(t0, t1, t2, t3, t4)\
-   t2 = t0;\
-   t3 = _mm_add_epi8(t0, t0);\
-   t4 = _mm_add_epi8(t3, t3);\
-   t1 = _mm_srli_epi16(t0, 6);\
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
+#define SUBSTITUTE( r0, _t2 ) \
+   _t2 = v128_shuffle8( r0, _inv_shift_rows ); \
+   _t2 = v128_aesenclast_nokey( _t2 )
 
-/*
-#define PRESUPERMIX(x, t1, s1, s2, t2)\
-   s1 = x;\
-   s2 = _mm_add_epi8(x, x);\
-   t2 = _mm_add_epi8(s2, s2);\
-   t1 = _mm_srli_epi16(x, 6);\
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-   s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-   x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
-*/
 
-#define SUBSTITUTE(r0, _t2 )\
-   _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
-   _t2 = _mm_aesenclast_si128( _t2, m128_zero )
 
 #define SUPERMIX(t0, t1, t2, t3, t4)\
    t2 = t0;\
-   t3 = _mm_add_epi8(t0, t0);\
+   t3 = v128_add8( t0, t0 ); \
-   t4 = _mm_add_epi8(t3, t3);\
+   t4 = v128_add8( t3, t3 ); \
-   t1 = _mm_srli_epi16(t0, 6);\
+   t1 = v128_sr16( t0, 6 ); \
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
+   t1 = v128_and( t1, _lsbmask2 ); \
-   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
+   t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
-   t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
+   t4 = v128_shuffle8( t2, _supermix1b ); \
-   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
+   t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
+   t1 = v128_shuffle8( t4, _supermix1c ); \
-   t4 = _mm_xor_si128(t4, t1);\
+   t4 = v128_xor( t4, t1 ); \
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
+   t1 = v128_shuffle8( t4, _supermix1d ); \
-   t4 = _mm_xor_si128(t4, t1);\
+   t4 = v128_xor( t4, t1 ); \
-   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
+   t1 = v128_shuffle8( t2, _supermix1a ); \
-   t2 = mm128_xor3(t2, t3, t0 );\
+   t2 = v128_xor3( t2, t3, t0 ); \
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
+   t2 = v128_shuffle8( t2, _supermix7a ); \
-   t4 = mm128_xor3( t4, t1, t2 ); \
+   t4 = v128_xor3( t4, t1, t2 ); \
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
+   t2 = v128_shuffle8( t2, _supermix7b ); \
-   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
+   t3 = v128_shuffle8( t3, _supermix2a ); \
-   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
+   t1 = v128_shuffle8( t0, _supermix4a ); \
-   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
+   t0 = v128_shuffle8( t0, _supermix4b ); \
-   t4 = mm128_xor3( t4, t2, t1 ); \
+   t4 = v128_xor3( t4, t2, t1 ); \
-   t0 = _mm_xor_si128(t0, t3);\
+   t0 = v128_xor( t0, t3 ); \
-   t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
+   t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );
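Note: SUBSTITUTE first permutes with _inv_shift_rows and then runs a keyless final AES round. Since AESENCLAST performs ShiftRows then SubBytes then AddRoundKey, pre-applying the inverse ShiftRows permutation cancels the shift and a zero key cancels the key add, leaving a pure SubBytes over all 16 bytes. Plausible mappings for v128_aesenclast_nokey, assumed for illustration (the repo's real definition is in simd-utils):

#if defined(__AES__)
  #include <wmmintrin.h>
  #define v128_aesenclast_nokey_sketch( x ) \
      _mm_aesenclast_si128( x, _mm_setzero_si128() )
#elif defined(__ARM_FEATURE_AES)
  #include <arm_neon.h>
  /* AESE xors the key first, so a zero key gives SubBytes(ShiftRows(x)). */
  #define v128_aesenclast_nokey_sketch( x )  vaeseq_u8( x, vdupq_n_u8( 0 ) )
#endif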
 
-/*
-#define SUPERMIX(t0, t1, t2, t3, t4)\
-   PRESUPERMIX(t0, t1, t2, t3, t4);\
-   POSTSUPERMIX(t0, t1, t2, t3, t4)
-*/
 
-#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
-   t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
-   t4 = t1;\
-   t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t2 = mm128_xor3(t2, t3, t0 );\
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
-   t4 = _mm_xor_si128(t4, t2);\
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
-   t4 = _mm_xor_si128(t4, t2);\
-   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
-   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
-   t0 = _mm_xor_si128(t0, t3);\
-   t4 = _mm_xor_si128(t4, t0);\
-   t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
-   t4 = _mm_xor_si128(t4, t0)
 
-#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
-   CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
-   PACK_S0(r1c, r1a, _t0);\
-   SUBSTITUTE(r1c, _t2 );\
-   SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
-   _t0 = _mm_shuffle_epi32(r1c, 0x39);\
-   r2c = _mm_xor_si128(r2c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-   r2d = _mm_xor_si128(r2d, _t0);\
-   UNPACK_S0(r1c, r1a, _t3);\
-   SUBSTITUTE(r2c, _t2 );\
-   SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
-   _t0 = _mm_shuffle_epi32(r2c, 0x39);\
-   r3c = _mm_xor_si128(r3c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-   r3d = _mm_xor_si128(r3d, _t0);\
-   UNPACK_S0(r2c, r2a, _t3);\
-   SUBSTITUTE(r3c, _t2 );\
-   SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
-   UNPACK_S0(r3c, r3a, _t3)
 
 #define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
    CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
    PACK_S0(r1c, r1a, _t0);\
    SUBSTITUTE( r1c, _t2 );\
    SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
-   _t0 = _mm_shuffle_epi32(r1c, 0x39);\
+   _t0 = shuffle_0321( r1c ); \
-   r2c = _mm_xor_si128(r2c, _t0);\
+   r2c = v128_xor(r2c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
+   _t0 = mask_1000( _t0 ); \
-   r2d = _mm_xor_si128(r2d, _t0);\
+   r2d = v128_xor(r2d, _t0);\
    UNPACK_S0(r1c, r1a, _t3);\
    SUBSTITUTE(r2c, _t2 );\
    SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
-   _t0 = _mm_shuffle_epi32(r2c, 0x39);\
+   _t0 = shuffle_0321( r2c ); \
-   r3c = _mm_xor_si128(r3c, _t0);\
+   r3c = v128_xor(r3c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
+   _t0 = mask_1000( _t0 ); \
-   r3d = _mm_xor_si128(r3d, _t0);\
+   r3d = v128_xor(r3d, _t0);\
    UNPACK_S0(r2c, r2a, _t3);\
    SUBSTITUTE( r3c, _t2 );\
    SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
-   _t0 = _mm_shuffle_epi32(r3c, 0x39);\
+   _t0 = shuffle_0321( r3c ); \
-   r4c = _mm_xor_si128(r4c, _t0);\
+   r4c = v128_xor(r4c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
+   _t0 = mask_1000( _t0 ); \
-   r4d = _mm_xor_si128(r4d, _t0);\
+   r4d = v128_xor(r4d, _t0);\
    UNPACK_S0(r3c, r3a, _t3);\
    SUBSTITUTE( r4c, _t2 );\
    SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
    block[1] = col[(base + a + 1) % s];\
    block[2] = col[(base + a + 2) % s];\
    block[3] = col[(base + a + 3) % s];\
-   x = _mm_load_si128((__m128i*)block)
+   x = v128_load( (v128_t*)block )
 
 #define STORECOLUMN(x, s)\
-   _mm_store_si128((__m128i*)block, x);\
+   v128_store((v128_t*)block, x );\
    col[(base + 0) % s] = block[0];\
    col[(base + 1) % s] = block[1];\
    col[(base + 2) % s] = block[2];\
    col[(base + 3) % s] = block[3]
 
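Note: LOADCOLUMN/STORECOLUMN and the CMIX lines treat the fugue state as a 36-word ring indexed modulo 36, so the ROR3/ROR9 "rotations" in Final512 below never move data: they only step the base index. A scalar model of that trick:

/* Rotating a 36-word ring right by r just advances the base index; ROR3
   appears in the code as base = (base + 33) % 36, since 33 = 36 - 3. */
static inline unsigned fugue_ror_base_sketch( unsigned base, unsigned r )
{
    return ( base + 36 - r ) % 36;
}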
-void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
+void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
+                  unsigned int uBlockCount )
 {
-   __m128i _t0, _t1, _t2, _t3;
+   v128_t _t0, _t1, _t2, _t3;
 
    switch(ctx->base)
    {
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
       pmsg += 4;
       uBlockCount--;
    }
 
 }
 
-void Final512(hashState_fugue *ctx, BitSequence *hashval)
+void Final512( hashState_fugue *ctx, uint8_t *hashval )
 {
    unsigned int block[4] __attribute__ ((aligned (32)));
    unsigned int col[36] __attribute__ ((aligned (16)));
    unsigned int i, base;
-   __m128i r0, _t0, _t1, _t2, _t3;
+   v128_t r0, _t0, _t1, _t2, _t3;
 
-   for(i = 0; i < 12; i++)
+   for( i = 0; i < 12; i++ )
    {
-      _mm_store_si128((__m128i*)block, ctx->state[i]);
+      v128_store( (v128_t*)block, ctx->state[i] );
 
      col[3 * i + 0] = block[0];
     col[3 * i + 1] = block[1];
    col[3 * i + 2] = block[2];
    }
 
-   base = (36 - (12 * ctx->base)) % 36;
+   base = ( 36 - (12 * ctx->base) ) % 36;
 
-   for(i = 0; i < 32; i++)
+   for( i = 0; i < 32; i++ )
   {
     // ROR3
    base = (base + 33) % 36;
 
    // CMIX
-   col[(base + 0) % 36] ^= col[(base + 4) % 36];
+   col[ (base + 0) % 36 ] ^= col[ (base + 4) % 36 ];
-   col[(base + 1) % 36] ^= col[(base + 5) % 36];
+   col[ (base + 1) % 36 ] ^= col[ (base + 5) % 36 ];
-   col[(base + 2) % 36] ^= col[(base + 6) % 36];
+   col[ (base + 2) % 36 ] ^= col[ (base + 6) % 36 ];
-   col[(base + 18) % 36] ^= col[(base + 4) % 36];
+   col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
-   col[(base + 19) % 36] ^= col[(base + 5) % 36];
+   col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
-   col[(base + 20) % 36] ^= col[(base + 6) % 36];
+   col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];
 
    // SMIX
-   LOADCOLUMN(r0, 36, 0);
+   LOADCOLUMN( r0, 36, 0 );
-   SUBSTITUTE(r0, _t2);
+   SUBSTITUTE( r0, _t2 );
-   SUPERMIX(_t2, _t3, _t0, _t1, r0);
+   SUPERMIX( _t2, _t3, _t0, _t1, r0 );
-   STORECOLUMN(r0, 36);
+   STORECOLUMN( r0, 36 );
   }
 
-   for(i = 0; i < 13; i++)
+   for( i = 0; i < 13; i++ )
  {
    // S4 += S0; S9 += S0; S18 += S0; S27 += S0;
-   col[(base + 4) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 9) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 18) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 27) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
 
    // ROR9
    base = (base + 27) % 36;
 
    // SMIX
-   LOADCOLUMN(r0, 36, 0);
+   LOADCOLUMN( r0, 36, 0 );
-   SUBSTITUTE(r0, _t2);
+   SUBSTITUTE( r0, _t2 );
-   SUPERMIX(_t2, _t3, _t0, _t1, r0);
+   SUPERMIX( _t2, _t3, _t0, _t1, r0 );
-   STORECOLUMN(r0, 36);
+   STORECOLUMN( r0, 36 );
 
    // S4 += S0; S10 += S0; S18 += S0; S27 += S0;
-   col[(base + 4) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 10) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 18) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 27) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
 
    // ROR9
    base = (base + 27) % 36;
 
    // SMIX
-   LOADCOLUMN(r0, 36, 0);
+   LOADCOLUMN( r0, 36, 0 );
-   SUBSTITUTE(r0, _t2);
+   SUBSTITUTE( r0, _t2 );
-   SUPERMIX(_t2, _t3, _t0, _t1, r0);
+   SUPERMIX( _t2, _t3, _t0, _t1, r0 );
-   STORECOLUMN(r0, 36);
+   STORECOLUMN( r0, 36 );
 
    // S4 += S0; S10 += S0; S19 += S0; S27 += S0;
-   col[(base + 4) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 10) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 19) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 27) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
|
|
||||||
// ROR9
|
// ROR9
|
||||||
base = (base + 27) % 36;
|
base = (base + 27) % 36;
|
||||||
|
|
||||||
// SMIX
|
// SMIX
|
||||||
LOADCOLUMN(r0, 36, 0);
|
LOADCOLUMN( r0, 36, 0 );
|
||||||
SUBSTITUTE(r0, _t2);
|
SUBSTITUTE( r0, _t2 );
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
|
||||||
STORECOLUMN(r0, 36);
|
STORECOLUMN( r0, 36 );
|
||||||
|
|
||||||
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
|
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
|
||||||
col[(base + 4) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 10) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 19) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 28) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
|
|
||||||
// ROR8
|
// ROR8
|
||||||
base = (base + 28) % 36;
|
base = (base + 28) % 36;
|
||||||
|
|
||||||
// SMIX
|
// SMIX
|
||||||
LOADCOLUMN(r0, 36, 0);
|
LOADCOLUMN( r0, 36, 0 );
|
||||||
SUBSTITUTE(r0, _t2);
|
SUBSTITUTE( r0, _t2 );
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
|
||||||
STORECOLUMN(r0, 36);
|
STORECOLUMN( r0, 36 );
|
||||||
}
|
}
|
||||||
|
|
||||||
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
|
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
|
||||||
col[(base + 4) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 9) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 18) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 27) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
|
|
||||||
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
|
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
|
||||||
LOADCOLUMN(r0, 36, 1);
|
LOADCOLUMN( r0, 36, 1 );
|
||||||
_mm_store_si128((__m128i*)hashval, r0);
|
v128_store( (v128_t*)hashval, r0 );
|
||||||
|
|
||||||
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
|
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
|
||||||
LOADCOLUMN(r0, 36, 9);
|
LOADCOLUMN( r0, 36, 9 );
|
||||||
_mm_store_si128((__m128i*)hashval + 1, r0);
|
v128_store( (v128_t*)hashval + 1, r0 );
|
||||||
|
|
||||||
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
|
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
|
||||||
LOADCOLUMN(r0, 36, 18);
|
LOADCOLUMN( r0, 36, 18 );
|
||||||
_mm_store_si128((__m128i*)hashval + 2, r0);
|
v128_store( (v128_t*)hashval + 2, r0 );
|
||||||
|
|
||||||
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
|
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
|
||||||
LOADCOLUMN(r0, 36, 27);
|
LOADCOLUMN( r0, 36, 27 );
|
||||||
_mm_store_si128((__m128i*)hashval + 3, r0);
|
v128_store( (v128_t*)hashval + 3, r0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
|
int fugue512_Init( hashState_fugue *ctx, int nHashSize )
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
ctx->processed_bits = 0;
|
ctx->processed_bits = 0;
|
||||||
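
The next hunk rewrites fugue512_Init: state words are cleared with v128_zero and the IVs are read through the casti_v128 indexer instead of explicit _mm_load_si128 casts. casti_v128 presumably just views a buffer as an array of 128-bit vectors; a sketch of that accessor (illustrative, not the actual simd-utils definition):

    /* element i of p reinterpreted as an array of v128_t; usable as an
       lvalue, so 'ctx->state[6] = casti_v128( _IV512, 0 );' below is the
       old '_mm_load_si128( (__m128i*)_IV512 + 0 )' with the cast folded
       into the macro */
    #define casti_v128( p, i )  ( ((v128_t*)(p))[i] )
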
@@ -485,20 +424,20 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
    ctx->uBlockLength = 4;
 
    for(i = 0; i < 6; i++)
-      ctx->state[i] = m128_zero;
+      ctx->state[i] = v128_zero;
 
-   ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
-   ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
-   ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
-   ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
-   ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
-   ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
+   ctx->state[6] = casti_v128( _IV512, 0 );
+   ctx->state[7] = casti_v128( _IV512, 1 );
+   ctx->state[8] = casti_v128( _IV512, 2 );
+   ctx->state[9] = casti_v128( _IV512, 3 );
+   ctx->state[10] = casti_v128( _IV512, 4 );
+   ctx->state[11] = casti_v128( _IV512, 5 );
 
-   return SUCCESS;
+   return 0;
 }
 
-HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
+int fugue512_Update( hashState_fugue *state, const void *data,
+                     uint64_t databitlen )
 {
    unsigned int uByteLength, uBlockCount, uRemainingBytes;
 
@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
    if(state->uBufferBytes != 0)
    {
       // Fill the buffer
-      memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
+      memcpy( state->buffer + state->uBufferBytes, (void*)data,
+              state->uBlockLength - state->uBufferBytes );
 
       // Process the buffer
       Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
       state->uBufferBytes += uByteLength;
    }
 
-   return SUCCESS;
+   return 0;
 }
 
-HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
+int fugue512_Final( hashState_fugue *state, void *hashval )
 {
    unsigned int i;
-   BitSequence lengthbuf[8] __attribute__((aligned(64)));
+   uint8_t lengthbuf[8] __attribute__((aligned(64)));
 
    // Update message bit count
    state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
    // Finalization
    Final512(state, hashval);
 
-   return SUCCESS;
+   return 0;
 }
 
 
-HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
+int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
+                   uint64_t databitlen )
 {
-   fugue512_Init(hs, 512);
-   fugue512_Update(hs, data, databitlen*8);
-   fugue512_Final(hs, hashval);
-   return SUCCESS;
+   fugue512_Init( hs, 512 );
+   fugue512_Update( hs, data, databitlen*8 );
+   fugue512_Final( hs, hashval );
+   return 0;
 }
 
 #endif // AES
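
fugue512_full wraps the Init/Update/Final sequence; note that despite the parameter name it takes a byte count and scales to bits itself (the databitlen*8 in the Update call above). A usage sketch under that assumption, hashing an 80-byte block header:

    #include <stdint.h>

    void fugue512_example( const uint8_t header[80], uint8_t digest[64] )
    {
       hashState_fugue ctx;
       fugue512_full( &ctx, digest, header, 80 );   /* length in bytes */
    }
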
@@ -14,37 +14,31 @@
 #ifndef FUGUE_HASH_API_H
 #define FUGUE_HASH_API_H
 
-#if defined(__AES__)
+#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
 
-#if !defined(__SSE4_1__)
-#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
-#endif
 
-#include "compat/sha3_common.h"
 #include "simd-utils.h"
 
 
 typedef struct
 {
-   __m128i state[12];
+   v128_t state[12];
    unsigned int base;
 
    unsigned int uHashSize;
    unsigned int uBlockLength;
    unsigned int uBufferBytes;
-   DataLength processed_bits;
-   BitSequence buffer[4];
+   uint64_t processed_bits;
+   uint8_t buffer[4];
 
 } hashState_fugue __attribute__ ((aligned (64)));
 
 
 // These functions are deprecated, use the lower case macro aliases that use
 // the standard interface. This will be cleaned up at a later date.
-HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
+int fugue512_Init( hashState_fugue *state, int hashbitlen );
 
-HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
+int fugue512_Update( hashState_fugue *state, const void *data,
+                     uint64_t databitlen );
 
-HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
+int fugue512_Final( hashState_fugue *state, void *hashval );
 
 #define fugue512_init( state ) \
    fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
    fugue512_Final
 
 
-HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
+int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
+                   uint64_t databitlen);
 
 #endif // AES
 #endif // HASH_API_H
@@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c)
 
 static void AddXor512(const void *a,const void *b,void *c)
 {
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
    casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
                                            casti_m512i( b, 0 ) );
 #elif defined(__AVX2__)
@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
                                            casti_m256i( b, 0 ) );
    casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
                                            casti_m256i( b, 1 ) );
-#elif defined(__SSE2__)
+#elif defined(__SSE2__) || defined(__ARM_NEON)
-   casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
-                                        casti_m128i( b, 0 ) );
-   casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
-                                        casti_m128i( b, 1 ) );
-   casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
-                                        casti_m128i( b, 2 ) );
-   casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
-                                        casti_m128i( b, 3 ) );
+   casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
+                                  casti_v128( b, 0 ) );
+   casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
+                                  casti_v128( b, 1 ) );
+   casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
+                                  casti_v128( b, 2 ) );
+   casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
+                                  casti_v128( b, 3 ) );
 #else
    const unsigned long long *A=a, *B=b;
    unsigned long long *C=c;
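
The AddXor512 hunk is the first of many below in which the four-macro AVX-512 feature test collapses into SIMD512, and plain __AVX512VL__ tests into VL256 (which the removed TODO comments suggest is meant to later cover AVX10/256 as well). The presumed consolidation, as a sketch (illustrative; the real definitions live in simd-utils):

    /* SIMD512: the full 512-bit AVX-512 feature set is available */
    #if defined(__AVX512F__) && defined(__AVX512VL__) && \
        defined(__AVX512DQ__) && defined(__AVX512BW__)
       #define SIMD512 1
    #endif

    /* VL256: AVX-512 VL-style instructions on 256/128-bit registers */
    #if defined(__AVX512VL__)
       #define VL256 1
    #endif
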
@@ -60,21 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
 
 #if defined(__ARM_NEON)
 
-// No fast shuffle on NEON
-//static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
-static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
+static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
+   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
 
-#define gr_shuffle32( v )  v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
+#define gr_shuffle32(v)    vqtbl1q_u8( v, gr_mask )
 
-//#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
 
 #else
 
-#define gr_shuffle32( v )  _mm_shuffle_epi32( v, 0xd8 )
+#define gr_shuffle32(v)    _mm_shuffle_epi32( v, 0xd8 )
 
 #endif
 
 
 #define tos(a) #a
 #define tostr(a) tos(a)
 
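
The NEON gr_shuffle32 above now uses a single table lookup instead of the earlier blend-of-reversed-quads workaround. The byte mask { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c } gathers dwords 0, 2, 1, 3, which is exactly what _mm_shuffle_epi32( v, 0xd8 ) produces on x86. A sketch spelling out the equivalence (function form only for illustration):

    #include <arm_neon.h>

    static inline uint32x4_t gr_shuffle32_sketch( uint32x4_t v )
    {
       /* byte indices of dwords 0, 2, 1, 3: the 0xd8 dword order */
       static const uint8_t idx[16] =
          { 0,1,2,3,  8,9,10,11,  4,5,6,7,  12,13,14,15 };
       return vreinterpretq_u32_u8(
                vqtbl1q_u8( vreinterpretq_u8_u32( v ), vld1q_u8( idx ) ) );
    }
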
@@ -107,7 +103,7 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
    This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
    K. Matusiewicz, 2011/05/29 */
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
    /* t_i = a_i + a_{i+1} */\
@@ -301,17 +297,16 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
 */
 #define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
    /* SubBytes */\
-   b0 = v128_xor(b0, b0);\
-   a0 = v128_aesenclast(a0, b0);\
-   a1 = v128_aesenclast(a1, b0);\
-   a2 = v128_aesenclast(a2, b0);\
-   a3 = v128_aesenclast(a3, b0);\
-   a4 = v128_aesenclast(a4, b0);\
-   a5 = v128_aesenclast(a5, b0);\
-   a6 = v128_aesenclast(a6, b0);\
-   a7 = v128_aesenclast(a7, b0);\
+   a0 = v128_aesenclast_nokey( a0 ); \
+   a1 = v128_aesenclast_nokey( a1 ); \
+   a2 = v128_aesenclast_nokey( a2 ); \
+   a3 = v128_aesenclast_nokey( a3 ); \
+   a4 = v128_aesenclast_nokey( a4 ); \
+   a5 = v128_aesenclast_nokey( a5 ); \
+   a6 = v128_aesenclast_nokey( a6 ); \
+   a7 = v128_aesenclast_nokey( a7 ); \
    /* MixBytes */\
-   MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
+   MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
 }
 
 #define ROUNDS_P(){\
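
In the SUBMIX hunk above, zeroing b0 just to feed aesenclast a null round key is replaced by a dedicated no-key helper, which also frees b0 as a temporary. Plausible definitions (a sketch; the NEON mapping relies on AESE with a zero key reducing to SubBytes plus ShiftRows, matching AESENCLAST with a zero key on x86):

    /* x86: AESENCLAST with an all-zero round key = ShiftRows + SubBytes */
    #define v128_aesenclast_nokey( v ) \
       _mm_aesenclast_si128( v, _mm_setzero_si128() )

    /* NEON sketch of the same operation:
       vaeseq_u8( x, 0 ) performs AddRoundKey(0) + SubBytes + ShiftRows */
    // #define v128_aesenclast_nokey( v ) \
    //    vreinterpretq_u32_u8( vaeseq_u8( vreinterpretq_u8_u32( v ), \
    //                                     vdupq_n_u8( 0 ) ) )
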
@@ -329,10 +324,9 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
    xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
    xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
    xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
    /* SubBytes + MixBytes */\
    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
            xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
-   \
    /* AddRoundConstant P1024 */\
    xmm0 = v128_xor( xmm0, \
                     casti_v128( round_const_p, round_counter+1 ) ); \
@@ -434,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
    t1 = v128_unpackhi16(t1, i3);\
    i2 = v128_unpacklo16(i2, i3);\
    i0 = v128_unpacklo16(i0, i1);\
-   \
    /* shuffle with immediate */\
    t0 = gr_shuffle32( t0 ); \
    t1 = gr_shuffle32( t1 ); \
@@ -444,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
    i2 = gr_shuffle32( i2 ); \
    i4 = gr_shuffle32( i4 ); \
    i6 = gr_shuffle32( i6 ); \
-   \
    /* continue with unpack */\
    t4 = i0;\
    i0 = v128_unpacklo32(i0, i2);\
@@ -551,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
    /* transpose done */\
 }/**/
 
+#if 0
+// not used
 void INIT( v128_t* chaining )
 {
    static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -580,6 +573,7 @@ void INIT( v128_t* chaining )
    chaining[6] = xmm14;
    chaining[7] = xmm15;
 }
+#endif
 
 void TF1024( v128_t* chaining, const v128_t* message )
 {
@@ -1,3 +1,6 @@
+#if !defined GROESTL256_INTR_AES_H__
+#define GROESTL256_INTR_AES_H__
+
 /* groestl-intr-aes.h   Aug 2011
  *
  * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
 
 #if defined(__ARM_NEON)
 
-// No fast shuffle on NEON
-static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
+static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
+   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
 
-#define gr_shuffle32( v )  v128_shufflev32( v, vmask_d8 )
+#define gr_shuffle32(v)    vqtbl1q_u8( v, gr_mask )
 
 #else
 
-#define gr_shuffle32( v )  _mm_shuffle_epi32( v, 0xd8 )
+#define gr_shuffle32(v)    _mm_shuffle_epi32( v, 0xd8 )
 
 #endif
 
 
 #define tos(a) #a
 #define tostr(a) tos(a)
 
@@ -93,7 +95,7 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
    This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
    K. Matusiewicz, 2011/05/29 */
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
    /* t_i = a_i + a_{i+1} */\
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
    chaining[3] = xmm11;
 }
 
+#endif
@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
    const int hash_offset = SIZE512 - hashlen_m128i;
    uint64_t blocks = len / SIZE512;
    v128_t* in = (v128_t*)input;
 
    // digest any full blocks, process directly from input
    for ( i = 0; i < blocks; i++ )
       TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
 
    // digest final padding block and do output transform
    TF1024( ctx->chaining, ctx->buffer );
 
    OF1024( ctx->chaining );
 
    // store hash result in output
@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
 int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
 int groestl512( hashState_groestl*, void*, const void*, uint64_t );
 #define groestl512_full groestl512
+#define groestl512_ctx  groestl512
 
 
 #endif /* __hash_h */
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
 #define GROESTL_4WAY_VAES 1
 #endif
 
@@ -17,7 +17,7 @@
 
 #if defined(__AVX2__) && defined(__VAES__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 
 int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
@@ -43,7 +43,7 @@
 
 #define SIZE256 (SIZE_512/16)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct {
    __attribute__ ((aligned (128))) __m512i chaining[SIZE256];
@@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
    { 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
 };
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
                                      0x1d1519111c141810, 0x1f171b131e161a12,
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 
 #define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
    /* AddRoundConstant */\
-   b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
+   b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
    a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
    a1 = _mm256_xor_si256( a1, b1 );\
    a2 = _mm256_xor_si256( a2, b1 );\
@@ -17,7 +17,7 @@
 
 #if defined(__AVX2__) && defined(__VAES__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 
 int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
 {
@@ -33,7 +33,7 @@
 
 #define SIZE512 (SIZE_1024/16)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct {
    __attribute__ ((aligned (128))) __m512i chaining[SIZE512];
@@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
    { 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
 };
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
                                      0x1d1519111c141810, 0x1f171b131e161a12,
@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
 { \
    /* AddRoundConstant P1024 */\
    xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
-                     casti_m128i( round_const_p, round_counter ) ) ); \
+                     casti_v128u32( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
    xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
 \
    /* AddRoundConstant P1024 */\
    xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
-                     casti_m128i( round_const_p, round_counter+1 ) ) ); \
+                     casti_v128u32( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
    xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
    xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
-                     casti_m128i( round_const_q, round_counter ) ) ); \
+                     casti_v128u32( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
    xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
    xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
    xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
-                     casti_m128i( round_const_q, round_counter+1 ) ) ); \
+                     casti_v128u32( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 { \
    /* AddRoundConstant P1024 */\
    xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
-                     casti_m128i( round_const_p, round_counter ) ) ); \
+                     casti_v128u32( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
    xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 \
    /* AddRoundConstant P1024 */\
    xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
-                     casti_m128i( round_const_p, round_counter+1 ) ) ); \
+                     casti_v128u32( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
    xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
    xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
    xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
-                     casti_m128i( round_const_q, round_counter ) ) ); \
+                     casti_v128u32( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
    xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
    xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
    xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
-                     casti_m128i( round_const_q, round_counter+1 ) ) ); \
+                     casti_v128u32( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
    xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\
@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
 
    v128_bswap32_intrlv80_4x32( vdata, pdata );
    do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );
 
       myriad_4way_hash( hash, vdata );
       pdata[19] = n;
@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
    init_myrgr_ctx();
    gate->scanhash = (void*)&scanhash_myriad;
    gate->hash = (void*)&myriad_hash;
-   gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
+   gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
 #endif
    return true;
 };
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
 #define MYRGR_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
 #define MYRGR_4WAY 1
@@ -35,8 +35,6 @@
 
 #include "sph_groestl.h"
 
-#if !defined(__AES__)
 
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #ifdef __cplusplus
 }
 
-#endif // !AES
 #endif
@@ -42,7 +42,6 @@ extern "C"{
 #include <stddef.h>
 #include "compat/sph_types.h"
 
-#if !defined(__AES__)
 /**
  * Output size (in bits) for Groestl-224.
  */
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
 }
 #endif
 
-#endif // !AES
 #endif
@@ -382,7 +382,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
 #define S1F   MF
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // Hamsi 8 way AVX512
 
@@ -1122,7 +1122,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
 
 // Hamsi 4 way AVX2
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define INPUT_BIG \
 do { \
@@ -1501,7 +1501,7 @@ do { /* order is important */ \
    sc->h[14] = CE; \
    sc->h[15] = CF;
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define INPUT_8X32 \
 { \
@@ -38,7 +38,7 @@
 #include <stddef.h>
 #include "simd-utils.h"
 
-// SSE2 or NEON Hamsi-512 2x64
+#if defined(__SSE4_2__) || defined(__ARM_NEON)
 
 typedef struct
 {
@@ -57,6 +57,8 @@ void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
                         size_t len );
 void hamsi512_2x64( void *dst, const void *data, size_t len );
 
+#endif
 
 #if defined (__AVX2__)
 
 // Hamsi-512 4x64
@@ -102,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // Hamsi-512 8x64
 
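
With the new guard, the Hamsi-512 2x64 code builds only on SSE4.2 or NEON. Its API follows the repo's N-way convention: two lanes are 64-bit interleaved and hashed together. A usage sketch based on the declarations above (assuming the length is in bytes and the input is already 2-way interleaved; both assumptions, not confirmed by this diff):

    /* hash two interleaved 80-byte messages in one call */
    void hamsi_2x64_example( const void *vdata_2x64, void *vhash_2x64 )
    {
       hamsi512_2x64( vhash_2x64, vdata_2x64, 80 );
    }
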
@@ -53,7 +53,7 @@ extern "C"{
 #define SPH_SMALL_FOOTPRINT_HAVAL   1
 //#endif
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 // ( ~( a ^ b ) ) & c
 #define v128_andnotxor( a, b, c ) \
@@ -583,7 +583,7 @@ do { \
 
 // Haval-256 8 way 32 bit avx2
 
-#if defined (__AVX512VL__)
+#if defined (VL256)
 
 // ( ~( a ^ b ) ) & c
 #define mm256_andnotxor( a, b, c ) \
@@ -882,7 +882,7 @@ do { \
 
 #endif // AVX2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // ( ~( a ^ b ) ) & c
 #define mm512_andnotxor( a, b, c ) \
@@ -107,7 +107,7 @@ void haval256_5_8way_close( void *cc, void *dst );
 
 #endif // AVX2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct {
    __m512i buf[32];
@@ -204,7 +204,7 @@ static const uint64_t IV512[] =
    (state)->H[15] = h7l; \
 } while (0)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define Sb_8W(x0, x1, x2, x3, c) \
 { \
@@ -364,8 +364,7 @@ static const uint64_t IV512[] =
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512VL__)
-//TODO enable for AVX10_256, not used with AVX512VL
+#if defined(VL256)
 
 #define notxorandnot( a, b, c ) \
    _mm256_ternarylogic_epi64( a, b, c, 0x2d )
@@ -522,7 +521,7 @@ static const uint64_t IV512[] =
 
 #endif // AVX2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 void jh256_8x64_init( jh_8x64_context *sc )
 {
@@ -55,7 +55,7 @@
  * <code>memcpy()</code>).
  */
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct
 {
@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    __m256i *noncev = (__m256i*)vdata + 9;   // aligned
    const uint32_t Htarg = ptarget[7];
    const int thr_id = mythr->id;
    const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
    return 0;
 }
 
+#elif defined(KECCAK_2WAY)
+
+void keccakhash_2x64(void *state, const void *input)
+{
+    keccak256_2x64_context ctx;
+    keccak256_2x64_init( &ctx );
+    keccak256_2x64_update( &ctx, input, 80 );
+    keccak256_2x64_close( &ctx, state );
+}
+
+int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+    uint32_t vdata[24*2] __attribute__ ((aligned (64)));
+    uint32_t hash[16*2] __attribute__ ((aligned (32)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+    uint32_t *hash7 = &(hash[13]);   // 3*4+1
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint32_t n = pdata[19];
+    const uint32_t first_nonce = pdata[19];
+    v128_t *noncev = (v128_t*)vdata + 9;
+    const uint32_t Htarg = ptarget[7];
+    const int thr_id = mythr->id;
+    const bool bench = opt_benchmark;
+
+    v128_bswap32_intrlv80_2x64( vdata, pdata );
+    *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
+    do {
+       keccakhash_2x64( hash, vdata );
+
+       for ( int lane = 0; lane < 2; lane++ )
+       if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
+       {
+          extr_lane_2x64( lane_hash, hash, lane, 256 );
+          if ( valid_hash( lane_hash, ptarget ))
+          {
+             pdata[19] = bswap_32( n + lane );
+             submit_solution( work, lane_hash, mythr );
+          }
+       }
+       *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
+       n += 2;
+    } while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
+    pdata[19] = n;
+    *hashes_done = n - first_nonce + 1;
+    return 0;
+}
+
 #endif
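
A note on the nonce handling in scanhash_keccak_2x64 above: vdata holds two 80-byte headers interleaved 64 bits at a time, so vector 9 covers data words 18-19 of each lane and the nonce (word 19) is the high dword of each 64-bit half.

    /* after v128_set32( n+1, 0, n, 0 ) and the interleave blend:
         lane 0 half = { word18, n   }
         lane 1 half = { word18, n+1 }
       v128_64( 0x0000000200000000 ) adds 2 to the high dword of both
       halves, stepping both lanes' nonces each iteration (n += 2). */
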
@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
|
|||||||
#elif defined (KECCAK_4WAY)
|
#elif defined (KECCAK_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_keccak_4way;
|
gate->scanhash = (void*)&scanhash_keccak_4way;
|
||||||
gate->hash = (void*)&keccakhash_4way;
|
gate->hash = (void*)&keccakhash_4way;
|
||||||
|
#elif defined (KECCAK_2WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_keccak_2x64;
|
||||||
|
gate->hash = (void*)&keccakhash_2x64;
|
||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_keccak;
|
gate->scanhash = (void*)&scanhash_keccak;
|
||||||
gate->hash = (void*)&keccakhash;
|
gate->hash = (void*)&keccakhash;
|
||||||
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
|
|||||||
#elif defined (KECCAK_4WAY)
|
#elif defined (KECCAK_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_keccak_4way;
|
gate->scanhash = (void*)&scanhash_keccak_4way;
|
||||||
gate->hash = (void*)&keccakhash_4way;
|
gate->hash = (void*)&keccakhash_4way;
|
||||||
|
#elif defined (KECCAK_2WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_keccak_2x64;
|
||||||
|
gate->hash = (void*)&keccakhash_2x64;
|
||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_keccak;
|
gate->scanhash = (void*)&scanhash_keccak;
|
||||||
gate->hash = (void*)&keccakhash;
|
gate->hash = (void*)&keccakhash;
|
||||||
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
|
|||||||
bool register_sha3d_algo( algo_gate_t* gate )
|
bool register_sha3d_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
hard_coded_eb = 6;
|
hard_coded_eb = 6;
|
||||||
// opt_extranonce = false;
|
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
|
||||||
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
|
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
|
||||||
#if defined (KECCAK_8WAY)
|
#if defined (SHA3D_8WAY)
|
||||||
gate->scanhash = (void*)&scanhash_sha3d_8way;
|
gate->scanhash = (void*)&scanhash_sha3d_8way;
|
||||||
gate->hash = (void*)&sha3d_hash_8way;
|
gate->hash = (void*)&sha3d_hash_8way;
|
||||||
#elif defined (KECCAK_4WAY)
|
#elif defined (SHA3D_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_sha3d_4way;
|
gate->scanhash = (void*)&scanhash_sha3d_4way;
|
||||||
gate->hash = (void*)&sha3d_hash_4way;
|
gate->hash = (void*)&sha3d_hash_4way;
|
||||||
|
#elif defined (SHA3D_2WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_sha3d_2x64;
|
||||||
|
gate->hash = (void*)&sha3d_hash_2x64;
|
||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_sha3d;
|
gate->scanhash = (void*)&scanhash_sha3d;
|
||||||
gate->hash = (void*)&sha3d_hash;
|
gate->hash = (void*)&sha3d_hash;
|
||||||
|
@@ -4,10 +4,20 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define KECCAK_8WAY 1
|
#define KECCAK_8WAY 1
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define KECCAK_4WAY 1
|
#define KECCAK_4WAY 1
|
||||||
|
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
|
#define KECCAK_2WAY 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(SIMD512)
|
||||||
|
#define SHA3D_8WAY 1
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
#define SHA3D_4WAY 1
|
||||||
|
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
|
#define SHA3D_2WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
extern int hard_coded_eb;
|
extern int hard_coded_eb;
|
||||||
@@ -16,27 +26,47 @@ extern int hard_coded_eb;
|
|||||||
|
|
||||||
void keccakhash_8way( void *state, const void *input );
|
void keccakhash_8way( void *state, const void *input );
|
||||||
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
|
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void sha3d_hash_8way( void *state, const void *input );
|
|
||||||
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
|
||||||
|
|
||||||
#elif defined(KECCAK_4WAY)
|
#elif defined(KECCAK_4WAY)
|
||||||
|
|
||||||
void keccakhash_4way( void *state, const void *input );
|
void keccakhash_4way( void *state, const void *input );
|
||||||
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void sha3d_hash_4way( void *state, const void *input );
|
#elif defined(KECCAK_2WAY)
|
||||||
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
|
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
void keccakhash_2x64( void *state, const void *input );
|
||||||
|
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void keccakhash( void *state, const void *input );
|
void keccakhash( void *state, const void *input );
|
||||||
int scanhash_keccak( struct work *work, uint32_t max_nonce,
|
int scanhash_keccak( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(SHA3D_8WAY)
|
||||||
|
|
||||||
|
void sha3d_hash_8way( void *state, const void *input );
|
||||||
|
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
#elif defined(SHA3D_4WAY)
|
||||||
|
|
||||||
|
void sha3d_hash_4way( void *state, const void *input );
|
||||||
|
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
#elif defined(SHA3D_2WAY)
|
||||||
|
|
||||||
|
void sha3d_hash_2x64( void *state, const void *input );
|
||||||
|
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
void sha3d_hash( void *state, const void *input );
|
void sha3d_hash( void *state, const void *input );
|
||||||
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
|
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
|
||||||
|
@@ -57,7 +57,7 @@ static const uint64_t RC[] = {
|
|||||||
|
|
||||||
#define DO(x) x
|
#define DO(x) x
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
#define INPUT_BUF(size) do { \
|
#define INPUT_BUF(size) do { \
|
||||||
size_t j; \
|
size_t j; \
|
||||||
|
@@ -4,7 +4,7 @@
|
|||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
|
@@ -4,7 +4,7 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "keccak-hash-4way.h"
|
#include "keccak-hash-4way.h"
|
||||||
|
|
||||||
#if defined(KECCAK_8WAY)
|
#if defined(SHA3D_8WAY)
|
||||||
|
|
||||||
void sha3d_hash_8way(void *state, const void *input)
|
void sha3d_hash_8way(void *state, const void *input)
|
||||||
{
|
{
|
||||||
@@ -64,7 +64,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined(KECCAK_4WAY)
|
#elif defined(SHA3D_4WAY)
|
||||||
|
|
||||||
void sha3d_hash_4way(void *state, const void *input)
|
void sha3d_hash_4way(void *state, const void *input)
|
||||||
{
|
{
|
||||||
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#elif defined(SHA3D_2WAY)
|
||||||
|
|
||||||
|
void sha3d_hash_2x64(void *state, const void *input)
|
||||||
|
{
|
||||||
|
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
|
||||||
|
keccak256_2x64_context ctx;
|
||||||
|
|
||||||
|
keccak256_2x64_init( &ctx );
|
||||||
|
keccak256_2x64_update( &ctx, input, 80 );
|
||||||
|
keccak256_2x64_close( &ctx, buffer );
|
||||||
|
|
||||||
|
keccak256_2x64_init( &ctx );
|
||||||
|
keccak256_2x64_update( &ctx, buffer, 32 );
|
||||||
|
keccak256_2x64_close( &ctx, state );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash[16*2] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t *hash7 = &(hash[13]); // 3*4+1
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
uint32_t n = pdata[19];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 2;
|
||||||
|
v128_t *noncev = (v128_t*)vdata + 9;
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
const int thr_id = mythr->id;
|
||||||
|
const bool bench = opt_benchmark;
|
||||||
|
|
||||||
|
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||||
|
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||||
|
do {
|
||||||
|
sha3d_hash_2x64( hash, vdata );
|
||||||
|
|
||||||
|
for ( int lane = 0; lane < 2; lane++ )
|
||||||
|
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
|
||||||
|
{
|
||||||
|
extr_lane_2x64( lane_hash, hash, lane, 256 );
|
||||||
|
if ( valid_hash( lane_hash, ptarget ) )
|
||||||
|
{
|
||||||
|
pdata[19] = bswap_32( n + lane );
|
||||||
|
submit_solution( work, lane_hash, mythr );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
|
||||||
|
n += 2;
|
||||||
|
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||||
|
pdata[19] = n;
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
 };
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )
 
@@ -524,8 +524,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
    a = _mm256_xor_si256( a, c0 ); \
    b = _mm256_xor_si256( b, c1 );
 
-//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define MULT2( a0, a1 ) \
 { \
@@ -51,7 +51,7 @@
 #define LIMIT_512 128
 /*********************************/
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct {
    uint32_t buffer[8*4];
@@ -28,8 +28,7 @@
    a = v128_xor( a, c0 ); \
    b = v128_xor( b, c1 ); \
 
-#if defined(__AVX512VL__)
-//TODO enable for AVX10_512 AVX10_256
+#if defined(VL256)
 
 #define MULT2( a0, a1 ) \
 { \
@@ -48,43 +47,36 @@
   a1 = _mm_alignr_epi8( b, a1, 4 ); \
 }
 
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) || defined(__SSE2__)
 
 // { a1_0, 0, a1_0, a1_0 }
 #define MULT2( a0, a1 ) \
 { \
-  v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
+  v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
   a0 = v128_alignr32( a1, b, 1 ); \
   a1 = v128_alignr32( b, a1, 1 ); \
 }
 
-#else   // assume SSE2
+#else
 
-#define MULT2( a0, a1 ) \
-{ \
-  v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
-  a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
-  a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
-}
+#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
 
 #endif
 
-#if defined(__AVX512VL__)
-//TODO enable for AVX10_512 AVX10_256
+#if defined(VL256)
 
 #define SUBCRUMB( a0, a1, a2, a3 ) \
 { \
    v128_t t = a0; \
-   a0 = mm128_xoror( a3, a0, a1 ); \
+   a0 = v128_xoror( a3, a0, a1 ); \
    a2 = v128_xor( a2, a3 ); \
    a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
-   a3 = mm128_xorand( a2, a3, t ); \
-   a2 = mm128_xorand( a1, a2, a0 ); \
+   a3 = v128_xorand( a2, a3, t ); \
+   a2 = v128_xorand( a1, a2, a0 ); \
    a1 = v128_or( a1, a3 ); \
    a3 = v128_xor( a3, a2 ); \
    t  = v128_xor( t, a1 ); \
    a2 = v128_and( a2, a1 ); \
-   a1 = mm128_xnor( a1, a0 ); \
+   a1 = v128_xnor( a1, a0 ); \
   a0 = t; \
 }
 
@@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
 
 int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
                 const void* data, size_t inlen );
-#endif // LUFFA_FOR_SSE2_H___
+#endif // LUFFA_FOR_SSE2_H__
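
The 0x87 immediate passed to _mm_ternarylogic_epi64 in SUBCRUMB above is simply the 8-bit truth table of the Boolean function it implements, a1 xnor (a3 & t), indexed by the three input bits. A small self-contained sketch (helper and main are illustrative, not part of the tree) that derives such immediates:

    #include <stdint.h>
    #include <stdio.h>

    int main()
    {
       // Bit i of the ternarylogic immediate is f(a,b,c) evaluated at
       // the bits of i = (a<<2)|(b<<1)|c. Here f = a XNOR (b AND c).
       uint8_t imm = 0;
       for ( int i = 0; i < 8; i++ )
       {
          int a = (i>>2)&1, b = (i>>1)&1, c = i&1;
          if ( !( a ^ ( b & c ) ) ) imm |= 1 << i;
       }
       printf( "0x%02x\n", imm );   // prints 0x87
       return 0;
    }
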
@@ -15,7 +15,7 @@
 #include "algo/groestl/sph_groestl.h"
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define ALLIUM_16WAY 1
 #elif defined(__AVX2__)
 #define ALLIUM_8WAY 1
@@ -465,12 +465,8 @@ typedef union
 {
    keccak256_2x64_context  keccak;
   cubehashParam           cube;
-//#if defined(__x86_64__)
    skein256_2x64_context   skein;
-//#else
-//   sph_skein512_context   skein;
-//#endif
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_groestl256    groestl;
 #else
    sph_groestl256_context  groestl;
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
    LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
    LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
 
-//#if defined(__x86_64__)
    intrlv_2x64( vhashA, hash0, hash1, 256 );
    skein256_2x64_init( &ctx.skein );
    skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
    skein256_2x64_update( &ctx.skein, vhashA, 32 );
    skein256_2x64_close( &ctx.skein, vhashA );
    dintrlv_2x64( hash2, hash3, vhashA, 256 );
-/*
-#else
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash0, 32 );
-   sph_skein256_close( &ctx.skein, hash0 );
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash1, 32 );
-   sph_skein256_close( &ctx.skein, hash1 );
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash2, 32 );
-   sph_skein256_close( &ctx.skein, hash2 );
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash3, 32 );
-   sph_skein256_close( &ctx.skein, hash3 );
-#endif
-*/
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    groestl256_full( &ctx.groestl, hash0, hash0, 256 );
    groestl256_full( &ctx.groestl, hash1, hash1, 256 );
    groestl256_full( &ctx.groestl, hash2, hash2, 256 );
@@ -5,7 +5,7 @@
 #include <stdint.h>
 #include "lyra2.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define LYRA2REV3_16WAY 1
 #elif defined(__AVX2__)
 #define LYRA2REV3_8WAY 1
@@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx();
 
 //////////////////////////////////
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define LYRA2REV2_16WAY 1
 #elif defined(__AVX2__)
 #define LYRA2REV2_8WAY 1
@@ -108,7 +108,7 @@ bool lyra2h_thread_init();
 
 /////////////////////////////////////////
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define PHI2_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define PHI2_4WAY 1
@@ -41,7 +41,7 @@
 // lyra2z330, lyra2h,
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 /**
  * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
@@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
 
 int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
                   uint64_t timeCost, uint64_t nRows, uint64_t nCols );
@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
    lyra2h_4way_midstate( vdata );
 
    do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
       lyra2h_4way_hash( hash, vdata );
 
       for ( int i = 0; i < 4; i++ )
@@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
 
    do
    {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
 
       lyra2rev2_4way_hash( hash, vdata );
 
@@ -3,7 +3,7 @@
 #include "lyra2.h"
 #include "algo/blake/blake256-hash.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define LYRA2Z_16WAY 1
 #elif defined(__AVX2__)
 #define LYRA2Z_8WAY 1
@@ -4,7 +4,7 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "lyra2.h"
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
 #include "algo/echo/echo-hash-4way.h"
 #elif defined(__AES__)
 #include "algo/echo/aes_ni/hash_api.h"
@@ -27,7 +27,7 @@
 #include "lyra2.h"
 #include "simd-utils.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
 {
@@ -43,7 +43,7 @@ static const uint64_t blake2b_IV[8] =
    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define G2W_4X64(a,b,c,d) \
    a = _mm512_add_epi64( a, b ); \
@@ -150,13 +150,13 @@ static const uint64_t blake2b_IV[8] =
 // returns void, all args updated
 #define G_2X64(a,b,c,d) \
    a = v128_add64( a, b ); \
-   d = v128_ror64( v128_xor( d, a), 32 ); \
+   d = v128_ror64xor( d, a, 32 ); \
   c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 24 ); \
+   b = v128_ror64xor( b, c, 24 ); \
   a = v128_add64( a, b ); \
-   d = v128_ror64( v128_xor( d, a ), 16 ); \
+   d = v128_ror64xor( d, a, 16 ); \
   c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 63 );
+   b = v128_ror64xor( b, c, 63 );
 
 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
 { \
@@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] =
 
 #endif // AVX2 else SSE2
 
-static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
-   return ( w >> c ) | ( w << ( 64 - c ) );
-}
-
 #define G( r, i, a, b, c, d ) \
 { \
    a = a + b; \
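
The v128_ror64xor( d, a, c ) helper introduced in G_2X64 folds the XOR and rotate of the Blake2b G function into one call; the change is purely a re-expression, since the helper is defined to equal the old two-step form (on targets with a fused xor-rotate instruction, such as the ARM SHA3 extension's XAR, it can compile to a single op). A scalar sketch of the contract it must satisfy (names here are illustrative, not the tree's):

    #include <stdint.h>

    static inline uint64_t ror64( uint64_t w, unsigned c )
    {  return ( w >> c ) | ( w << ( 64 - c ) );  }

    // Contract of the fused helper: ror64xor( d, a, c ) == ror64( d ^ a, c ),
    // so G_2X64 above is arithmetically unchanged, only re-expressed.
    static inline uint64_t ror64xor( uint64_t d, uint64_t a, unsigned c )
    {  return ror64( d ^ a, c );  }
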
@@ -222,7 +218,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 union _ovly_512
 {
@@ -21,7 +21,7 @@
 #define EPS1 DBL_EPSILON
 #define EPS2 3.0e-11
 
-inline double exp_n( double xt )
+static inline double exp_n( double xt )
 {
    if ( xt < -700.0 )
       return 0;
@@ -33,7 +33,7 @@ inline double exp_n( double xt )
       return exp( xt );
 }
 
-inline double exp_n2( double x1, double x2 )
+static inline double exp_n2( double x1, double x2 )
 {
    double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
          p5 = 37., p6 = 700.;
@@ -306,7 +306,7 @@ bool register_m7m_algo( algo_gate_t *gate )
     applog( LOG_ERR, "M7M algo is not supported on MacOS");
     return false;
 #else
-     gate->optimizations = SHA_OPT;
+     gate->optimizations = SHA256_OPT;
     init_m7m_ctx();
     gate->scanhash              = (void*)&scanhash_m7m_hash;
     gate->build_stratum_request = (void*)&std_be_build_stratum_request;
@@ -1,75 +0,0 @@
-// Copyright (c) 2014 The Magi developers
-// Distributed under the MIT/X11 software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-
-#include <iostream>
-#include <cfloat>
-#include <limits>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-
-#include "magimath.h"
-
-#define EPS1 (std::numeric_limits<double>::epsilon())
-#define EPS2 3.0e-11
-
-static void gauleg(double x1, double x2, double x[], double w[], const int n)
-{
-    int m,j,i;
-    double z1, z, xm, xl, pp, p3, p2, p1;
-    m=(n+1)/2;
-    xm=0.5*(x2+x1);
-    xl=0.5*(x2-x1);
-    for (i=1;i<=m;i++) {
-        z=cos(3.141592654*(i-0.25)/(n+0.5));
-        do {
-            p1=1.0;
-            p2=0.0;
-            for (j=1;j<=n;j++) {
-                p3=p2;
-                p2=p1;
-                p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
-            }
-            pp=n*(z*p1-p2)/(z*z-1.0);
-            z1=z;
-            z=z1-p1/pp;
-        } while (fabs(z-z1) > EPS2);
-        x[i]=xm-xl*z;
-        x[n+1-i]=xm+xl*z;
-        w[i]=2.0*xl/((1.0-z*z)*pp*pp);
-        w[n+1-i]=w[i];
-    }
-}
-
-static double GaussianQuad_N(double func(const double), const double a2, const double b2, const int NptGQ)
-{
-    double s=0.0;
-#ifdef _MSC_VER
-#define SW_DIVS 23
-    double x[SW_DIVS+1], w[SW_DIVS+1];
-#else
-    double x[NptGQ+1], w[NptGQ+1];
-#endif
-
-    gauleg(a2, b2, x, w, NptGQ);
-
-    for (int j=1; j<=NptGQ; j++) {
-        s += w[j]*func(x[j]);
-    }
-
-    return s;
-}
-
-static double swit_(double wvnmb)
-{
-    return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5)
-           / 1034.66 * pow(sin(wvnmb/65.), 2.);
-}
-
-uint32_t sw_(int nnounce, int divs)
-{
-    double wmax = ((sqrt((double)(nnounce))*(1.+EPS1))/450+100);
-    return ((uint32_t)(GaussianQuad_N(swit_, 0., wmax, divs)*(1.+EPS1)*1.e6));
-}
@@ -1,54 +0,0 @@
-// Copyright (c) 2014 The Magi developers
-// Distributed under the MIT/X11 software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-#ifndef MAGI_MATH_H
-#define MAGI_MATH_H
-
-#include <math.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-uint32_t sw_(int nnounce, int divs);
-
-#ifdef __cplusplus
-}
-#endif
-
-
-inline double exp_n(double xt)
-{
-    double p1 = -700.0, p3 = -0.8e-8, p4 = 0.8e-8, p6 = 700.0;
-    if(xt < p1)
-        return 0;
-    else if(xt > p6)
-        return 1e200;
-    else if(xt > p3 && xt < p4)
-        return (1.0 + xt);
-    else
-        return exp(xt);
-}
-
-// 1 / (1 + exp(x1-x2))
-inline double exp_n2(double x1, double x2)
-{
-    double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
-    double xt = x1 - x2;
-    if (xt < p1+1.e-200)
-        return 1.;
-    else if (xt > p1 && xt < p2 + 1.e-200)
-        return ( 1. - exp(xt) );
-    else if (xt > p2 && xt < p3 + 1.e-200)
-        return ( 1. / (1. + exp(xt)) );
-    else if (xt > p3 && xt < p4)
-        return ( 1. / (2. + xt) );
-    else if (xt > p4 - 1.e-200 && xt < p5)
-        return ( exp(-xt) / (1. + exp(-xt)) );
-    else if (xt > p5 - 1.e-200 && xt < p6)
-        return ( exp(-xt) );
-    else //if (xt > p6 - 1.e-200)
-        return 0.;
-}
-
-#endif
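
The deleted magimath helpers survive in m7m.c itself, now declared static inline so each translation unit gets its own copy. The +/-700 cutoffs they use are overflow guards: exp() on an IEEE-754 double overflows just past x = 709.78, so clamping at 700 keeps every branch finite. A quick check of that bound (a standalone sketch, not part of the tree):

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    int main()
    {
       // Why exp_n()/exp_n2() clamp at +/-700: exp() overflows a double
       // just past x = ln(DBL_MAX) ~ 709.78.
       printf( "exp(700) = %g\n", exp( 700.0 ) );   // ~1.0e304, still finite
       printf( "DBL_MAX  = %g\n", DBL_MAX );        // ~1.8e308
       printf( "exp(710) = %g\n", exp( 710.0 ) );   // inf (overflow)
       return 0;
    }
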
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define NIST5_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define NIST5_4WAY 1
@@ -71,8 +71,7 @@ do { \
 } while (0)
 
 #define GAMMA_4W(n0, n1, n2, n4)   \
-   (g ## n0 = v128_xor( a ## n0, \
-                        v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
+   (g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) )
 
 #define PI_ALL_4W do { \
    a0 = g0; \
@@ -312,7 +311,7 @@ do { \
    BUPDATE1_8W( 7, 1 ); \
 } while (0)
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define GAMMA_8W(n0, n1, n2, n4)   \
    ( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
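
The 0x4b immediate in GAMMA_8W is derived the same way as Luffa's 0x87 above: it is the truth table of g = a0 ^ (a1 | ~a2), with the operands passed in the order (a0, a2, a1), matching the v128_ornot() form of GAMMA_4W. A sketch of the check (standalone, illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    int main()
    {
       // GAMMA as a three-input function of (a,b,c) = (a0, a2, a1):
       // g = a ^ (c | ~b). Bit i of the immediate is f at the bits of
       // i = (a<<2)|(b<<1)|c.
       uint8_t imm = 0;
       for ( int i = 0; i < 8; i++ )
       {
          int a = (i>>2)&1, b = (i>>1)&1, c = i&1;
          if ( a ^ ( c | !b ) ) imm |= 1 << i;
       }
       printf( "0x%02x\n", imm );   // prints 0x4b
       return 0;
    }
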
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define ANIME_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define ANIME_4WAY 1
@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_hmq1725_4way;
   gate->hash = (void*)&hmq1725_4way_hash;
 #else
-  init_hmq1725_ctx();
   gate->scanhash = (void*)&scanhash_hmq1725;
   gate->hash = (void*)&hmq1725hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
+                      | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
 };
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define HMQ1725_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define HMQ1725_4WAY 1
@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
 void hmq1725hash( void *state, const void *input );
 int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
-void init_hmq1725_ctx();
 
 #endif
@@ -4,367 +4,273 @@
 
 #include <string.h>
 #include <stdint.h>
-#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
-#include "algo/groestl/sph_groestl.h"
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+  #include "algo/fugue/fugue-aesni.h"
+#else
+  #include "algo/fugue/sph_fugue.h"
+#endif
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+  #include "algo/groestl/aes_ni/hash-groestl.h"
+  #include "algo/echo/aes_ni/hash_api.h"
+#else
+  #include "algo/groestl/sph_groestl.h"
+  #include "algo/echo/sph_echo.h"
+#endif
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
-#include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/sph-haval.h"
 #include "algo/sha/sph_sha2.h"
-#if defined(__AES__)
-  #include "algo/groestl/aes_ni/hash-groestl.h"
-  #include "algo/echo/aes_ni/hash_api.h"
-  #include "algo/fugue/fugue-aesni.h"
-#else
-  #include "algo/groestl/sph_groestl.h"
-  #include "algo/echo/sph_echo.h"
-  #include "algo/fugue/sph_fugue.h"
-#endif
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
-  #include "algo/simd/sph_simd.h"
-#else
-  #include "algo/simd/nist.h"
-#endif
+#include "algo/simd/simd-hash-2way.h"
 
-typedef struct {
-   sph_blake512_context    blake1, blake2;
-   sph_bmw512_context      bmw1, bmw2, bmw3;
-   sph_skein512_context    skein1, skein2;
-   sph_jh512_context       jh1, jh2;
-   sph_keccak512_context   keccak1, keccak2;
-   hashState_luffa         luffa1, luffa2;
-   cubehashParam           cube;
-   sph_shavite512_context  shavite1, shavite2;
-#if defined(__aarch64__)
-   sph_simd512_context     simd1, simd2;
-#else
-   hashState_sd            simd1, simd2;
-#endif
-   sph_hamsi512_context    hamsi1;
-   sph_shabal512_context   shabal1;
-   sph_whirlpool_context   whirlpool1, whirlpool2, whirlpool3, whirlpool4;
-   sph_sha512_context      sha1, sha2;
-   sph_haval256_5_context  haval1, haval2;
-#if defined(__AES__)
-   hashState_echo          echo1, echo2;
-   hashState_groestl       groestl1, groestl2;
-   hashState_fugue         fugue1, fugue2;
-#else
-   sph_groestl512_context  groestl1, groestl2;
-   sph_echo512_context     echo1, echo2;
-   sph_fugue512_context    fugue1, fugue2;
-#endif
-} hmq1725_ctx_holder;
-
-static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
-static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
-
-void init_hmq1725_ctx()
+union _hmq1725_ctx_holder
 {
-   sph_blake512_init(&hmq1725_ctx.blake1);
-   sph_blake512_init(&hmq1725_ctx.blake2);
-
-   sph_bmw512_init(&hmq1725_ctx.bmw1);
-   sph_bmw512_init(&hmq1725_ctx.bmw2);
-   sph_bmw512_init(&hmq1725_ctx.bmw3);
-
-   sph_skein512_init(&hmq1725_ctx.skein1);
-   sph_skein512_init(&hmq1725_ctx.skein2);
-
-   sph_jh512_init(&hmq1725_ctx.jh1);
-   sph_jh512_init(&hmq1725_ctx.jh2);
-
-   sph_keccak512_init(&hmq1725_ctx.keccak1);
-   sph_keccak512_init(&hmq1725_ctx.keccak2);
-
-   init_luffa( &hmq1725_ctx.luffa1, 512 );
-   init_luffa( &hmq1725_ctx.luffa2, 512 );
-
-   cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
-
-   sph_shavite512_init(&hmq1725_ctx.shavite1);
-   sph_shavite512_init(&hmq1725_ctx.shavite2);
-
-#if defined(__aarch64__)
-   sph_simd512_init(&hmq1725_ctx.simd1);
-   sph_simd512_init(&hmq1725_ctx.simd2);
-#else
-   init_sd( &hmq1725_ctx.simd1, 512 );
-   init_sd( &hmq1725_ctx.simd2, 512 );
-#endif
-
-   sph_hamsi512_init(&hmq1725_ctx.hamsi1);
-
-#if defined(__AES__)
-   fugue512_Init( &hmq1725_ctx.fugue1, 512 );
-   fugue512_Init( &hmq1725_ctx.fugue2, 512 );
+   blake512_context        blake;
+   sph_bmw512_context      bmw;
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   hashState_fugue         fugue;
 #else
-   sph_fugue512_init(&hmq1725_ctx.fugue1);
-   sph_fugue512_init(&hmq1725_ctx.fugue2);
+   sph_fugue512_context    fugue;
 #endif
-
-   sph_shabal512_init(&hmq1725_ctx.shabal1);
-
-   sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
-   sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
-   sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
-   sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
-
-   sph_sha512_init( &hmq1725_ctx.sha1 );
-   sph_sha512_init( &hmq1725_ctx.sha2 );
-
-   sph_haval256_5_init(&hmq1725_ctx.haval1);
-   sph_haval256_5_init(&hmq1725_ctx.haval2);
-
-#if defined(__AES__)
-   init_echo( &hmq1725_ctx.echo1, 512 );
-   init_echo( &hmq1725_ctx.echo2, 512 );
-   init_groestl( &hmq1725_ctx.groestl1, 64 );
-   init_groestl( &hmq1725_ctx.groestl2, 64 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   hashState_groestl       groestl;
+   hashState_echo          echo;
 #else
-   sph_groestl512_init( &hmq1725_ctx.groestl1 );
-   sph_groestl512_init( &hmq1725_ctx.groestl2 );
-   sph_echo512_init( &hmq1725_ctx.echo1 );
-   sph_echo512_init( &hmq1725_ctx.echo2 );
+   sph_groestl512_context  groestl;
+   sph_echo512_context     echo;
 #endif
-}
-
-void hmq_bmw512_midstate( const void* input )
-{
-   memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
-   sph_bmw512( &hmq_bmw_mid, input, 64 );
-}
-
-__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
+   sph_skein512_context    skein;
+   sph_jh512_context       jh;
+   sph_keccak512_context   keccak;
+   hashState_luffa         luffa;
+   cubehashParam           cube;
+   sph_shavite512_context  shavite;
+   simd512_context         simd;
+   sph_hamsi512_context    hamsi;
+   sph_shabal512_context   shabal;
+   sph_whirlpool_context   whirlpool;
+   sph_sha512_context      sha;
+   sph_haval256_5_context  haval;
+};
+typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
 
 extern void hmq1725hash(void *state, const void *input)
 {
    const uint32_t mask = 24;
-   uint32_t hashA[32] __attribute__((aligned(64)));
-   uint32_t hashB[32] __attribute__((aligned(64)));
-   const int midlen = 64;            // bytes
-   const int tail   = 80 - midlen;   // 16
+   uint32_t hashA[32] __attribute__((aligned(32)));
+   uint32_t hashB[32] __attribute__((aligned(32)));
+   hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));
 
-   memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
+   sph_bmw512_init( &ctx.bmw );
+   sph_bmw512( &ctx.bmw, input, 80 );
+   sph_bmw512_close( &ctx.bmw, hashA );   //1
 
-   memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
-   sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
-   sph_bmw512_close(&h_ctx.bmw1, hashA);   //1
-
-   sph_whirlpool (&h_ctx.whirlpool1, hashA, 64);    //0
-   sph_whirlpool_close(&h_ctx.whirlpool1, hashB);   //1
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hashA, 64 );    //0
+   sph_whirlpool_close( &ctx.whirlpool, hashB );  //1
 
    if ( hashB[0] & mask )   //1
    {
-#if defined(__AES__)
-      update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
-                                (const char*)hashB, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+      groestl512_full( &ctx.groestl, hashA, hashB, 512 );
 #else
-      sph_groestl512 (&h_ctx.groestl1, hashB, 64);    //1
-      sph_groestl512_close(&h_ctx.groestl1, hashA);   //2
+      sph_groestl512_init( &ctx.groestl );
+      sph_groestl512( &ctx.groestl, hashB, 64 );    //1
+      sph_groestl512_close( &ctx.groestl, hashA );  //2
 #endif
   }
   else
   {
-      sph_skein512 (&h_ctx.skein1, hashB, 64);    //1
-      sph_skein512_close(&h_ctx.skein1, hashA);   //2
+      sph_skein512_init( &ctx.skein );
+      sph_skein512( &ctx.skein, hashB, 64 );    //1
+      sph_skein512_close( &ctx.skein, hashA );  //2
   }
 
-   sph_jh512 (&h_ctx.jh1, hashA, 64);    //3
-   sph_jh512_close(&h_ctx.jh1, hashB);   //4
+   sph_jh512_init( &ctx.jh );
+   sph_jh512( &ctx.jh, hashA, 64 );    //3
+   sph_jh512_close( &ctx.jh, hashB );  //4
 
-   sph_keccak512 (&h_ctx.keccak1, hashB, 64);    //2
-   sph_keccak512_close(&h_ctx.keccak1, hashA);   //3
+   sph_keccak512_init( &ctx.keccak );
+   sph_keccak512( &ctx.keccak, hashB, 64 );    //2
+   sph_keccak512_close( &ctx.keccak, hashA );  //3
 
    if ( hashA[0] & mask )   //4
    {
-      sph_blake512 (&h_ctx.blake1, hashA, 64);    //
-      sph_blake512_close(&h_ctx.blake1, hashB);   //5
+      blake512_init( &ctx.blake );
+      blake512_update( &ctx.blake, hashA, 64 );
+      blake512_close( &ctx.blake, hashB );
   }
   else
   {
-      sph_bmw512 (&h_ctx.bmw2, hashA, 64);    //4
-      sph_bmw512_close(&h_ctx.bmw2, hashB);   //5
+      sph_bmw512_init( &ctx.bmw );
+      sph_bmw512( &ctx.bmw, hashA, 64 );    //4
+      sph_bmw512_close( &ctx.bmw, hashB );  //5
   }
 
-   update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
+   luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
 
-   cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
+   cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );
 
    if ( hashB[0] & mask )   //7
    {
-      sph_keccak512 (&h_ctx.keccak2, hashB, 64);    //
-      sph_keccak512_close(&h_ctx.keccak2, hashA);   //8
+      sph_keccak512_init( &ctx.keccak );
+      sph_keccak512( &ctx.keccak, hashB, 64 );    //
+      sph_keccak512_close( &ctx.keccak, hashA );  //8
   }
   else
   {
-      sph_jh512 (&h_ctx.jh2, hashB, 64);    //7
-      sph_jh512_close(&h_ctx.jh2, hashA);   //8
+      sph_jh512_init( &ctx.jh );
+      sph_jh512( &ctx.jh, hashB, 64 );    //7
+      sph_jh512_close( &ctx.jh, hashA );  //8
   }
 
-   sph_shavite512 (&h_ctx.shavite1, hashA, 64);    //3
-   sph_shavite512_close(&h_ctx.shavite1, hashB);   //4
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hashA, 64 );    //3
+   sph_shavite512_close( &ctx.shavite, hashB );  //4
 
-#if defined(__aarch64__)
-   sph_simd512 (&h_ctx.simd1, hashB, 64);    //3
-   sph_simd512_close(&h_ctx.simd1, hashA);   //4
-#else
-   update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
-                    (const BitSequence *)hashB, 512 );
-#endif
+   simd512_ctx( &ctx.simd, hashA, hashB, 64 );
 
   if ( hashA[0] & mask )   //4
   {
-      sph_whirlpool (&h_ctx.whirlpool2, hashA, 64);    //
-      sph_whirlpool_close(&h_ctx.whirlpool2, hashB);   //5
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hashA, 64 );    //
+      sph_whirlpool_close( &ctx.whirlpool, hashB );  //5
   }
  else
  {
-      sph_haval256_5 (&h_ctx.haval1, hashA, 64);    //4
-      sph_haval256_5_close(&h_ctx.haval1, hashB);   //5
+      sph_haval256_5_init( &ctx.haval );
+      sph_haval256_5( &ctx.haval, hashA, 64 );    //4
+      sph_haval256_5_close( &ctx.haval, hashB );  //5
      memset(&hashB[8], 0, 32);
  }
 
-#if defined(__AES__)
-   update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
-                       (const BitSequence *)hashB, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   echo_full( &ctx.echo, hashA, 512, hashB, 64 );
 #else
-   sph_echo512 (&h_ctx.echo1, hashB, 64);    //5
-   sph_echo512_close(&h_ctx.echo1, hashA);   //6
+   sph_echo512_init( &ctx.echo );
+   sph_echo512( &ctx.echo, hashB, 64 );    //5
+   sph_echo512_close( &ctx.echo, hashA );  //6
 #endif
 
-   sph_blake512 (&h_ctx.blake2, hashA, 64);    //6
-   sph_blake512_close(&h_ctx.blake2, hashB);   //7
+   blake512_init( &ctx.blake );
+   blake512_update( &ctx.blake, hashA, 64 );
+   blake512_close( &ctx.blake, hashB );
 
   if ( hashB[0] & mask )   //7
   {
-      sph_shavite512 (&h_ctx.shavite2, hashB, 64);    //
-      sph_shavite512_close(&h_ctx.shavite2, hashA);   //8
+      sph_shavite512_init( &ctx.shavite );
+      sph_shavite512( &ctx.shavite, hashB, 64 );    //
+      sph_shavite512_close( &ctx.shavite, hashA );  //8
  }
  else
-  {
-      update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
-  }
+      luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
 
-   sph_hamsi512 (&h_ctx.hamsi1, hashA, 64);    //3
-   sph_hamsi512_close(&h_ctx.hamsi1, hashB);   //4
+   sph_hamsi512_init( &ctx.hamsi );
+   sph_hamsi512( &ctx.hamsi, hashA, 64 );    //3
+   sph_hamsi512_close( &ctx.hamsi, hashB );  //4
 
-#if defined(__AES__)
-   fugue512_Update( &h_ctx.fugue1, hashB, 512 );    //2 ////
-   fugue512_Final( &h_ctx.fugue1, hashA );          //3
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   fugue512_full( &ctx.fugue, hashA, hashB, 64 );
 #else
-   sph_fugue512 (&h_ctx.fugue1, hashB, 64);    //2 ////
-   sph_fugue512_close(&h_ctx.fugue1, hashA);   //3
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hashB, 64 );    //2 ////
+   sph_fugue512_close( &ctx.fugue, hashA );  //3
 #endif
 
   if ( hashA[0] & mask )   //4
   {
-#if defined(__AES__)
-      update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
-                          (const BitSequence *)hashA, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+      echo_full( &ctx.echo, hashB, 512, hashA, 64 );
 #else
-      sph_echo512 (&h_ctx.echo2, hashA, 64);    //
-      sph_echo512_close(&h_ctx.echo2, hashB);   //5
+      sph_echo512_init( &ctx.echo );
+      sph_echo512( &ctx.echo, hashA, 64 );    //
+      sph_echo512_close( &ctx.echo, hashB );  //5
 #endif
  }
  else
-  {
-#if defined(__aarch64__)
-      sph_simd512(&h_ctx.simd2, hashA, 64);     //6
-      sph_simd512_close(&h_ctx.simd2, hashB);   //7
-#else
-      update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
-                       (const BitSequence *)hashA, 512 );
-#endif
-  }
+      simd512_ctx( &ctx.simd, hashB, hashA, 64 );
 
-   sph_shabal512 (&h_ctx.shabal1, hashB, 64);    //5
-   sph_shabal512_close(&h_ctx.shabal1, hashA);   //6
+   sph_shabal512_init( &ctx.shabal );
+   sph_shabal512( &ctx.shabal, hashB, 64 );    //5
+   sph_shabal512_close( &ctx.shabal, hashA );  //6
 
-   sph_whirlpool (&h_ctx.whirlpool3, hashA, 64);    //6
-   sph_whirlpool_close(&h_ctx.whirlpool3, hashB);   //7
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hashA, 64 );    //6
+   sph_whirlpool_close( &ctx.whirlpool, hashB );  //7
 
  if ( hashB[0] & mask )   //7
  {
-#if defined(__AES__)
-      fugue512_Update( &h_ctx.fugue2, hashB, 512 );    //
-      fugue512_Final( &h_ctx.fugue2, hashA );          //8
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+      fugue512_full( &ctx.fugue, hashA, hashB, 64 );
 #else
-      sph_fugue512 (&h_ctx.fugue2, hashB, 64);    //
-      sph_fugue512_close(&h_ctx.fugue2, hashA);   //8
+      sph_fugue512_init( &ctx.fugue );
+      sph_fugue512( &ctx.fugue, hashB, 64 );    //
+      sph_fugue512_close( &ctx.fugue, hashA );  //8
 #endif
  }
 else
 {
-      sph_sha512( &h_ctx.sha1, hashB, 64 );
-      sph_sha512_close( &h_ctx.sha1, hashA );
+      sph_sha512_init( &ctx.sha );
+      sph_sha512( &ctx.sha, hashB, 64 );
+      sph_sha512_close( &ctx.sha, hashA );
 }
 
-#if defined(__AES__)
-   update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
-                             (const char*)hashA, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   groestl512_full( &ctx.groestl, hashB, hashA, 512 );
 #else
-   sph_groestl512 (&h_ctx.groestl2, hashA, 64);    //3
-   sph_groestl512_close(&h_ctx.groestl2, hashB);   //4
+   sph_groestl512_init( &ctx.groestl );
+   sph_groestl512( &ctx.groestl, hashA, 64 );    //3
+   sph_groestl512_close( &ctx.groestl, hashB );  //4
 #endif
 
-   sph_sha512( &h_ctx.sha2, hashB, 64 );
-   sph_sha512_close( &h_ctx.sha2, hashA );
+   sph_sha512_init( &ctx.sha );
+   sph_sha512( &ctx.sha, hashB, 64 );
+   sph_sha512_close( &ctx.sha, hashA );
 
 if ( hashA[0] & mask )   //4
 {
-      sph_haval256_5 (&h_ctx.haval2, hashA, 64);    //
-      sph_haval256_5_close(&h_ctx.haval2, hashB);   //5
-      memset(&hashB[8], 0, 32);
+      sph_haval256_5_init( &ctx.haval );
+      sph_haval256_5( &ctx.haval, hashA, 64 );    //
+      sph_haval256_5_close( &ctx.haval, hashB );  //5
+      memset( &hashB[8], 0, 32 );
 }
 else
 {
-      sph_whirlpool (&h_ctx.whirlpool4, hashA, 64);    //4
-      sph_whirlpool_close(&h_ctx.whirlpool4, hashB);   //5
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hashA, 64 );    //4
+      sph_whirlpool_close( &ctx.whirlpool, hashB );  //5
 }
 
-   sph_bmw512 (&h_ctx.bmw3, hashB, 64);    //5
-   sph_bmw512_close(&h_ctx.bmw3, hashA);   //6
+   sph_bmw512_init( &ctx.bmw );
+   sph_bmw512( &ctx.bmw, hashB, 64 );    //5
+   sph_bmw512_close( &ctx.bmw, hashA );  //6
 
-   memcpy(state, hashA, 32);
+   memcpy( state, hashA, 32 );
 }
 
 int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-// uint32_t endiandata[32] __attribute__((aligned(64)));
-   uint32_t endiandata[20] __attribute__((aligned(64)));
-   uint32_t hash64[8] __attribute__((aligned(64)));
+   uint32_t endiandata[20] __attribute__((aligned(32)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19] - 1;
   const uint32_t first_nonce = pdata[19];
   int thr_id = mythr->id;  // thr_id arg is deprecated
-   //const uint32_t Htarg = ptarget[7];
 
    //we need bigendian data...
-// for (int k = 0; k < 32; k++)
   for (int k = 0; k < 20; k++)
      be32enc(&endiandata[k], pdata[k]);
 
-   hmq_bmw512_midstate( endiandata );
-
-// if (opt_debug)
-// {
-//    applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
-// }
-
-   /* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */
-   /* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */
  if (ptarget[7]==0) {
     do {
        pdata[19] = ++n;
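
The rewrite above trades the pre-initialized global context plus per-call memcpy for a stack union: hmq1725 runs its algorithms strictly one after another, so all per-algorithm states can share storage, and each is re-initialized immediately before use. A minimal sketch of the pattern (types abbreviated, names hypothetical):

    #include "algo/bmw/sph_bmw.h"
    #include "algo/keccak/sph_keccak.h"
    #include "algo/whirlpool/sph_whirlpool.h"

    // Only one member is live at a time, so the union is as large as the
    // biggest single state rather than the sum of ~20 of them, and no
    // memcpy of a prototype context is needed per hash.
    union example_ctx
    {
       sph_bmw512_context      bmw;
       sph_keccak512_context   keccak;
       sph_whirlpool_context   whirlpool;
       // ... one member per algorithm in the chain
    };

    void example_hash( void *out, const void *in )
    {
       union example_ctx ctx;            // lives on this thread's stack
       sph_bmw512_init( &ctx.bmw );      // init-on-use replaces init_hmq1725_ctx()
       sph_bmw512( &ctx.bmw, in, 80 );
       sph_bmw512_close( &ctx.bmw, out );
       // the next algorithm reuses the same bytes through another member
    }
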
@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
  gate->scanhash = (void*)&scanhash_quark;
  gate->hash = (void*)&quark_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
+                      | NEON_OPT;
  return true;
 };
 
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define QUARK_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define QUARK_4WAY 1
@@ -7,12 +7,12 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #else
 #include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
 void quark_hash(void *state, const void *input)
 {
     uint32_t hash[16] __attribute__((aligned(64)));
-    sph_blake512_context   ctx_blake;
+    blake512_context       ctx_blake;
     sph_bmw512_context     ctx_bmw;
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
     hashState_groestl      ctx_groestl;
 #else
     sph_groestl512_context ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
     sph_keccak512_context  ctx_keccak;
     uint32_t mask = 8;
 
-    sph_blake512_init( &ctx_blake );
-    sph_blake512( &ctx_blake, input, 80 );
-    sph_blake512_close( &ctx_blake, hash );
+    blake512_full( &ctx_blake, hash, input, 80 );
 
     sph_bmw512_init( &ctx_bmw );
     sph_bmw512( &ctx_bmw, hash, 64 );
     sph_bmw512_close( &ctx_bmw, hash );
 
     if ( hash[0] & mask )
     {
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        init_groestl( &ctx_groestl, 64 );
        update_and_final_groestl( &ctx_groestl, (char*)hash,
                                  (const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
        sph_skein512_close( &ctx_skein, hash );
     }
 
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
     init_groestl( &ctx_groestl, 64 );
     update_and_final_groestl( &ctx_groestl, (char*)hash,
                               (const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)
 
     if ( hash[0] & mask )
     {
-       sph_blake512_init( &ctx_blake );
-       sph_blake512( &ctx_blake, hash, 64 );
-       sph_blake512_close( &ctx_blake, hash );
+       blake512_full( &ctx_blake, hash, hash, 64 );
    }
    else
    {
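
blake512_full() collapses the old init/update/close triple into a single call against the tree's own blake512 implementation; its argument order (ctx, out, in, len) is visible in the hunks above. A sketch of the equivalence being relied on (assuming the sph API for the reference side; both compute BLAKE-512, so the 64-byte digests should match):

    #include "algo/blake/blake512-hash.h"
    #include "algo/blake/sph_blake.h"

    void blake512_both( void *out_a, void *out_b, const void *in, size_t len )
    {
       blake512_context ctx;               // one-shot path used by quark_hash
       blake512_full( &ctx, out_a, in, len );

       sph_blake512_context sctx;          // old three-call path
       sph_blake512_init( &sctx );
       sph_blake512( &sctx, in, len );
       sph_blake512_close( &sctx, out_b );
    }
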
@@ -83,7 +83,7 @@ int scanhash_deep_2way( struct work *work,uint32_t max_nonce,
 
    casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
    casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
 
    uint64_t *edata = (uint64_t*)endiandata;
    intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -236,7 +236,7 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,
 
    casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
    casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
 
    uint64_t *edata = (uint64_t*)endiandata;
    intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -16,7 +16,8 @@ bool register_qubit_algo( algo_gate_t* gate )
  gate->scanhash = (void*)&scanhash_qubit;
  gate->hash = (void*)&qubit_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
+                      | NEON_OPT;
  return true;
 };
 
@@ -5,7 +5,7 @@
 #include <stdint.h>
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define QUBIT_4WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define QUBIT_2WAY 1
@@ -8,13 +8,9 @@
 #include <stdio.h>
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
-#include "algo/simd/sph_simd.h"
-#else
-#include "algo/simd/nist.h"
-#endif
+#include "algo/simd/simd-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
-#ifdef __AES__
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
 #include "algo/echo/aes_ni/hash_api.h"
 #else
 #include "algo/echo/sph_echo.h"
@@ -25,12 +21,8 @@ typedef struct
     hashState_luffa         luffa;
     cubehashParam           cubehash;
     sph_shavite512_context  shavite;
-#if defined(__aarch64__)
-    sph_simd512_context     simd;
-#else
-    hashState_sd            simd;
-#endif
-#ifdef __AES__
+    simd512_context         simd;
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
     hashState_echo          echo;
 #else
     sph_echo512_context     echo;
@@ -45,12 +37,7 @@ void init_qubit_ctx()
     init_luffa(&qubit_ctx.luffa,512);
     cubehashInit(&qubit_ctx.cubehash,512,16,32);
     sph_shavite512_init(&qubit_ctx.shavite);
-#if defined(__aarch64__)
-    sph_simd512_init( &qubit_ctx.simd );
-#else
-    init_sd( &qubit_ctx.simd, 512 );
-#endif
-#ifdef __AES__
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
     init_echo(&qubit_ctx.echo, 512);
 #else
     sph_echo512_init(&qubit_ctx.echo);
@@ -81,15 +68,9 @@ void qubit_hash(void *output, const void *input)
    sph_shavite512( &ctx.shavite, hash, 64);
    sph_shavite512_close( &ctx.shavite, hash);
 
-#if defined(__aarch64__)
-   sph_simd512(&ctx.simd, (const void*) hash, 64);
-   sph_simd512_close(&ctx.simd, hash);
-#else
-   update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
-   final_sd( &ctx.simd, (BitSequence *)hash );
-#endif
+   simd512_ctx( &ctx.simd, hash, hash, 64 );
 
-#ifdef __AES__
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    update_final_echo( &ctx.echo, (BitSequence *) hash,
                       (const BitSequence *) hash, 512 );
 #else
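
In the lbry hunks below, casti_v128u32( p, i ) indexes p as an array of 16-byte vectors, so the eight v128_bswap32 lines big-endian-swap 128 bytes of work data, four 32-bit words per vector. A scalar sketch of the same operation (reference names are illustrative):

    #include <stdint.h>

    static inline uint32_t bswap32_ref( uint32_t x )
    {
       return ( x >> 24 ) | ( ( x >> 8 ) & 0x0000ff00 )
            | ( ( x << 8 ) & 0x00ff0000 ) | ( x << 24 );
    }

    // Same effect as the eight casti_v128u32/v128_bswap32 statements:
    // byte-swap the first 32 32-bit words (128 bytes) of the header.
    static inline void bswap32_block_ref( uint32_t *edata, const uint32_t *pdata )
    {
       for ( int i = 0; i < 32; i++ ) edata[i] = bswap32_ref( pdata[i] );
    }
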
@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
    int thr_id = mythr->id;  // thr_id arg is deprecated

    // we need bigendian data...
-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-   casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
-   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
-   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
+   casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
+   casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
+   casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
+   casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
+   casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
+   casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
+   casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
+   casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
    intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
                  edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
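The change in this hunk (and in the 8-way one below) is purely mechanical: the x86-only casti_m128i/mm128_bswap_32 helpers give way to the portable casti_v128u32/v128_bswap32 layer, so the big-endian conversion of the block header compiles unchanged on SSE and NEON targets. A minimal sketch of how such a pair can be defined, assuming SSSE3 on x86 and NEON on ARM; the repository's actual definitions may differ:

    #include <stdint.h>

    #if defined(__SSSE3__)
      #include <immintrin.h>
      typedef __m128i v128u32_t;
      static inline v128u32_t v128_bswap32( v128u32_t x )
      {   // reverse the bytes within each 32-bit lane via a byte shuffle
          return _mm_shuffle_epi8( x, _mm_set_epi8( 12,13,14,15, 8, 9,10,11,
                                                     4, 5, 6, 7, 0, 1, 2, 3 ) );
      }
    #elif defined(__ARM_NEON)
      #include <arm_neon.h>
      typedef uint32x4_t v128u32_t;
      static inline v128u32_t v128_bswap32( v128u32_t x )
      {   // vrev32q_u8 reverses the bytes within each 32-bit element
          return vreinterpretq_u32_u8( vrev32q_u8( vreinterpretq_u8_u32( x ) ) );
      }
    #endif

    // Index a byte buffer as an array of 128-bit vectors, as casti_v128u32 does.
    #define casti_v128u32( p, i )  ( ((v128u32_t*)(p))[i] )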
@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
    int thr_id = mythr->id;  // thr_id arg is deprecated

    // we need bigendian data...
-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-   casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
-   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
-   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
+   casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
+   casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
+   casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
+   casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
+   casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
+   casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
+   casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
+   casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
    intrlv_8x32( vdata, edata, edata, edata, edata,
                 edata, edata, edata, edata, 1024 );
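In both scanhash variants the interleave calls are unchanged context: intrlv_16x32/intrlv_8x32 pack N copies of the 1024-bit padded header so that 32-bit word w of lane l lands at index w*N + l, letting a single vector load feed word w to all lanes at once (the per-lane nonces are patched in afterwards). A scalar reference of that layout, written here for illustration only; the repository uses vectorized equivalents:

    #include <stdint.h>

    // Scalar reference for N-way 32-bit interleaving. After this, one
    // N*32-bit vector load of dst fetches word w for all N lanes.
    static void intrlv_Nx32_ref( uint32_t *dst, const uint32_t *const *lanes,
                                 int nlanes, int bitlen )
    {
        const int nwords = bitlen / 32;    // 1024 bits -> 32 words per lane
        for ( int w = 0; w < nwords; w++ )
            for ( int l = 0; l < nlanes; l++ )
                dst[ w*nlanes + l ] = lanes[l][w];
    }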
@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }

 bool register_lbry_algo( algo_gate_t* gate )
 {
-//  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
 #if defined (LBRY_16WAY)
   gate->scanhash = (void*)&scanhash_lbry_16way;
   gate->hash = (void*)&lbry_16way_hash;

@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
 #else
   gate->scanhash = (void*)&scanhash_lbry;
   gate->hash = (void*)&lbry_hash;
-  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
 #endif
   gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
   gate->build_extraheader = (void*)&lbry_build_extraheader;
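The gate hunks retire the SHA_OPT flag in favour of SHA256_OPT, presumably to keep SHA-256 hardware acceleration distinct from other SHA-family capability flags, and drop a stale commented-out line. For orientation, a simplified sketch of the algo-gate dispatch pattern visible here, trimmed to the members this diff touches (the real algo_gate_t carries many more, and the signatures are abbreviated):

    #include <stdint.h>

    struct work; struct thr_info;   // opaque to the sketch

    // Registration fills a table of function pointers once; the core miner
    // then calls through it, so adding an algorithm never touches the main
    // loop. Names below mirror the diff; the struct itself is illustrative.
    typedef struct
    {
        int  (*scanhash)( struct work*, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info* );
        void (*hash)( void *output, const void *input );
        uint32_t optimizations;   // e.g. AVX2_OPT | AVX512_OPT | SHA256_OPT
    } algo_gate_sketch_t;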
Some files were not shown because too many files have changed in this diff.